From 2c3c1048746a4622d8c89a29670120dc8fab93c4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:49:45 +0200 Subject: Adding upstream version 6.1.76. Signed-off-by: Daniel Baumann --- drivers/md/Kconfig | 661 ++ drivers/md/Makefile | 114 + drivers/md/bcache/Kconfig | 38 + drivers/md/bcache/Makefile | 7 + drivers/md/bcache/alloc.c | 736 ++ drivers/md/bcache/bcache.h | 1049 ++ drivers/md/bcache/bcache_ondisk.h | 446 + drivers/md/bcache/bset.c | 1390 +++ drivers/md/bcache/bset.h | 593 ++ drivers/md/bcache/btree.c | 2800 ++++++ drivers/md/bcache/btree.h | 417 + drivers/md/bcache/closure.c | 207 + drivers/md/bcache/closure.h | 378 + drivers/md/bcache/debug.c | 263 + drivers/md/bcache/debug.h | 35 + drivers/md/bcache/extents.c | 630 ++ drivers/md/bcache/extents.h | 15 + drivers/md/bcache/features.c | 75 + drivers/md/bcache/features.h | 113 + drivers/md/bcache/io.c | 175 + drivers/md/bcache/journal.c | 1000 ++ drivers/md/bcache/journal.h | 188 + drivers/md/bcache/movinggc.c | 252 + drivers/md/bcache/request.c | 1345 +++ drivers/md/bcache/request.h | 47 + drivers/md/bcache/stats.c | 234 + drivers/md/bcache/stats.h | 63 + drivers/md/bcache/super.c | 2941 ++++++ drivers/md/bcache/sysfs.c | 1204 +++ drivers/md/bcache/sysfs.h | 142 + drivers/md/bcache/trace.c | 53 + drivers/md/bcache/util.c | 287 + drivers/md/bcache/util.h | 562 ++ drivers/md/bcache/writeback.c | 1100 ++ drivers/md/bcache/writeback.h | 155 + drivers/md/dm-audit.c | 84 + drivers/md/dm-audit.h | 66 + drivers/md/dm-bio-prison-v1.c | 458 + drivers/md/dm-bio-prison-v1.h | 138 + drivers/md/dm-bio-prison-v2.c | 364 + drivers/md/dm-bio-prison-v2.h | 152 + drivers/md/dm-bio-record.h | 56 + drivers/md/dm-bufio.c | 2181 ++++ drivers/md/dm-builtin.c | 49 + drivers/md/dm-cache-background-tracker.c | 257 + drivers/md/dm-cache-background-tracker.h | 80 + drivers/md/dm-cache-block-types.h | 54 + drivers/md/dm-cache-metadata.c | 1860 ++++ drivers/md/dm-cache-metadata.h | 153 + drivers/md/dm-cache-policy-internal.h | 163 + drivers/md/dm-cache-policy-smq.c | 1953 ++++ drivers/md/dm-cache-policy.c | 173 + drivers/md/dm-cache-policy.h | 186 + drivers/md/dm-cache-target.c | 3454 +++++++ drivers/md/dm-clone-metadata.c | 1028 ++ drivers/md/dm-clone-metadata.h | 177 + drivers/md/dm-clone-target.c | 2229 +++++ drivers/md/dm-core.h | 341 + drivers/md/dm-crypt.c | 3678 +++++++ drivers/md/dm-delay.c | 404 + drivers/md/dm-dust.c | 594 ++ drivers/md/dm-ebs-target.c | 475 + drivers/md/dm-era-target.c | 1756 ++++ drivers/md/dm-exception-store.c | 290 + drivers/md/dm-exception-store.h | 205 + drivers/md/dm-flakey.c | 530 + drivers/md/dm-ima.c | 749 ++ drivers/md/dm-ima.h | 78 + drivers/md/dm-init.c | 323 + drivers/md/dm-integrity.c | 4671 +++++++++ drivers/md/dm-io-rewind.c | 166 + drivers/md/dm-io-tracker.h | 81 + drivers/md/dm-io.c | 544 + drivers/md/dm-ioctl.c | 2271 +++++ drivers/md/dm-kcopyd.c | 988 ++ drivers/md/dm-linear.c | 229 + drivers/md/dm-log-userspace-base.c | 937 ++ drivers/md/dm-log-userspace-transfer.c | 286 + drivers/md/dm-log-userspace-transfer.h | 18 + drivers/md/dm-log-writes.c | 960 ++ drivers/md/dm-log.c | 896 ++ drivers/md/dm-mpath.c | 2256 +++++ drivers/md/dm-mpath.h | 22 + drivers/md/dm-path-selector.c | 140 + drivers/md/dm-path-selector.h | 107 + drivers/md/dm-ps-historical-service-time.c | 565 ++ drivers/md/dm-ps-io-affinity.c | 274 + drivers/md/dm-ps-queue-length.c | 286 + drivers/md/dm-ps-round-robin.c | 240 + drivers/md/dm-ps-service-time.c | 364 + drivers/md/dm-raid.c | 4112 ++++++++ drivers/md/dm-raid1.c | 1511 +++ drivers/md/dm-region-hash.c | 724 ++ drivers/md/dm-rq.c | 596 ++ drivers/md/dm-rq.h | 47 + drivers/md/dm-snap-persistent.c | 972 ++ drivers/md/dm-snap-transient.c | 155 + drivers/md/dm-snap.c | 2866 ++++++ drivers/md/dm-stats.c | 1249 +++ drivers/md/dm-stats.h | 48 + drivers/md/dm-stripe.c | 495 + drivers/md/dm-switch.c | 592 ++ drivers/md/dm-sysfs.c | 147 + drivers/md/dm-table.c | 2163 ++++ drivers/md/dm-target.c | 175 + drivers/md/dm-thin-metadata.c | 2154 ++++ drivers/md/dm-thin-metadata.h | 241 + drivers/md/dm-thin.c | 4529 +++++++++ drivers/md/dm-uevent.c | 206 + drivers/md/dm-uevent.h | 46 + drivers/md/dm-unstripe.c | 211 + drivers/md/dm-verity-fec.c | 823 ++ drivers/md/dm-verity-fec.h | 153 + drivers/md/dm-verity-loadpin.c | 86 + drivers/md/dm-verity-target.c | 1530 +++ drivers/md/dm-verity-verify-sig.c | 138 + drivers/md/dm-verity-verify-sig.h | 60 + drivers/md/dm-verity.h | 135 + drivers/md/dm-writecache.c | 2775 +++++ drivers/md/dm-zero.c | 88 + drivers/md/dm-zone.c | 648 ++ drivers/md/dm-zoned-metadata.c | 3046 ++++++ drivers/md/dm-zoned-reclaim.c | 640 ++ drivers/md/dm-zoned-target.c | 1173 +++ drivers/md/dm-zoned.h | 303 + drivers/md/dm.c | 3397 +++++++ drivers/md/dm.h | 227 + drivers/md/md-autodetect.c | 282 + drivers/md/md-bitmap.c | 2653 +++++ drivers/md/md-bitmap.h | 285 + drivers/md/md-cluster.c | 1602 +++ drivers/md/md-cluster.h | 36 + drivers/md/md-faulty.c | 363 + drivers/md/md-linear.c | 319 + drivers/md/md-linear.h | 17 + drivers/md/md-multipath.c | 470 + drivers/md/md-multipath.h | 32 + drivers/md/md.c | 10020 +++++++++++++++++++ drivers/md/md.h | 838 ++ drivers/md/persistent-data/Kconfig | 10 + drivers/md/persistent-data/Makefile | 13 + drivers/md/persistent-data/dm-array.c | 1011 ++ drivers/md/persistent-data/dm-array.h | 219 + drivers/md/persistent-data/dm-bitset.c | 317 + drivers/md/persistent-data/dm-bitset.h | 205 + drivers/md/persistent-data/dm-block-manager.c | 656 ++ drivers/md/persistent-data/dm-block-manager.h | 135 + drivers/md/persistent-data/dm-btree-internal.h | 160 + drivers/md/persistent-data/dm-btree-remove.c | 759 ++ drivers/md/persistent-data/dm-btree-spine.c | 264 + drivers/md/persistent-data/dm-btree.c | 1625 +++ drivers/md/persistent-data/dm-btree.h | 215 + .../persistent-data/dm-persistent-data-internal.h | 19 + drivers/md/persistent-data/dm-space-map-common.c | 1259 +++ drivers/md/persistent-data/dm-space-map-common.h | 145 + drivers/md/persistent-data/dm-space-map-disk.c | 280 + drivers/md/persistent-data/dm-space-map-disk.h | 25 + drivers/md/persistent-data/dm-space-map-metadata.c | 842 ++ drivers/md/persistent-data/dm-space-map-metadata.h | 44 + drivers/md/persistent-data/dm-space-map.h | 168 + .../md/persistent-data/dm-transaction-manager.c | 519 + .../md/persistent-data/dm-transaction-manager.h | 153 + drivers/md/raid0.c | 846 ++ drivers/md/raid0.h | 33 + drivers/md/raid1-10.c | 153 + drivers/md/raid1.c | 3407 +++++++ drivers/md/raid1.h | 218 + drivers/md/raid10.c | 5290 ++++++++++ drivers/md/raid10.h | 184 + drivers/md/raid5-cache.c | 3183 ++++++ drivers/md/raid5-log.h | 151 + drivers/md/raid5-ppl.c | 1532 +++ drivers/md/raid5.c | 9177 +++++++++++++++++ drivers/md/raid5.h | 826 ++ 174 files changed, 150175 insertions(+) create mode 100644 drivers/md/Kconfig create mode 100644 drivers/md/Makefile create mode 100644 drivers/md/bcache/Kconfig create mode 100644 drivers/md/bcache/Makefile create mode 100644 drivers/md/bcache/alloc.c create mode 100644 drivers/md/bcache/bcache.h create mode 100644 drivers/md/bcache/bcache_ondisk.h create mode 100644 drivers/md/bcache/bset.c create mode 100644 drivers/md/bcache/bset.h create mode 100644 drivers/md/bcache/btree.c create mode 100644 drivers/md/bcache/btree.h create mode 100644 drivers/md/bcache/closure.c create mode 100644 drivers/md/bcache/closure.h create mode 100644 drivers/md/bcache/debug.c create mode 100644 drivers/md/bcache/debug.h create mode 100644 drivers/md/bcache/extents.c create mode 100644 drivers/md/bcache/extents.h create mode 100644 drivers/md/bcache/features.c create mode 100644 drivers/md/bcache/features.h create mode 100644 drivers/md/bcache/io.c create mode 100644 drivers/md/bcache/journal.c create mode 100644 drivers/md/bcache/journal.h create mode 100644 drivers/md/bcache/movinggc.c create mode 100644 drivers/md/bcache/request.c create mode 100644 drivers/md/bcache/request.h create mode 100644 drivers/md/bcache/stats.c create mode 100644 drivers/md/bcache/stats.h create mode 100644 drivers/md/bcache/super.c create mode 100644 drivers/md/bcache/sysfs.c create mode 100644 drivers/md/bcache/sysfs.h create mode 100644 drivers/md/bcache/trace.c create mode 100644 drivers/md/bcache/util.c create mode 100644 drivers/md/bcache/util.h create mode 100644 drivers/md/bcache/writeback.c create mode 100644 drivers/md/bcache/writeback.h create mode 100644 drivers/md/dm-audit.c create mode 100644 drivers/md/dm-audit.h create mode 100644 drivers/md/dm-bio-prison-v1.c create mode 100644 drivers/md/dm-bio-prison-v1.h create mode 100644 drivers/md/dm-bio-prison-v2.c create mode 100644 drivers/md/dm-bio-prison-v2.h create mode 100644 drivers/md/dm-bio-record.h create mode 100644 drivers/md/dm-bufio.c create mode 100644 drivers/md/dm-builtin.c create mode 100644 drivers/md/dm-cache-background-tracker.c create mode 100644 drivers/md/dm-cache-background-tracker.h create mode 100644 drivers/md/dm-cache-block-types.h create mode 100644 drivers/md/dm-cache-metadata.c create mode 100644 drivers/md/dm-cache-metadata.h create mode 100644 drivers/md/dm-cache-policy-internal.h create mode 100644 drivers/md/dm-cache-policy-smq.c create mode 100644 drivers/md/dm-cache-policy.c create mode 100644 drivers/md/dm-cache-policy.h create mode 100644 drivers/md/dm-cache-target.c create mode 100644 drivers/md/dm-clone-metadata.c create mode 100644 drivers/md/dm-clone-metadata.h create mode 100644 drivers/md/dm-clone-target.c create mode 100644 drivers/md/dm-core.h create mode 100644 drivers/md/dm-crypt.c create mode 100644 drivers/md/dm-delay.c create mode 100644 drivers/md/dm-dust.c create mode 100644 drivers/md/dm-ebs-target.c create mode 100644 drivers/md/dm-era-target.c create mode 100644 drivers/md/dm-exception-store.c create mode 100644 drivers/md/dm-exception-store.h create mode 100644 drivers/md/dm-flakey.c create mode 100644 drivers/md/dm-ima.c create mode 100644 drivers/md/dm-ima.h create mode 100644 drivers/md/dm-init.c create mode 100644 drivers/md/dm-integrity.c create mode 100644 drivers/md/dm-io-rewind.c create mode 100644 drivers/md/dm-io-tracker.h create mode 100644 drivers/md/dm-io.c create mode 100644 drivers/md/dm-ioctl.c create mode 100644 drivers/md/dm-kcopyd.c create mode 100644 drivers/md/dm-linear.c create mode 100644 drivers/md/dm-log-userspace-base.c create mode 100644 drivers/md/dm-log-userspace-transfer.c create mode 100644 drivers/md/dm-log-userspace-transfer.h create mode 100644 drivers/md/dm-log-writes.c create mode 100644 drivers/md/dm-log.c create mode 100644 drivers/md/dm-mpath.c create mode 100644 drivers/md/dm-mpath.h create mode 100644 drivers/md/dm-path-selector.c create mode 100644 drivers/md/dm-path-selector.h create mode 100644 drivers/md/dm-ps-historical-service-time.c create mode 100644 drivers/md/dm-ps-io-affinity.c create mode 100644 drivers/md/dm-ps-queue-length.c create mode 100644 drivers/md/dm-ps-round-robin.c create mode 100644 drivers/md/dm-ps-service-time.c create mode 100644 drivers/md/dm-raid.c create mode 100644 drivers/md/dm-raid1.c create mode 100644 drivers/md/dm-region-hash.c create mode 100644 drivers/md/dm-rq.c create mode 100644 drivers/md/dm-rq.h create mode 100644 drivers/md/dm-snap-persistent.c create mode 100644 drivers/md/dm-snap-transient.c create mode 100644 drivers/md/dm-snap.c create mode 100644 drivers/md/dm-stats.c create mode 100644 drivers/md/dm-stats.h create mode 100644 drivers/md/dm-stripe.c create mode 100644 drivers/md/dm-switch.c create mode 100644 drivers/md/dm-sysfs.c create mode 100644 drivers/md/dm-table.c create mode 100644 drivers/md/dm-target.c create mode 100644 drivers/md/dm-thin-metadata.c create mode 100644 drivers/md/dm-thin-metadata.h create mode 100644 drivers/md/dm-thin.c create mode 100644 drivers/md/dm-uevent.c create mode 100644 drivers/md/dm-uevent.h create mode 100644 drivers/md/dm-unstripe.c create mode 100644 drivers/md/dm-verity-fec.c create mode 100644 drivers/md/dm-verity-fec.h create mode 100644 drivers/md/dm-verity-loadpin.c create mode 100644 drivers/md/dm-verity-target.c create mode 100644 drivers/md/dm-verity-verify-sig.c create mode 100644 drivers/md/dm-verity-verify-sig.h create mode 100644 drivers/md/dm-verity.h create mode 100644 drivers/md/dm-writecache.c create mode 100644 drivers/md/dm-zero.c create mode 100644 drivers/md/dm-zone.c create mode 100644 drivers/md/dm-zoned-metadata.c create mode 100644 drivers/md/dm-zoned-reclaim.c create mode 100644 drivers/md/dm-zoned-target.c create mode 100644 drivers/md/dm-zoned.h create mode 100644 drivers/md/dm.c create mode 100644 drivers/md/dm.h create mode 100644 drivers/md/md-autodetect.c create mode 100644 drivers/md/md-bitmap.c create mode 100644 drivers/md/md-bitmap.h create mode 100644 drivers/md/md-cluster.c create mode 100644 drivers/md/md-cluster.h create mode 100644 drivers/md/md-faulty.c create mode 100644 drivers/md/md-linear.c create mode 100644 drivers/md/md-linear.h create mode 100644 drivers/md/md-multipath.c create mode 100644 drivers/md/md-multipath.h create mode 100644 drivers/md/md.c create mode 100644 drivers/md/md.h create mode 100644 drivers/md/persistent-data/Kconfig create mode 100644 drivers/md/persistent-data/Makefile create mode 100644 drivers/md/persistent-data/dm-array.c create mode 100644 drivers/md/persistent-data/dm-array.h create mode 100644 drivers/md/persistent-data/dm-bitset.c create mode 100644 drivers/md/persistent-data/dm-bitset.h create mode 100644 drivers/md/persistent-data/dm-block-manager.c create mode 100644 drivers/md/persistent-data/dm-block-manager.h create mode 100644 drivers/md/persistent-data/dm-btree-internal.h create mode 100644 drivers/md/persistent-data/dm-btree-remove.c create mode 100644 drivers/md/persistent-data/dm-btree-spine.c create mode 100644 drivers/md/persistent-data/dm-btree.c create mode 100644 drivers/md/persistent-data/dm-btree.h create mode 100644 drivers/md/persistent-data/dm-persistent-data-internal.h create mode 100644 drivers/md/persistent-data/dm-space-map-common.c create mode 100644 drivers/md/persistent-data/dm-space-map-common.h create mode 100644 drivers/md/persistent-data/dm-space-map-disk.c create mode 100644 drivers/md/persistent-data/dm-space-map-disk.h create mode 100644 drivers/md/persistent-data/dm-space-map-metadata.c create mode 100644 drivers/md/persistent-data/dm-space-map-metadata.h create mode 100644 drivers/md/persistent-data/dm-space-map.h create mode 100644 drivers/md/persistent-data/dm-transaction-manager.c create mode 100644 drivers/md/persistent-data/dm-transaction-manager.h create mode 100644 drivers/md/raid0.c create mode 100644 drivers/md/raid0.h create mode 100644 drivers/md/raid1-10.c create mode 100644 drivers/md/raid1.c create mode 100644 drivers/md/raid1.h create mode 100644 drivers/md/raid10.c create mode 100644 drivers/md/raid10.h create mode 100644 drivers/md/raid5-cache.c create mode 100644 drivers/md/raid5-log.h create mode 100644 drivers/md/raid5-ppl.c create mode 100644 drivers/md/raid5.c create mode 100644 drivers/md/raid5.h (limited to 'drivers/md') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig new file mode 100644 index 000000000..db0e97020 --- /dev/null +++ b/drivers/md/Kconfig @@ -0,0 +1,661 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Block device driver configuration +# + +menuconfig MD + bool "Multiple devices driver support (RAID and LVM)" + depends on BLOCK + select SRCU + help + Support multiple physical spindles through a single logical device. + Required for RAID and logical volume management. + +if MD + +config BLK_DEV_MD + tristate "RAID support" + select BLOCK_HOLDER_DEPRECATED if SYSFS + # BLOCK_LEGACY_AUTOLOAD requirement should be removed + # after relevant mdadm enhancements - to make "names=yes" + # the default - are widely available. + select BLOCK_LEGACY_AUTOLOAD + help + This driver lets you combine several hard disk partitions into one + logical block device. This can be used to simply append one + partition to another one or to combine several redundant hard disks + into a RAID1/4/5 device so as to provide protection against hard + disk failures. This is called "Software RAID" since the combining of + the partitions is done by the kernel. "Hardware RAID" means that the + combining is done by a dedicated controller; if you have such a + controller, you do not need to say Y here. + + More information about Software RAID on Linux is contained in the + Software RAID mini-HOWTO, available from + . There you will also learn + where to get the supporting user space utilities raidtools. + + If unsure, say N. + +config MD_AUTODETECT + bool "Autodetect RAID arrays during kernel boot" + depends on BLK_DEV_MD=y + default y + help + If you say Y here, then the kernel will try to autodetect raid + arrays as part of its boot process. + + If you don't use raid and say Y, this autodetection can cause + a several-second delay in the boot time due to various + synchronisation steps that are part of this step. + + If unsure, say Y. + +config MD_LINEAR + tristate "Linear (append) mode (deprecated)" + depends on BLK_DEV_MD + help + If you say Y here, then your multiple devices driver will be able to + use the so-called linear mode, i.e. it will combine the hard disk + partitions by simply appending one to the other. + + To compile this as a module, choose M here: the module + will be called linear. + + If unsure, say Y. + +config MD_RAID0 + tristate "RAID-0 (striping) mode" + depends on BLK_DEV_MD + help + If you say Y here, then your multiple devices driver will be able to + use the so-called raid0 mode, i.e. it will combine the hard disk + partitions into one logical device in such a fashion as to fill them + up evenly, one chunk here and one chunk there. This will increase + the throughput rate if the partitions reside on distinct disks. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + . There you will also + learn where to get the supporting user space utilities raidtools. + + To compile this as a module, choose M here: the module + will be called raid0. + + If unsure, say Y. + +config MD_RAID1 + tristate "RAID-1 (mirroring) mode" + depends on BLK_DEV_MD + help + A RAID-1 set consists of several disk drives which are exact copies + of each other. In the event of a mirror failure, the RAID driver + will continue to use the operational mirrors in the set, providing + an error free MD (multiple device) to the higher levels of the + kernel. In a set with N drives, the available space is the capacity + of a single drive, and the set protects against a failure of (N - 1) + drives. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + . There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use such a RAID-1 set, say Y. To compile this code + as a module, choose M here: the module will be called raid1. + + If unsure, say Y. + +config MD_RAID10 + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD + help + RAID-10 provides a combination of striping (RAID-0) and + mirroring (RAID-1) with easier configuration and more flexible + layout. + Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to + be the same size (or at least, only as much as the smallest device + will be used). + RAID-10 provides a variety of layouts that provide different levels + of redundancy and performance. + + RAID-10 requires mdadm-1.7.0 or later, available at: + + https://www.kernel.org/pub/linux/utils/raid/mdadm/ + + If unsure, say Y. + +config MD_RAID456 + tristate "RAID-4/RAID-5/RAID-6 mode" + depends on BLK_DEV_MD + select RAID6_PQ + select LIBCRC32C + select ASYNC_MEMCPY + select ASYNC_XOR + select ASYNC_PQ + select ASYNC_RAID6_RECOV + help + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + . There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To + compile this code as a module, choose M here: the module + will be called raid456. + + If unsure, say Y. + +config MD_MULTIPATH + tristate "Multipath I/O support (deprecated)" + depends on BLK_DEV_MD + help + MD_MULTIPATH provides a simple multi-path personality for use + the MD framework. It is not under active development. New + projects should consider using DM_MULTIPATH which has more + features and more testing. + + If unsure, say N. + +config MD_FAULTY + tristate "Faulty test module for MD (deprecated)" + depends on BLK_DEV_MD + help + The "faulty" module allows for a block device that occasionally returns + read or write errors. It is useful for testing. + + In unsure, say N. + + +config MD_CLUSTER + tristate "Cluster Support for MD" + depends on BLK_DEV_MD + depends on DLM + default n + help + Clustering support for MD devices. This enables locking and + synchronization across multiple systems on the cluster, so all + nodes in the cluster can access the MD devices simultaneously. + + This brings the redundancy (and uptime) of RAID levels across the + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). + + If unsure, say N. + +source "drivers/md/bcache/Kconfig" + +config BLK_DEV_DM_BUILTIN + bool + +config BLK_DEV_DM + tristate "Device mapper support" + select BLOCK_HOLDER_DEPRECATED if SYSFS + select BLK_DEV_DM_BUILTIN + select BLK_MQ_STACKING + depends on DAX || DAX=n + help + Device-mapper is a low level volume manager. It works by allowing + people to specify mappings for ranges of logical sectors. Various + mapping types are available, in addition people may write their own + modules containing custom mappings if they wish. + + Higher level volume managers such as LVM2 use this driver. + + To compile this as a module, choose M here: the module will be + called dm-mod. + + If unsure, say N. + +config DM_DEBUG + bool "Device mapper debugging support" + depends on BLK_DEV_DM + help + Enable this for messages that may help debug device-mapper problems. + + If unsure, say N. + +config DM_BUFIO + tristate + depends on BLK_DEV_DM + help + This interface allows you to do buffered I/O on a device and acts + as a cache, holding recently-read blocks in memory and performing + delayed writes. + +config DM_DEBUG_BLOCK_MANAGER_LOCKING + bool "Block manager locking" + depends on DM_BUFIO + help + Block manager locking can catch various metadata corruption issues. + + If unsure, say N. + +config DM_DEBUG_BLOCK_STACK_TRACING + bool "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_DEBUG_BLOCK_MANAGER_LOCKING + select STACKTRACE + help + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. + +config DM_BIO_PRISON + tristate + depends on BLK_DEV_DM + help + Some bio locking schemes used by other device-mapper targets + including thin provisioning. + +source "drivers/md/persistent-data/Kconfig" + +config DM_UNSTRIPED + tristate "Unstriped target" + depends on BLK_DEV_DM + help + Unstripes I/O so it is issued solely on a single drive in a HW + RAID0 or dm-striped target. + +config DM_CRYPT + tristate "Crypt target support" + depends on BLK_DEV_DM + depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRYPTO + select CRYPTO_CBC + select CRYPTO_ESSIV + help + This device-mapper target allows you to create a device that + transparently encrypts the data on it. You'll need to activate + the ciphers you're going to use in the cryptoapi configuration. + + For further information on dm-crypt and userspace tools see: + + + To compile this code as a module, choose M here: the module will + be called dm-crypt. + + If unsure, say N. + +config DM_SNAPSHOT + tristate "Snapshot target" + depends on BLK_DEV_DM + select DM_BUFIO + help + Allow volume managers to take writable snapshots of a device. + +config DM_THIN_PROVISIONING + tristate "Thin provisioning target" + depends on BLK_DEV_DM + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + help + Provides thin provisioning and snapshots that share a data store. + +config DM_CACHE + tristate "Cache target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + help + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. + +config DM_CACHE_SMQ + tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE + default y + help + A cache policy that uses a multiqueue ordered by recent hits + to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. This SMQ policy (vs MQ) offers the promise + of less memory utilization, improved performance and increased + adaptability in the face of changing workloads. + +config DM_WRITECACHE + tristate "Writecache target" + depends on BLK_DEV_DM + help + The writecache target caches writes on persistent memory or SSD. + It is intended for databases or other programs that need extremely + low commit latency. + + The writecache target doesn't cache reads because reads are supposed + to be cached in standard RAM. + +config DM_EBS + tristate "Emulated block size target (EXPERIMENTAL)" + depends on BLK_DEV_DM && !HIGHMEM + select DM_BUFIO + help + dm-ebs emulates smaller logical block size on backing devices + with larger ones (e.g. 512 byte sectors on 4K native disks). + +config DM_ERA + tristate "Era target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + help + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. + +config DM_CLONE + tristate "Clone target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + help + dm-clone produces a one-to-one copy of an existing, read-only source + device into a writable destination device. The cloned device is + visible/mountable immediately and the copy of the source device to the + destination device happens in the background, in parallel with user + I/O. + + If unsure, say N. + +config DM_MIRROR + tristate "Mirror target" + depends on BLK_DEV_DM + help + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. + +config DM_LOG_USERSPACE + tristate "Mirror userspace logging" + depends on DM_MIRROR && NET + select CONNECTOR + help + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + +config DM_RAID + tristate "RAID 1/4/5/6/10 target" + depends on BLK_DEV_DM + select MD_RAID0 + select MD_RAID1 + select MD_RAID10 + select MD_RAID456 + select BLK_DEV_MD + help + A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + +config DM_ZERO + tristate "Zero target" + depends on BLK_DEV_DM + help + A target that discards writes, and returns all zeroes for + reads. Useful in some recovery situations. + +config DM_MULTIPATH + tristate "Multipath target" + depends on BLK_DEV_DM + # nasty syntax but means make DM_MULTIPATH independent + # of SCSI_DH if the latter isn't defined but if + # it is, DM_MULTIPATH must depend on it. We get a build + # error if SCSI_DH=m and DM_MULTIPATH=y + depends on !SCSI_DH || SCSI + help + Allow volume managers to support multipath hardware. + +config DM_MULTIPATH_QL + tristate "I/O Path Selector based on the number of in-flight I/Os" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path with the least number of in-flight I/Os. + + If unsure, say N. + +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time. + + If unsure, say N. + +config DM_MULTIPATH_HST + tristate "I/O Path Selector based on historical service time" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time by comparing estimated service time (based on historical + service time). + + If unsure, say N. + +config DM_MULTIPATH_IOA + tristate "I/O Path Selector based on CPU submission" + depends on DM_MULTIPATH + help + This path selector selects the path based on the CPU the IO is + executed on and the CPU to path mapping setup at path addition time. + + If unsure, say N. + +config DM_DELAY + tristate "I/O delaying target" + depends on BLK_DEV_DM + help + A target that delays reads and/or writes and can send + them to different devices. Useful for testing. + + If unsure, say N. + +config DM_DUST + tristate "Bad sector simulation target" + depends on BLK_DEV_DM + help + A target that simulates bad sector behavior. + Useful for testing. + + If unsure, say N. + +config DM_INIT + bool "DM \"dm-mod.create=\" parameter support" + depends on BLK_DEV_DM=y + help + Enable "dm-mod.create=" parameter to create mapped devices at init time. + This option is useful to allow mounting rootfs without requiring an + initramfs. + See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." + format. + + If unsure, say N. + +config DM_UEVENT + bool "DM uevents" + depends on BLK_DEV_DM + help + Generate udev events for DM events. + +config DM_FLAKEY + tristate "Flakey target" + depends on BLK_DEV_DM + help + A target that intermittently fails I/O for debugging purposes. + +config DM_VERITY + tristate "Verity target support" + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_HASH + select DM_BUFIO + help + This device-mapper target creates a read-only device that + transparently validates the data on one underlying device against + a pre-generated tree of cryptographic checksums stored on a second + device. + + You'll need to activate the digests you're going to use in the + cryptoapi configuration. + + To compile this code as a module, choose M here: the module will + be called dm-verity. + + If unsure, say N. + +config DM_VERITY_VERIFY_ROOTHASH_SIG + def_bool n + bool "Verity data device root hash signature verification support" + depends on DM_VERITY + select SYSTEM_DATA_VERIFICATION + help + Add ability for dm-verity device to be validated if the + pre-generated tree of cryptographic checksums passed has a pkcs#7 + signature file that can validate the roothash of the tree. + + By default, rely on the builtin trusted keyring. + + If unsure, say N. + +config DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + bool "Verity data device root hash signature verification with secondary keyring" + depends on DM_VERITY_VERIFY_ROOTHASH_SIG + depends on SECONDARY_TRUSTED_KEYRING + help + Rely on the secondary trusted keyring to verify dm-verity signatures. + + If unsure, say N. + +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + help + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + +config DM_SWITCH + tristate "Switch target support (EXPERIMENTAL)" + depends on BLK_DEV_DM + help + This device-mapper target creates a device that supports an arbitrary + mapping of fixed-size regions of I/O across a fixed set of paths. + The path used for any specific region can be switched dynamically + by sending the target a message. + + To compile this code as a module, choose M here: the module will + be called dm-switch. + + If unsure, say N. + +config DM_LOG_WRITES + tristate "Log writes target support" + depends on BLK_DEV_DM + help + This device-mapper target takes two devices, one device to use + normally, one to log all write operations done to the first device. + This is for use by file system developers wishing to verify that + their fs is writing a consistent file system at all times by allowing + them to replay the log in a variety of ways and to check the + contents. + + To compile this code as a module, choose M here: the module will + be called dm-log-writes. + + If unsure, say N. + +config DM_INTEGRITY + tristate "Integrity target support" + depends on BLK_DEV_DM + select BLK_DEV_INTEGRITY + select DM_BUFIO + select CRYPTO + select CRYPTO_SKCIPHER + select ASYNC_XOR + select DM_AUDIT if AUDIT + help + This device-mapper target emulates a block device that has + additional per-sector tags that can be used for storing + integrity information. + + This integrity target is used with the dm-crypt target to + provide authenticated disk encryption or it can be used + standalone. + + To compile this code as a module, choose M here: the module will + be called dm-integrity. + +config DM_ZONED + tristate "Drive-managed zoned block device target support" + depends on BLK_DEV_DM + depends on BLK_DEV_ZONED + select CRC32 + help + This device-mapper target takes a host-managed or host-aware zoned + block device and exposes most of its capacity as a regular block + device (drive-managed zoned block device) without any write + constraints. This is mainly intended for use with file systems that + do not natively support zoned block devices but still want to + benefit from the increased capacity offered by SMR disks. Other uses + by applications using raw block devices (for example object stores) + are also possible. + + To compile this code as a module, choose M here: the module will + be called dm-zoned. + + If unsure, say N. + +config DM_AUDIT + bool "DM audit events" + depends on BLK_DEV_DM + depends on AUDIT + help + Generate audit events for device-mapper. + + Enables audit logging of several security relevant events in the + particular device-mapper targets, especially the integrity target. + +endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile new file mode 100644 index 000000000..84291e38d --- /dev/null +++ b/drivers/md/Makefile @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the kernel software RAID and LVM drivers. +# + +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ + dm-rq.o dm-io-rewind.o +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-historical-service-time-y += dm-ps-historical-service-time.o +dm-io-affinity-y += dm-ps-io-affinity.o +dm-queue-length-y += dm-ps-queue-length.o +dm-round-robin-y += dm-ps-round-robin.o +dm-service-time-y += dm-ps-service-time.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ + dm-snap-persistent.o +dm-mirror-y += dm-raid1.o +dm-log-userspace-y += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o +dm-thin-pool-y += dm-thin.o dm-thin-metadata.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ + dm-cache-background-tracker.o +dm-cache-smq-y += dm-cache-policy-smq.o +dm-ebs-y += dm-ebs-target.o +dm-era-y += dm-era-target.o +dm-clone-y += dm-clone-target.o dm-clone-metadata.o +dm-verity-y += dm-verity-target.o +dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o + +md-mod-y += md.o md-bitmap.o +raid456-y += raid5.o raid5-cache.o raid5-ppl.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o + +# Note: link order is important. All raid personalities +# and must come before md.o, as they each initialise +# themselves, and md.o may use the personalities when it +# auto-initialised. + +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID456) += raid456.o +obj-$(CONFIG_MD_MULTIPATH) += multipath.o +obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_MD_CLUSTER) += md-cluster.o +obj-$(CONFIG_BCACHE) += bcache/ +obj-$(CONFIG_BLK_DEV_MD) += md-mod.o +ifeq ($(CONFIG_BLK_DEV_MD),y) +obj-y += md-autodetect.o +endif +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o +obj-$(CONFIG_DM_BUFIO) += dm-bufio.o +obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o +obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_DUST) += dm-dust.o +obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o +obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o +obj-$(CONFIG_DM_MULTIPATH_IOA) += dm-io-affinity.o +obj-$(CONFIG_DM_SWITCH) += dm-switch.o +obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o +obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o +obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o +obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o +obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_VERITY) += dm-verity.o +obj-$(CONFIG_DM_CACHE) += dm-cache.o +obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o +obj-$(CONFIG_DM_EBS) += dm-ebs.o +obj-$(CONFIG_DM_ERA) += dm-era.o +obj-$(CONFIG_DM_CLONE) += dm-clone.o +obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o +obj-$(CONFIG_DM_ZONED) += dm-zoned.o +obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o +obj-$(CONFIG_SECURITY_LOADPIN_VERITY) += dm-verity-loadpin.o + +ifeq ($(CONFIG_DM_INIT),y) +dm-mod-objs += dm-init.o +endif + +ifeq ($(CONFIG_DM_UEVENT),y) +dm-mod-objs += dm-uevent.o +endif + +ifeq ($(CONFIG_BLK_DEV_ZONED),y) +dm-mod-objs += dm-zone.o +endif + +ifeq ($(CONFIG_IMA),y) +dm-mod-objs += dm-ima.o +endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif + +ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y) +dm-verity-objs += dm-verity-verify-sig.o +endif + +ifeq ($(CONFIG_DM_AUDIT),y) +dm-mod-objs += dm-audit.o +endif diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 000000000..529c9d04e --- /dev/null +++ b/drivers/md/bcache/Kconfig @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config BCACHE + tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS + select CRC64 + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. + + See Documentation/admin-guide/bcache.rst for details. + +config BCACHE_DEBUG + bool "Bcache debugging" + depends on BCACHE + help + Don't select this option unless you're a developer + + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +config BCACHE_CLOSURES_DEBUG + bool "Debug closures" + depends on BCACHE + select DEBUG_FS + help + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. + +config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration" + depends on BCACHE + help + Add a sysfs file /sys/fs/bcache/register_async. Writing registering + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 000000000..5b87e5967 --- /dev/null +++ b/drivers/md/bcache/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_BCACHE) += bcache.o + +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 000000000..ce13c272c --- /dev/null +++ b/drivers/md/bcache/alloc.c @@ -0,0 +1,736 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * There is another freelist, because sometimes we have buckets that we know + * have nothing pointing into them - these we can reuse without waiting for + * priorities to be rewritten. These come from freed btree nodes and buckets + * that garbage collection discovered no longer had valid keys pointing into + * them (because they were overwritten). That's the unused list - buckets on the + * unused list move to the free list, optionally being discarded in the process. + * + * It's also important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch_bucket_alloc() allocates a single bucket from a specific cache. + * + * bch_bucket_alloc_set() allocates one bucket from different caches + * out of a cache set. + * + * free_some_buckets() drives all the processes described above. It's called + * from bch_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcache.h" +#include "btree.h" + +#include +#include +#include +#include + +#define MAX_OPEN_BUCKETS 128 + +/* Bucket heap / gen */ + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) +{ + uint8_t ret = ++b->gen; + + ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); + WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); + + return ret; +} + +void bch_rescale_priorities(struct cache_set *c, int sectors) +{ + struct cache *ca; + struct bucket *b; + unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024; + int r; + + atomic_sub(sectors, &c->rescale); + + do { + r = atomic_read(&c->rescale); + + if (r >= 0) + return; + } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); + + mutex_lock(&c->bucket_lock); + + c->min_prio = USHRT_MAX; + + ca = c->cache; + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } + + mutex_unlock(&c->bucket_lock); +} + +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ + +static inline bool can_inc_bucket_gen(struct bucket *b) +{ + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; +} + +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) +{ + BUG_ON(!ca->set->gc_mark_valid); + + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && + can_inc_bucket_gen(b); +} + +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, b - ca->buckets); + + bch_inc_gen(ca, b); + b->prio = INITIAL_PRIO; + atomic_inc(&b->pin); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + + fifo_push(&ca->free_inc, b - ca->buckets); +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. + */ + +#define bucket_prio(b) \ +({ \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) + +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) + +static void invalidate_buckets_lru(struct cache *ca) +{ + struct bucket *b; + ssize_t i; + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!bch_can_invalidate_bucket(ca, b)) + continue; + + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); + } + } + + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); + + while (!fifo_full(&ca->free_inc)) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + /* + * We don't want to be calling invalidate_buckets() + * multiple times when it can't do anything + */ + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + + bch_invalidate_one_bucket(ca, b); + } +} + +static void invalidate_buckets_fifo(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->sb.first_bucket || + ca->fifo_last_bucket >= ca->sb.nbuckets) + ca->fifo_last_bucket = ca->sb.first_bucket; + + b = ca->buckets + ca->fifo_last_bucket++; + + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets_random(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n; + + get_random_bytes(&n, sizeof(n)); + + n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); + n += ca->sb.first_bucket; + + b = ca->buckets + n; + + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets / 2) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets(struct cache *ca) +{ + BUG_ON(ca->invalidate_needs_gc); + + switch (CACHE_REPLACEMENT(&ca->sb)) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } +} + +#define allocator_wait(ca, cond) \ +do { \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ + if (kthread_should_stop() || \ + test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ + set_current_state(TASK_RUNNING); \ + goto out; \ + } \ + \ + schedule(); \ + mutex_lock(&(ca)->set->bucket_lock); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) + +static int bch_allocator_push(struct cache *ca, long bucket) +{ + unsigned int i; + + /* Prios/gens are actually the most important reserve */ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + return true; + + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) + return true; + + return false; +} + +static int bch_allocator_thread(void *arg) +{ + struct cache *ca = arg; + + mutex_lock(&ca->set->bucket_lock); + + while (1) { + /* + * First, we pull buckets off of the unused and free_inc lists, + * possibly issue discards to them, then we add the bucket to + * the free list: + */ + while (1) { + long bucket; + + if (!fifo_pop(&ca->free_inc, bucket)) + break; + + if (ca->discard) { + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.bucket_size, GFP_KERNEL); + mutex_lock(&ca->set->bucket_lock); + } + + allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); + wake_up(&ca->set->bucket_wait); + } + + /* + * We've run out of free buckets, we need to find some buckets + * we can invalidate. First, invalidate them in memory and add + * them to the free_inc list: + */ + +retry_invalidate: + allocator_wait(ca, ca->set->gc_mark_valid && + !ca->invalidate_needs_gc); + invalidate_buckets(ca); + + /* + * Now, we write their new gens to disk so we can start writing + * new stuff to them: + */ + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); + if (CACHE_SYNC(&ca->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } + } + } +out: + wait_for_kthread_stop(); + return 0; +} + +/* Allocation */ + +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) +{ + DEFINE_WAIT(w); + struct bucket *b; + long r; + + + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) + return -1; + + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) + goto out; + + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); + return -1; + } + + do { + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && + !fifo_pop(&ca->free[reserve], r)); + + finish_wait(&ca->set->bucket_wait, &w); +out: + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); + + trace_bcache_alloc(ca, reserve); + + if (expensive_debug_checks(ca->set)) { + size_t iter; + long i; + unsigned int j; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each(i, &ca->free[j], iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->free_inc, iter) + BUG_ON(i == r); + } + + b = ca->buckets + r; + + BUG_ON(atomic_read(&b->pin) != 1); + + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (reserve <= RESERVE_PRIO) { + SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); + b->prio = INITIAL_PRIO; + } + + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + + return r; +} + +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } +} + +void bch_bucket_free(struct cache_set *c, struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); +} + +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait) +{ + struct cache *ca; + long b; + + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return -1; + + lockdep_assert_held(&c->bucket_lock); + + bkey_init(k); + + ca = c->cache; + b = bch_bucket_alloc(ca, reserve, wait); + if (b == -1) + goto err; + + k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); + + SET_KEY_PTRS(k, 1); + + return 0; +err: + bch_bucket_free(c, k); + bkey_put(c, k); + return -1; +} + +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait) +{ + int ret; + + mutex_lock(&c->bucket_lock); + ret = __bch_bucket_alloc_set(c, reserve, k, wait); + mutex_unlock(&c->bucket_lock); + return ret; +} + +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned int last_write_point; + unsigned int sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we try to segregate flash + * only volume write streams from cached devices, secondly we look for a bucket + * where the last write to it was sequential with the current write, and + * failing that we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, dirty sectors of flash only volume is not reclaimable, if their + * dirty sectors mixed with dirty sectors of cached device, such buckets will + * be marked as dirty and won't be reclaimed, though the dirty data of cached + * device have been written back to backend device. + * + * And say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned int write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) != + UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)])) + continue; + else if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->cache->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, + struct bkey *k, + unsigned int sectors, + unsigned int write_point, + unsigned int write_prio, + bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned int i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned int watermark = write_prio + ? RESERVE_MOVINGGC + : RESERVE_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call pick_data_bucket(). If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &c->cache->sectors_written); + } + + if (b->sectors_free < c->cache->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + +/* Init */ + +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < MAX_OPEN_BUCKETS; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + +int bch_cache_allocator_start(struct cache *ca) +{ + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + + ca->alloc_thread = k; + return 0; +} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h new file mode 100644 index 000000000..e86fa736d --- /dev/null +++ b/drivers/md/bcache/bcache.h @@ -0,0 +1,1049 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_H +#define _BCACHE_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. + * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on cache it, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part design around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field. Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. + * + * There is no "delete" operation; deleting things from the index is + * accomplished by either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten. + * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata. If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data it each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bcache_ondisk.h" +#include "bset.h" +#include "util.h" +#include "closure.h" + +struct bucket { + atomic_t pin; + uint16_t prio; + uint8_t gen; + uint8_t last_gc; /* Most out of date gen in the btree */ + uint16_t gc_mark; /* Bitfield used by GC. See below for field */ +}; + +/* + * I'd use bitfields for these, but I don't trust the compiler not to screw me + * as multiple threads touch struct bucket without locking + */ + +BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 +#define GC_SECTORS_USED_SIZE 13 +#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); + +#include "journal.h" +#include "stats.h" +struct search; +struct btree; +struct keybuf; + +struct keybuf_key { + struct rb_node node; + BKEY_PADDED(key); + void *private; +}; + +struct keybuf { + struct bkey last_scanned; + spinlock_t lock; + + /* + * Beginning and end of range in rb tree - so that we can skip taking + * lock and checking the rb tree when we need to check for overlapping + * keys. + */ + struct bkey start; + struct bkey end; + + struct rb_root keys; + +#define KEYBUF_NR 500 + DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); +}; + +struct bcache_device { + struct closure cl; + + struct kobject kobj; + + struct cache_set *c; + unsigned int id; +#define BCACHEDEVNAME_SIZE 12 + char name[BCACHEDEVNAME_SIZE]; + + struct gendisk *disk; + + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 +#define BCACHE_DEV_WB_RUNNING 3 +#define BCACHE_DEV_RATE_DW_RUNNING 4 + int nr_stripes; +#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) + unsigned int stripe_size; + atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; + + struct bio_set bio_split; + + unsigned int data_csum:1; + + int (*cache_miss)(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors); + int (*ioctl)(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg); +}; + +struct io { + /* Used to track sequential IO so it can be skipped */ + struct hlist_node hash; + struct list_head lru; + + unsigned long jiffies; + unsigned int sequential; + sector_t last; +}; + +enum stop_on_failure { + BCH_CACHED_DEV_STOP_AUTO = 0, + BCH_CACHED_DEV_STOP_ALWAYS, + BCH_CACHED_DEV_STOP_MODE_MAX, +}; + +struct cached_dev { + struct list_head list; + struct bcache_device disk; + struct block_device *bdev; + + struct cache_sb sb; + struct cache_sb_disk *sb_disk; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + struct closure sb_write; + struct semaphore sb_write_mutex; + + /* Refcount on the cache set. Always nonzero when we're caching. */ + refcount_t count; + struct work_struct detach; + + /* + * Device might not be running if it's dirty and the cache set hasn't + * showed up yet. + */ + atomic_t running; + + /* + * Writes take a shared lock from start to finish; scanning for dirty + * data to refill the rb tree requires an exclusive lock. + */ + struct rw_semaphore writeback_lock; + + /* + * Nonzero, and writeback has a refcount (d->count), iff there is dirty + * data in the cache. Protected by writeback_lock; must have an + * shared lock to set and exclusive lock to clear. + */ + atomic_t has_dirty; + +#define BCH_CACHE_READA_ALL 0 +#define BCH_CACHE_READA_META_ONLY 1 + unsigned int cache_readahead_policy; + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; + + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; + struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; + + struct keybuf writeback_keys; + + struct task_struct *status_update_thread; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. (Maintain LBA order; don't allow reads completing out of + * order to re-order the writes...) + */ + struct closure_waitlist writeback_ordering_wait; + atomic_t writeback_sequence_next; + + /* For tracking sequential IO */ +#define RECENT_IO_BITS 7 +#define RECENT_IO (1 << RECENT_IO_BITS) + struct io io[RECENT_IO]; + struct hlist_head io_hash[RECENT_IO + 1]; + struct list_head io_lru; + spinlock_t io_lock; + + struct cache_accounting accounting; + + /* The rest of this all shows up in sysfs */ + unsigned int sequential_cutoff; + + unsigned int io_disable:1; + unsigned int verify:1; + unsigned int bypass_torture_test:1; + + unsigned int partial_stripes_expensive:1; + unsigned int writeback_metadata:1; + unsigned int writeback_running:1; + unsigned int writeback_consider_fragment:1; + unsigned char writeback_percent; + unsigned int writeback_delay; + + uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; + int32_t writeback_rate_change; + + unsigned int writeback_rate_update_seconds; + unsigned int writeback_rate_i_term_inverse; + unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_fp_term_low; + unsigned int writeback_rate_fp_term_mid; + unsigned int writeback_rate_fp_term_high; + unsigned int writeback_rate_minimum; + + enum stop_on_failure stop_when_cache_set_failed; +#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned int error_limit; + unsigned int offline_seconds; + + /* + * Retry to update writeback_rate if contention happens for + * down_read(dc->writeback_lock) in update_writeback_rate() + */ +#define BCH_WBRATE_UPDATE_MAX_SKIPS 15 + unsigned int rate_update_retry; +}; + +enum alloc_reserve { + RESERVE_BTREE, + RESERVE_PRIO, + RESERVE_MOVINGGC, + RESERVE_NONE, + RESERVE_NR, +}; + +struct cache { + struct cache_set *set; + struct cache_sb sb; + struct cache_sb_disk *sb_disk; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + + struct kobject kobj; + struct block_device *bdev; + + struct task_struct *alloc_thread; + + struct closure prio; + struct prio_set *disk_buckets; + + /* + * When allocating new buckets, prio_write() gets first dibs - since we + * may not be allocate at all without writing priorities and gens. + * prio_last_buckets[] contains the last buckets we wrote priorities to + * (so gc can mark them as metadata), prio_buckets[] contains the + * buckets allocated for the next prio write. + */ + uint64_t *prio_buckets; + uint64_t *prio_last_buckets; + + /* + * free: Buckets that are ready to be used + * + * free_inc: Incoming buckets - these are buckets that currently have + * cached data in them, and we can't reuse them until after we write + * their new gen to disk. After prio_write() finishes writing the new + * gens/prios, they'll be moved to the free list (and possibly discarded + * in the process) + */ + DECLARE_FIFO(long, free)[RESERVE_NR]; + DECLARE_FIFO(long, free_inc); + + size_t fifo_last_bucket; + + /* Allocation stuff: */ + struct bucket *buckets; + + DECLARE_HEAP(struct bucket *, heap); + + /* + * If nonzero, we know we aren't going to find any buckets to invalidate + * until a gc finishes - otherwise we could pointlessly burn a ton of + * cpu + */ + unsigned int invalidate_needs_gc; + + bool discard; /* Get rid of? */ + + struct journal_device journal; + + /* The rest of this all shows up in sysfs */ +#define IO_ERROR_SHIFT 20 + atomic_t io_errors; + atomic_t io_count; + + atomic_long_t meta_sectors_written; + atomic_long_t btree_sectors_written; + atomic_long_t sectors_written; +}; + +struct gc_stat { + size_t nodes; + size_t nodes_pre; + size_t key_bytes; + + size_t nkeys; + uint64_t data; /* sectors */ + unsigned int in_use; /* percent */ +}; + +/* + * Flag bits, for how the cache set is shutting down, and what phase it's at: + * + * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). + * + * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; + * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. + * flushing dirty data). + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. + * + * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all + * external and internal I/O should be denied when this flag is set. + * + */ +#define CACHE_SET_UNREGISTERING 0 +#define CACHE_SET_STOPPING 1 +#define CACHE_SET_RUNNING 2 +#define CACHE_SET_IO_DISABLE 3 + +struct cache_set { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct dentry *debug; + struct cache_accounting accounting; + + unsigned long flags; + atomic_t idle_counter; + atomic_t at_max_writeback_rate; + + struct cache *cache; + + struct bcache_device **devices; + unsigned int devices_max_used; + atomic_t attached_dev_nr; + struct list_head cached_devs; + uint64_t cached_dev_sectors; + atomic_long_t flash_dev_dirty_sectors; + struct closure caching; + + struct closure sb_write; + struct semaphore sb_write_mutex; + + mempool_t search; + mempool_t bio_meta; + struct bio_set bio_split; + + /* For the btree cache */ + struct shrinker shrink; + + /* For the btree cache and anything allocation related */ + struct mutex bucket_lock; + + /* log2(bucket_size), in sectors */ + unsigned short bucket_bits; + + /* log2(block_size), in sectors */ + unsigned short block_bits; + + /* + * Default number of pages for a new btree node - may be less than a + * full bucket + */ + unsigned int btree_pages; + + /* + * Lists of struct btrees; lru is the list for structs that have memory + * allocated for actual btree node, freed is for structs that do not. + * + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned int btree_cache_used; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; + + /* + * When we free a btree node, we increment the gen of the bucket the + * node is in - but we can't rewrite the prios and gens until we + * finished whatever it is we were doing, otherwise after a crash the + * btree node would be freed but for say a split, we might not have the + * pointers to the new nodes inserted into the btree yet. + * + * This is a refcount that blocks prio_write() until the new keys are + * written. + */ + atomic_t prio_blocked; + wait_queue_head_t bucket_wait; + + /* + * For any bio we don't skip we subtract the number of sectors from + * rescale; when it hits 0 we rescale all the bucket priorities. + */ + atomic_t rescale; + /* + * used for GC, identify if any front side I/Os is inflight + */ + atomic_t search_inflight; + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. + */ + uint16_t min_prio; + + /* + * max(gen - last_gc) for all buckets. When it gets too big we have to + * gc to keep gens from wrapping around. + */ + uint8_t need_gc; + struct gc_stat gc_stats; + size_t nbuckets; + size_t avail_nbuckets; + + struct task_struct *gc_thread; + /* Where in the btree gc currently is */ + struct bkey gc_done; + + /* + * For automatical garbage collection after writeback completed, this + * varialbe is used as bit fields, + * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback + * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback + * This is an optimization for following write request after writeback + * finished, but read hit rate dropped due to clean data on cache is + * discarded. Unless user explicitly sets it via sysfs, it won't be + * enabled. + */ +#define BCH_ENABLE_AUTO_GC 1 +#define BCH_DO_AUTO_GC 2 + uint8_t gc_after_writeback; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. Protected by bucket_lock. + */ + int gc_mark_valid; + + /* Counts how many sectors bio_insert has added to the cache */ + atomic_t sectors_to_gc; + wait_queue_head_t gc_wait; + + struct keybuf moving_gc_keys; + /* Number of moving GC bios in flight */ + struct semaphore moving_in_flight; + + struct workqueue_struct *moving_gc_wq; + + struct btree *root; + +#ifdef CONFIG_BCACHE_DEBUG + struct btree *verify_data; + struct bset *verify_ondisk; + struct mutex verify_lock; +#endif + + uint8_t set_uuid[16]; + unsigned int nr_uuids; + struct uuid_entry *uuids; + BKEY_PADDED(uuid_bucket); + struct closure uuid_write; + struct semaphore uuid_write_mutex; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them. + * bch_cache_set_alloc() will make sure the pool can allocate iterators + * equipped with enough room that can host + * (sb.bucket_size / sb.block_size) + * btree_iter_sets, which is more than static MAX_BSETS. + */ + mempool_t fill_iter; + + struct bset_sort_state sort; + + /* List of buckets we're currently writing data to */ + struct list_head data_buckets; + spinlock_t data_bucket_lock; + + struct journal journal; + +#define CONGESTED_MAX 1024 + unsigned int congested_last_us; + atomic_t congested; + + /* The rest of this all shows up in sysfs */ + unsigned int congested_read_threshold_us; + unsigned int congested_write_threshold_us; + + struct time_stats btree_gc_time; + struct time_stats btree_split_time; + struct time_stats btree_read_time; + + atomic_long_t cache_read_races; + atomic_long_t writeback_keys_done; + atomic_long_t writeback_keys_failed; + + atomic_long_t reclaim; + atomic_long_t reclaimed_journal_buckets; + atomic_long_t flush_write; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; +#define DEFAULT_IO_ERROR_LIMIT 8 + unsigned int error_limit; + unsigned int error_decay; + + unsigned short journal_delay_ms; + bool expensive_debug_checks; + unsigned int verify:1; + unsigned int key_merging_disabled:1; + unsigned int gc_always_rewrite:1; + unsigned int shrinker_disabled:1; + unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; + +#define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +}; + +struct bbio { + unsigned int submit_time_us; + union { + struct bkey key; + uint64_t _pad[3]; + /* + * We only need pad = 3 here because we only ever carry around a + * single pointer - i.e. the pointer we're doing io to/from. + */ + }; + struct bio bio; +}; + +#define BTREE_PRIO USHRT_MAX +#define INITIAL_PRIO 32768U + +#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) +#define btree_blocks(b) \ + ((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + +#define btree_default_blocks(c) \ + ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + +#define bucket_bytes(ca) ((ca)->sb.bucket_size << 9) +#define block_bytes(ca) ((ca)->sb.block_size << 9) + +static inline unsigned int meta_bucket_pages(struct cache_sb *sb) +{ + unsigned int n, max_pages; + + max_pages = min_t(unsigned int, + __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS, + MAX_ORDER_NR_PAGES); + + n = sb->bucket_size / PAGE_SECTORS; + if (n > max_pages) + n = max_pages; + + return n; +} + +static inline unsigned int meta_bucket_bytes(struct cache_sb *sb) +{ + return meta_bucket_pages(sb) << PAGE_SHIFT; +} + +#define prios_per_bucket(ca) \ + ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \ + sizeof(struct bucket_disk)) + +#define prio_buckets(ca) \ + DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca)) + +static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) +{ + return s >> c->bucket_bits; +} + +static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) +{ + return ((sector_t) b) << c->bucket_bits; +} + +static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) +{ + return s & (c->cache->sb.bucket_size - 1); +} + +static inline size_t PTR_BUCKET_NR(struct cache_set *c, + const struct bkey *k, + unsigned int ptr) +{ + return sector_to_bucket(c, PTR_OFFSET(k, ptr)); +} + +static inline struct bucket *PTR_BUCKET(struct cache_set *c, + const struct bkey *k, + unsigned int ptr) +{ + return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); +} + +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + + return r > 128U ? 0 : r; +} + +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned int i) +{ + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); +} + +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned int i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; +} + +/* Btree key macros */ + +/* + * This is used for various on disk data structures - cache_sb, prio_set, bset, + * jset: The checksum is _always_ the first 8 bytes of these structs + */ +#define csum_set(i) \ + bch_crc64(((void *) (i)) + sizeof(uint64_t), \ + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) + +/* Error handling macros */ + +#define btree_bug(b, ...) \ +do { \ + if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define cache_bug(c, ...) \ +do { \ + if (bch_cache_set_error(c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define btree_bug_on(cond, b, ...) \ +do { \ + if (cond) \ + btree_bug(b, __VA_ARGS__); \ +} while (0) + +#define cache_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + cache_bug(c, __VA_ARGS__); \ +} while (0) + +#define cache_set_err_on(cond, c, ...) \ +do { \ + if (cond) \ + bch_cache_set_error(c, __VA_ARGS__); \ +} while (0) + +/* Looping macros */ + +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + +static inline void cached_dev_put(struct cached_dev *dc) +{ + if (refcount_dec_and_test(&dc->count)) + schedule_work(&dc->detach); +} + +static inline bool cached_dev_get(struct cached_dev *dc) +{ + if (!refcount_inc_not_zero(&dc->count)) + return false; + + /* Paired with the mb in cached_dev_attach */ + smp_mb__after_atomic(); + return true; +} + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree (last_gc). + */ + +static inline uint8_t bucket_gc_gen(struct bucket *b) +{ + return b->gen - b->last_gc; +} + +#define BUCKET_GC_GEN_MAX 96U + +#define kobj_attribute_write(n, fn) \ + static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn) + +#define kobj_attribute_rw(n, show, store) \ + static struct kobj_attribute ksysfs_##n = \ + __ATTR(n, 0600, show, store) + +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca = c->cache; + + wake_up_process(ca->alloc_thread); +} + +static inline void closure_bio_submit(struct cache_set *c, + struct bio *bio, + struct closure *cl) +{ + closure_get(cl); + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + submit_bio_noacct(bio); +} + +/* + * Prevent the kthread exits directly, and make sure when kthread_stop() + * is called to stop a kthread, it is still alive. If a kthread might be + * stopped by CACHE_SET_IO_DISABLE bit set, wait_for_kthread_stop() is + * necessary before the kthread returns. + */ +static inline void wait_for_kthread_stop(void) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + +/* Forward declarations */ + +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); +void bch_count_io_errors(struct cache *ca, blk_status_t error, + int is_read, const char *m); +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_free(struct bio *bio, struct cache_set *c); +struct bio *bch_bbio_alloc(struct cache_set *c); + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c); +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned int ptr); + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b); +void bch_rescale_priorities(struct cache_set *c, int sectors); + +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b); +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b); + +void __bch_bucket_free(struct cache *ca, struct bucket *b); +void bch_bucket_free(struct cache_set *c, struct bkey *k); + +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait); +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait); +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); +bool bch_cached_dev_error(struct cached_dev *dc); + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); + +int bch_prio_write(struct cache *ca, bool wait); +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); + +extern struct workqueue_struct *bcache_wq; +extern struct workqueue_struct *bch_journal_wq; +extern struct workqueue_struct *bch_flush_wq; +extern struct mutex bch_register_lock; +extern struct list_head bch_cache_sets; + +extern struct kobj_type bch_cached_dev_ktype; +extern struct kobj_type bch_flash_dev_ktype; +extern struct kobj_type bch_cache_set_ktype; +extern struct kobj_type bch_cache_set_internal_ktype; +extern struct kobj_type bch_cache_ktype; + +void bch_cached_dev_release(struct kobject *kobj); +void bch_flash_dev_release(struct kobject *kobj); +void bch_cache_set_release(struct kobject *kobj); +void bch_cache_release(struct kobject *kobj); + +int bch_uuid_write(struct cache_set *c); +void bcache_write_super(struct cache_set *c); + +int bch_flash_dev_create(struct cache_set *c, uint64_t size); + +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); +void bch_cached_dev_detach(struct cached_dev *dc); +int bch_cached_dev_run(struct cached_dev *dc); +void bcache_device_stop(struct bcache_device *d); + +void bch_cache_set_unregister(struct cache_set *c); +void bch_cache_set_stop(struct cache_set *c); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); +void bch_btree_cache_free(struct cache_set *c); +int bch_btree_cache_alloc(struct cache_set *c); +void bch_moving_init_cache_set(struct cache_set *c); +int bch_open_buckets_alloc(struct cache_set *c); +void bch_open_buckets_free(struct cache_set *c); + +int bch_cache_allocator_start(struct cache *ca); + +void bch_debug_exit(void); +void bch_debug_init(void); +void bch_request_exit(void); +int bch_request_init(void); +void bch_btree_exit(void); +int bch_btree_init(void); + +#endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h new file mode 100644 index 000000000..f96034e0b --- /dev/null +++ b/drivers/md/bcache/bcache_ondisk.h @@ -0,0 +1,446 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned int i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 +#define KEY_MAX_U64S 8 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(__PAD0, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(__PAD1, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define MAKE_PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) unsafe_memcpy(_dest, _src, bkey_bytes(_src), \ + /* bkey is always padded */) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) +{ + __u64 *d = (void *) k; + + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 + +#define SB_SECTOR 8 +#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb_disk { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __le64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __le64 flags; + __le64 seq; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; + + union { + struct { + /* Cache devices */ + __le64 nbuckets; /* device size */ + + __le16 block_size; /* sectors */ + __le16 bucket_size; /* sectors */ + + __le16 nr_in_set; + __le16 nr_this_dev; + }; + struct { + /* Backing devices */ + __le64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __le32 last_mount; /* time overflow in y2106 */ + + __le16 first_bucket; + union { + __le16 njournal_buckets; + __le16 keys; + }; + __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 obso_bucket_size_hi; /* obsoleted */ +}; + +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ +struct cache_sb { + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 nr_in_set; + __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time overflow in y2106 */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; /* time overflow in y2106 */ + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c new file mode 100644 index 000000000..2bba4d6aa --- /dev/null +++ b/drivers/md/bcache/bset.c @@ -0,0 +1,1390 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt, __func__ + +#include "util.h" +#include "bset.h" + +#include +#include +#include +#include + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set) +{ + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + pr_err("block %u key %u/%u: ", set, + (unsigned int) ((u64 *) k - i->d), i->keys); + + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + pr_cont("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + pr_err("Key skipped backwards\n"); + } +} + +void bch_dump_bucket(struct btree_keys *b) +{ + unsigned int i; + + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} + +int __bch_count_data(struct btree_keys *b) +{ + unsigned int ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; + + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + panic("bch_check_keys error: %s:\n", err); +} + +static void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? + &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} + +#else + +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#endif + +/* Keylists */ + +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; + uint64_t *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= KEYLIST_INLINE || + roundup_pow_of_two(oldsize) == newsize) + return 0; + + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); + + if (!new_keys) + return -ENOMEM; + + if (!old_keys) + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +/* Pop the top key of keylist by pointing l->top to its previous key */ +struct bkey *bch_keylist_pop(struct keylist *l) +{ + struct bkey *k = l->keys; + + if (k == l->top) + return NULL; + + while (bkey_next(k) != l->top) + k = bkey_next(k); + + return l->top = k; +} + +/* Pop the bottom key of keylist and update l->top_p */ +void bch_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bkey_u64s(l->keys); + + memmove(l->keys, + bkey_next(l->keys), + bch_keylist_bytes(l)); +} + +/* Key/pointer manipulation */ + +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned int i) +{ + BUG_ON(i > KEY_PTRS(src)); + + /* Only copy the header, key, and one pointer. */ + memcpy(dest, src, 2 * sizeof(uint64_t)); + dest->ptr[0] = src->ptr[i]; + SET_KEY_PTRS(dest, 1); + /* We didn't copy the checksum so clear that bit. */ + SET_KEY_CSUM(dest, 0); +} + +bool __bch_cut_front(const struct bkey *where, struct bkey *k) +{ + unsigned int i, len = 0; + + if (bkey_cmp(where, &START_KEY(k)) <= 0) + return false; + + if (bkey_cmp(where, k) < 0) + len = KEY_OFFSET(k) - KEY_OFFSET(where); + else + bkey_copy_key(k, where); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +bool __bch_cut_back(const struct bkey *where, struct bkey *k) +{ + unsigned int len = 0; + + if (bkey_cmp(where, k) >= 0) + return false; + + BUG_ON(KEY_INODE(where) != KEY_INODE(k)); + + if (bkey_cmp(where, &START_KEY(k)) > 0) + len = KEY_OFFSET(where) - KEY_START(k); + + bkey_copy_key(k, where); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned int exponent:BKEY_EXPONENT_BITS; + unsigned int m:BKEY_MID_BITS; + unsigned int mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree_keys *b) +{ + return PAGE_SIZE << b->page_order; +} + +static inline size_t btree_keys_cachelines(struct btree_keys *b) +{ + return btree_keys_bytes(b) / BSET_CACHELINE; +} + +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} + +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} + +/* Memory allocation */ + +void bch_btree_keys_free(struct btree_keys *b) +{ + struct bset_tree *t = b->set; + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; +} + +int bch_btree_keys_alloc(struct btree_keys *b, + unsigned int page_order, + gfp_t gfp) +{ + struct bset_tree *t = b->set; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; +} + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* + * struct btree_keys in embedded in struct btree, and struct + * bset_tree is embedded into struct btree_keys. They are all + * initialized as 0 by kzalloc() in mca_bucket_alloc(), and + * b->set[0].data is allocated in bch_btree_keys_alloc(), so we + * don't have to initiate b->set[].size and b->set[].data here + * any more. + */ +} + +/* Binary tree stuff for auxiliary search trees */ + +/* + * return array index next to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ +static unsigned int inorder_next(unsigned int j, unsigned int size) +{ + if (j * 2 + 1 < size) { + j = j * 2 + 1; + + while (j * 2 < size) + j *= 2; + } else + j >>= ffz(j) + 1; + + return j; +} + +/* + * return array index previous to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ +static unsigned int inorder_prev(unsigned int j, unsigned int size) +{ + if (j * 2 < size) { + j = j * 2; + + while (j * 2 + 1 < size) + j = j * 2 + 1; + } else + j >>= ffs(j); + + return j; +} + +/* + * I have no idea why this code works... and I'm the one who wrote it + * + * However, I do know what it does: + * Given a binary tree constructed in an array (i.e. how you normally implement + * a heap), it converts a node in the tree - referenced by array index - to the + * index it would have if you did an inorder traversal. + * + * Also tested for every j, size up to size somewhere around 6 million. + * + * The binary tree starts at array index 1, not 0 + * extra is a function of size: + * extra = (size - rounddown_pow_of_two(size - 1)) << 1; + */ +static unsigned int __to_inorder(unsigned int j, + unsigned int size, + unsigned int extra) +{ + unsigned int b = fls(j); + unsigned int shift = fls(size - 1) - b; + + j ^= 1U << (b - 1); + j <<= 1; + j |= 1; + j <<= shift; + + if (j > extra) + j -= (j - extra) >> 1; + + return j; +} + +/* + * Return the cacheline index in bset_tree->data, where j is index + * from a linear array which stores the auxiliar binary tree + */ +static unsigned int to_inorder(unsigned int j, struct bset_tree *t) +{ + return __to_inorder(j, t->size, t->extra); +} + +static unsigned int __inorder_to_tree(unsigned int j, + unsigned int size, + unsigned int extra) +{ + unsigned int shift; + + if (j > extra) + j += j - extra; + + shift = ffs(j); + + j >>= shift; + j |= roundup_pow_of_two(size) >> shift; + + return j; +} + +/* + * Return an index from a linear array which stores the auxiliar binary + * tree, j is the cacheline index of t->data. + */ +static unsigned int inorder_to_tree(unsigned int j, struct bset_tree *t) +{ + return __inorder_to_tree(j, t->size, t->extra); +} + +#if 0 +void inorder_test(void) +{ + unsigned long done = 0; + ktime_t start = ktime_get(); + + for (unsigned int size = 2; + size < 65536000; + size++) { + unsigned int extra = + (size - rounddown_pow_of_two(size - 1)) << 1; + unsigned int i = 1, j = rounddown_pow_of_two(size - 1); + + if (!(size % 4096)) + pr_notice("loop %u, %llu per us\n", size, + done / ktime_us_delta(ktime_get(), start)); + + while (1) { + if (__inorder_to_tree(i, size, extra) != j) + panic("size %10u j %10u i %10u", size, j, i); + + if (__to_inorder(j, size, extra) != i) + panic("size %10u j %10u i %10u", size, j, i); + + if (j == rounddown_pow_of_two(size) - 1) + break; + + BUG_ON(inorder_prev(inorder_next(j, size), size) != j); + + j = inorder_next(j, size); + i++; + } + + done += size - 1; + } +} +#endif + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; to_inorder() gives us the cacheline, and then + * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. + */ + +static struct bkey *cacheline_to_bkey(struct bset_tree *t, + unsigned int cacheline, + unsigned int offset) +{ + return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; +} + +static unsigned int bkey_to_cacheline(struct bset_tree *t, struct bkey *k) +{ + return ((void *) k - (void *) t->data) / BSET_CACHELINE; +} + +static unsigned int bkey_to_cacheline_offset(struct bset_tree *t, + unsigned int cacheline, + struct bkey *k) +{ + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); +} + +static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned int j) +{ + return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); +} + +static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned int j) +{ + return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. + */ +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned int cacheline) +{ + return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); +} + +static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) +{ + low >>= shift; + low |= (high << 1) << (63U - shift); + return low; +} + +/* + * Calculate mantissa value for struct bkey_float. + * If most significant bit of f->exponent is not set, then + * - f->exponent >> 6 is 0 + * - p[0] points to bkey->low + * - p[-1] borrows bits from KEY_INODE() of bkey->high + * if most isgnificant bits of f->exponent is set, then + * - f->exponent >> 6 is 1 + * - p[0] points to bits from KEY_INODE() of bkey->high + * - p[-1] points to other bits from KEY_INODE() of + * bkey->high too. + * See make_bfloat() to check when most significant bit of f->exponent + * is set or not. + */ +static inline unsigned int bfloat_mantissa(const struct bkey *k, + struct bkey_float *f) +{ + const uint64_t *p = &k->low - (f->exponent >> 6); + + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; +} + +static void make_bfloat(struct bset_tree *t, unsigned int j) +{ + struct bkey_float *f = &t->tree[j]; + struct bkey *m = tree_to_bkey(t, j); + struct bkey *p = tree_to_prev_bkey(t, j); + + struct bkey *l = is_power_of_2(j) + ? t->data->start + : tree_to_prev_bkey(t, j >> ffs(j)); + + struct bkey *r = is_power_of_2(j + 1) + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) + : tree_to_bkey(t, j >> (ffz(j) + 1)); + + BUG_ON(m < l || m > r); + BUG_ON(bkey_next(p) != m); + + /* + * If l and r have different KEY_INODE values (different backing + * device), f->exponent records how many least significant bits + * are different in KEY_INODE values and sets most significant + * bits to 1 (by +64). + * If l and r have same KEY_INODE value, f->exponent records + * how many different bits in least significant bits of bkey->low. + * See bfloat_mantiss() how the most significant bit of + * f->exponent is used to calculate bfloat mantissa value. + */ + if (KEY_INODE(l) != KEY_INODE(r)) + f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; + else + f->exponent = fls64(r->low ^ l->low); + + f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0); + + /* + * Setting f->exponent = 127 flags this node as failed, and causes the + * lookup code to fall back to comparing against the original key. + */ + + if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f)) + f->mantissa = bfloat_mantissa(m, f) - 1; + else + f->exponent = 127; +} + +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) +{ + if (t != b->set) { + unsigned int j = roundup(t[-1].size, + 64 / sizeof(struct bkey_float)); + + t->tree = t[-1].tree + j; + t->prev = t[-1].prev + j; + } + + while (t < b->set + MAX_BSETS) + t++->size = 0; +} + +static void bch_bset_build_unwritten_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; + + bset_alloc_tree(b, t); + + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); + t->size = 1; + } +} + +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) +{ + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} + +/* + * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to + * accelerate bkey search in a btree node (pointed by bset_tree->data in + * memory). After search in the auxiliar tree by calling bset_search_tree(), + * a struct bset_search_iter is returned which indicates range [l, r] from + * bset_tree->data where the searching bkey might be inside. Then a followed + * linear comparison does the exact search, see __bch_bset_search() for how + * the auxiliary tree is used. + */ +void bch_bset_build_written_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct bkey *prev = NULL, *k = t->data->start; + unsigned int j, cacheline = 1; + + b->last_set_unwritten = 0; + + bset_alloc_tree(b, t); + + t->size = min_t(unsigned int, + bkey_to_cacheline(t, bset_bkey_last(t->data)), + b->set->tree + btree_keys_cachelines(b) - t->tree); + + if (t->size < 2) { + t->size = 0; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) { + while (bkey_to_cacheline(t, k) < cacheline) { + prev = k; + k = bkey_next(k); + } + + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); + } + + while (bkey_next(k) != bset_bkey_last(t->data)) + k = bkey_next(k); + + t->end = *k; + + /* Then we build the tree */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) + make_bfloat(t, j); +} + +/* Insert */ + +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) +{ + struct bset_tree *t; + unsigned int inorder, j = 1; + + for (t = b->set; t <= bset_tree_last(b); t++) + if (k < bset_bkey_last(t->data)) + goto found_set; + + BUG(); +found_set: + if (!t->size || !bset_written(b, t)) + return; + + inorder = bkey_to_cacheline(t, k); + + if (k == t->data->start) + goto fix_left; + + if (bkey_next(k) == bset_bkey_last(t->data)) { + t->end = *k; + goto fix_right; + } + + j = inorder_to_tree(inorder, t); + + if (j && + j < t->size && + k == tree_to_bkey(t, j)) +fix_left: do { + make_bfloat(t, j); + j = j * 2; + } while (j < t->size); + + j = inorder_to_tree(inorder + 1, t); + + if (j && + j < t->size && + k == tree_to_prev_bkey(t, j)) +fix_right: do { + make_bfloat(t, j); + j = j * 2 + 1; + } while (j < t->size); +} + +static void bch_bset_fix_lookup_table(struct btree_keys *b, + struct bset_tree *t, + struct bkey *k) +{ + unsigned int shift = bkey_u64s(k); + unsigned int j = bkey_to_cacheline(t, k); + + /* We're getting called from btree_split() or btree_gc, just bail out */ + if (!t->size) + return; + + /* + * k is the key we just inserted; we need to find the entry in the + * lookup table for the first key that is strictly greater than k: + * it's either k's cacheline or the next one + */ + while (j < t->size && + table_to_bkey(t, j) <= k) + j++; + + /* + * Adjust all the lookup table entries, and find a new key for any that + * have gotten too big + */ + for (; j < t->size; j++) { + t->prev[j] += shift; + + if (t->prev[j] > 7) { + k = table_to_bkey(t, j - 1); + + while (k < cacheline_to_bkey(t, j, 0)) + k = bkey_next(k); + + t->prev[j] = bkey_to_cacheline_offset(t, j, k); + } + } + + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) + return; + + /* Possibly add a new entry to the end of the lookup table */ + + for (k = table_to_bkey(t, t->size - 1); + k != bset_bkey_last(t->data); + k = bkey_next(k)) + if (t->size == bkey_to_cacheline(t, k)) { + t->prev[t->size] = + bkey_to_cacheline_offset(t, t->size, k); + t->size++; + } +} + +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) +{ + if (!b->ops->key_merge) + return false; + + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + return b->ops->key_merge(b, l, r); +} + +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(!b->last_set_unwritten); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); + + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); +} + +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; + struct bkey preceding_key_on_stack = ZERO_KEY; + struct bkey *preceding_key_p = &preceding_key_on_stack; + + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + /* + * If k has preceding key, preceding_key_p will be set to address + * of k's preceding key; otherwise preceding_key_p will be set + * to NULL inside preceding_key(). + */ + if (b->ops->is_extents) + preceding_key(&START_KEY(k), &preceding_key_p); + else + preceding_key(k, &preceding_key_p); + + m = bch_btree_iter_init(b, &iter, preceding_key_p); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; + + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) { + prev = m; + m = bkey_next(m); + } + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; +} + +/* Lookup */ + +struct bset_search_iter { + struct bkey *l, *r; +}; + +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, + const struct bkey *search) +{ + unsigned int li = 0, ri = t->size; + + while (li + 1 != ri) { + unsigned int m = (li + ri) >> 1; + + if (bkey_cmp(table_to_bkey(t, m), search) > 0) + ri = m; + else + li = m; + } + + return (struct bset_search_iter) { + table_to_bkey(t, li), + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) + }; +} + +static struct bset_search_iter bset_search_tree(struct bset_tree *t, + const struct bkey *search) +{ + struct bkey *l, *r; + struct bkey_float *f; + unsigned int inorder, j, n = 1; + + do { + unsigned int p = n << 4; + + if (p < t->size) + prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; + + if (likely(f->exponent != 127)) { + if (f->mantissa >= bfloat_mantissa(search, f)) + n = j * 2; + else + n = j * 2 + 1; + } else { + if (bkey_cmp(tree_to_bkey(t, j), search) > 0) + n = j * 2; + else + n = j * 2 + 1; + } + } while (n < t->size); + + inorder = to_inorder(j, t); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (n & 1) { + l = cacheline_to_bkey(t, inorder, f->m); + + if (++inorder != t->size) { + f = &t->tree[inorder_next(j, t->size)]; + r = cacheline_to_bkey(t, inorder, f->m); + } else + r = bset_bkey_last(t->data); + } else { + r = cacheline_to_bkey(t, inorder, f->m); + + if (--inorder) { + f = &t->tree[inorder_prev(j, t->size)]; + l = cacheline_to_bkey(t, inorder, f->m); + } else + l = t->data->start; + } + + return (struct bset_search_iter) {l, r}; +} + +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, + const struct bkey *search) +{ + struct bset_search_iter i; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). + * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + if (unlikely(!t->size)) { + i.l = t->data->start; + i.r = bset_bkey_last(t->data); + } else if (bset_written(b, t)) { + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (unlikely(bkey_cmp(search, &t->end) >= 0)) + return bset_bkey_last(t->data); + + if (unlikely(bkey_cmp(search, t->data->start) < 0)) + return t->data->start; + + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); + + i = bset_search_write_set(t, search); + } + + if (btree_keys_expensive_checks(b)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); + + BUG_ON(i.r != bset_bkey_last(t->data) && + bkey_cmp(i.r, search) <= 0); + } + + while (likely(i.l != i.r) && + bkey_cmp(i.l, search) <= 0) + i.l = bkey_next(i.l); + + return i.l; +} + +/* Btree iterator */ + +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + return bkey_cmp(l.k, r.k) > 0; +} + +static inline bool btree_iter_end(struct btree_iter *iter) +{ + return !iter->used; +} + +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end) +{ + if (k != end) + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); +} + +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) +{ + struct bkey *ret = NULL; + + iter->size = ARRAY_SIZE(iter->data); + iter->used = 0; + +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + + for (; start <= bset_tree_last(b); start++) { + ret = bch_bset_search(b, start, search); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + } + + return ret; +} + +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->set); +} + +static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) +{ + struct btree_iter_set b __maybe_unused; + struct bkey *ret = NULL; + + if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); + + if (iter->data->k > iter->data->end) { + WARN_ONCE(1, "bset was corrupt!\n"); + iter->data->k = iter->data->end; + } + + if (iter->data->k == iter->data->end) + heap_pop(iter, b, cmp); + else + heap_sift(iter, 0, cmp); + } + + return ret; +} + +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} + +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree_keys *b, ptr_filter_fn fn) +{ + struct bkey *ret; + + do { + ret = bch_btree_iter_next(iter); + } while (ret && fn(b, ret)); + + return ret; +} + +/* Mergesort */ + +void bch_bset_sort_state_free(struct bset_sort_state *state) +{ + mempool_exit(&state->pool); +} + +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order) +{ + spin_lock_init(&state->time.lock); + + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); + + return mempool_init_page_pool(&state->pool, 1, page_order); +} + +static void btree_mergesort(struct btree_keys *b, struct bset *out, + struct btree_iter *iter, + bool fixup, bool remove_stale) +{ + int i; + struct bkey *k, *last = NULL; + BKEY_PADDED(k) tmp; + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale + ? bch_ptr_bad + : bch_ptr_invalid; + + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); + + while (!btree_iter_end(iter)) { + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); + + if (bad(b, k)) + continue; + + if (!last) { + last = out->start; + bkey_copy(last, k); + } else if (!bch_bkey_try_merge(b, last, k)) { + last = bkey_next(last); + bkey_copy(last, k); + } + } + + out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; + + pr_debug("sorted %i keys\n", out->keys); +} + +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, + unsigned int start, unsigned int order, bool fixup, + struct bset_sort_state *state) +{ + uint64_t start_time; + bool used_mempool = false; + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, + order); + if (!out) { + struct page *outp; + + BUG_ON(order > state->page_order); + + outp = mempool_alloc(&state->pool, GFP_NOIO); + out = page_address(outp); + used_mempool = true; + order = state->page_order; + } + + start_time = local_clock(); + + btree_mergesort(b, out, iter, fixup, false); + b->nsets = start; + + if (!start && order == b->page_order) { + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + * + * Don't worry event 'out' is allocated from mempool, it can + * still be swapped here. Because state->pool is a page mempool + * created by mempool_init_page_pool(), which allocates + * pages by alloc_pages() indeed. + */ + + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); + } else { + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, + (void *) bset_bkey_last(out) - (void *) out->start); + } + + if (used_mempool) + mempool_free(virt_to_page(out), &state->pool); + else + free_pages((unsigned long) out, order); + + bch_bset_build_written_tree(b); + + if (!start) + bch_time_stats_update(&state->time, start_time); +} + +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, + struct bset_sort_state *state) +{ + size_t order = b->page_order, keys = 0; + struct btree_iter iter; + int oldsize = bch_count_data(b); + + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + + if (start) { + unsigned int i; + + for (i = start; i <= b->nsets; i++) + keys += b->set[i].data->keys; + + order = get_order(__set_bytes(b->set->data, keys)); + } + + __btree_sort(b, &iter, start, order, false, state); + + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); +} + +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state) +{ + __btree_sort(b, iter, 0, b->page_order, true, state); +} + +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state) +{ + uint64_t start_time = local_clock(); + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, NULL); + + btree_mergesort(b, new->set->data, &iter, false, true); + + bch_time_stats_update(&state->time, start_time); + + new->set->size = 0; // XXX: why? +} + +#define SORT_CRIT (4096 / sizeof(uint64_t)) + +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) +{ + unsigned int crit = SORT_CRIT; + int i; + + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; + + for (i = b->nsets - 1; i >= 0; --i) { + crit *= state->crit_factor; + + if (b->set[i].data->keys < crit) { + bch_btree_sort_partial(b, i, state); + return; + } + } + + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b, state); + return; + } + +out: + bch_bset_build_written_tree(b); +} + +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) +{ + unsigned int i; + + for (i = 0; i <= b->nsets; i++) { + struct bset_tree *t = &b->set[i]; + size_t bytes = t->data->keys * sizeof(uint64_t); + size_t j; + + if (bset_written(b, t)) { + stats->sets_written++; + stats->bytes_written += bytes; + + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + if (t->tree[j].exponent == 127) + stats->failed++; + } else { + stats->sets_unwritten++; + stats->bytes_unwritten += bytes; + } + } +} diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h new file mode 100644 index 000000000..d795c8424 --- /dev/null +++ b/drivers/md/bcache/bset.h @@ -0,0 +1,593 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_BSET_H +#define _BCACHE_BSET_H + +#include +#include + +#include "bcache_ondisk.h" +#include "util.h" /* for time_stats */ + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bch_ptr_invalid and + * bch_ptr_bad(). + * + * bch_ptr_invalid() primarily filters out keys and pointers that would be + * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and + * pointer that occur in normal practice but don't point to real data. + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. + * + * We can't filter out stale keys when we're resorting, because garbage + * collection needs to find them to ensure bucket gens don't wrap around - + * unless we're rewriting the btree node those stale keys still exist on disk. + * + * We also implement functions here for removing some number of sectors from the + * front or the back of a bkey - this is mainly used for fixing overlapping + * extents, by removing the overlapping sectors from the older key. + * + * BSETS: + * + * A bset is an array of bkeys laid out contiguously in memory in sorted order, + * along with a header. A btree node is made up of a number of these, written at + * different times. + * + * There could be many of them on disk, but we never allow there to be more than + * 4 in memory - we lazily resort as needed. + * + * We implement code here for creating and maintaining auxiliary search trees + * (described below) for searching an individial bset, and on top of that we + * implement a btree iterator. + * + * BTREE ITERATOR: + * + * Most of the code in bcache doesn't care about an individual bset - it needs + * to search entire btree nodes and iterate over them in sorted order. + * + * The btree iterator code serves both functions; it iterates through the keys + * in a btree node in sorted order, starting from either keys after a specific + * point (if you pass it a search key) or the start of the btree node. + * + * AUXILIARY SEARCH TREES: + * + * Since keys are variable length, we can't use a binary search on a bset - we + * wouldn't be able to find the start of the next key. But binary searches are + * slow anyways, due to terrible cache behaviour; bcache originally used binary + * searches and that code topped out at under 50k lookups/second. + * + * So we need to construct some sort of lookup table. Since we only insert keys + * into the last (unwritten) set, most of the keys within a given btree node are + * usually in sets that are mostly constant. We use two different types of + * lookup tables to take advantage of this. + * + * Both lookup tables share in common that they don't index every key in the + * set; they index one key every BSET_CACHELINE bytes, and then a linear search + * is used for the rest. + * + * For sets that have been written to disk and are no longer being inserted + * into, we construct a binary search tree in an array - traversing a binary + * search tree in an array gives excellent locality of reference and is very + * fast, since both children of any node are adjacent to each other in memory + * (and their grandchildren, and great grandchildren...) - this means + * prefetching can be used to great effect. + * + * It's quite useful performance wise to keep these nodes small - not just + * because they're more likely to be in L2, but also because we can prefetch + * more nodes on a single cacheline and thus prefetch more iterations in advance + * when traversing this tree. + * + * Nodes in the auxiliary search tree must contain both a key to compare against + * (we don't want to fetch the key from the set, that would defeat the purpose), + * and a pointer to the key. We use a few tricks to compress both of these. + * + * To compress the pointer, we take advantage of the fact that one node in the + * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have + * a function (to_inorder()) that takes the index of a node in a binary tree and + * returns what its index would be in an inorder traversal, so we only have to + * store the low bits of the offset. + * + * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To + * compress that, we take advantage of the fact that when we're traversing the + * search tree at every iteration we know that both our search key and the key + * we're looking for lie within some range - bounded by our previous + * comparisons. (We special case the start of a search so that this is true even + * at the root of the tree). + * + * So we know the key we're looking for is between a and b, and a and b don't + * differ higher than bit 50, we don't need to check anything higher than bit + * 50. + * + * We don't usually need the rest of the bits, either; we only need enough bits + * to partition the key range we're currently checking. Consider key n - the + * key our auxiliary search tree node corresponds to, and key p, the key + * immediately preceding n. The lowest bit we need to store in the auxiliary + * search tree is the highest bit that differs between n and p. + * + * Note that this could be bit 0 - we might sometimes need all 80 bits to do the + * comparison. But we'd really like our nodes in the auxiliary search tree to be + * of fixed size. + * + * The solution is to make them fixed size, and when we're constructing a node + * check if p and n differed in the bits we needed them to. If they don't we + * flag that node, and when doing lookups we fallback to comparing against the + * real key. As long as this doesn't happen to often (and it seems to reliably + * happen a bit less than 1% of the time), we win - even on failures, that key + * is then more likely to be in cache than if we were doing binary searches all + * the way, since we're touching so much less memory. + * + * The keys in the auxiliary search tree are stored in (software) floating + * point, with an exponent and a mantissa. The exponent needs to be big enough + * to address all the bits in the original key, but the number of bits in the + * mantissa is somewhat arbitrary; more bits just gets us fewer failures. + * + * We need 7 bits for the exponent and 3 bits for the key's offset (since keys + * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. + * We need one node per 128 bytes in the btree node, which means the auxiliary + * search trees take up 3% as much memory as the btree itself. + * + * Constructing these auxiliary search trees is moderately expensive, and we + * don't want to be constantly rebuilding the search tree for the last set + * whenever we insert another key into it. For the unwritten set, we use a much + * simpler lookup table - it's just a flat array, so index i in the lookup table + * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing + * within each byte range works the same as with the auxiliary search trees. + * + * These are much easier to keep up to date when we insert a key - we do it + * somewhat lazily; when we shift a key up we usually just increment the pointer + * to it, only when it would overflow do we go to the trouble of finding the + * first key in that range of bytes again. + */ + +struct btree_keys; +struct btree_iter; +struct btree_iter_set; +struct bkey_float; + +#define MAX_BSETS 4U + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + unsigned int size; + + /* function of size - precalculated for to_inorder() */ + unsigned int extra; + + /* copy of the last key in the set */ + struct bkey end; + struct bkey_float *tree; + + /* + * The nodes in the bset tree point to specific keys - this + * array holds the sizes of the previous key. + * + * Conceptually it's a member of struct bkey_float, but we want + * to keep bkey_float to 4 bytes and prev isn't used in the fast + * path. + */ + uint8_t *prev; + + /* The actual btree node, with pointers to each sorted set */ + struct bset *data; +}; + +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); + struct bkey *(*sort_fixup)(struct btree_iter *iter, + struct bkey *tmp); + bool (*insert_fixup)(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key); + bool (*key_invalid)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_bad)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_merge)(struct btree_keys *bk, + struct bkey *l, struct bkey *r); + void (*key_to_text)(char *buf, + size_t size, + const struct bkey *k); + void (*key_dump)(struct btree_keys *keys, + const struct bkey *k); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned int last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) +{ + return b->set + b->nsets; +} + +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) +{ + return t <= b->set + b->nsets - b->last_set_unwritten; +} + +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) +{ + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; +} + +static inline unsigned int bset_byte_offset(struct btree_keys *b, + struct bset *i) +{ + return ((size_t) i) - ((size_t) b->set->data); +} + +static inline unsigned int bset_sector_offset(struct btree_keys *b, + struct bset *i) +{ + return bset_byte_offset(b, i) >> 9; +} + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) + +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); +} + +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned int block_bytes) +{ + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); +} + +void bch_btree_keys_free(struct btree_keys *b); +int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, + gfp_t gfp); +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks); + +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic); +void bch_bset_build_written_tree(struct btree_keys *b); +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k); +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r); +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert); +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + +/* Btree key iteration */ + +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree_keys *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); + +struct bkey *bch_btree_iter_next(struct btree_iter *iter); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree_keys *b, + ptr_filter_fn fn); + +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end); +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search); + +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, + const struct bkey *search); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, + const struct bkey *search) +{ + return search ? __bch_bset_search(b, t, search) : t->data->start; +} + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Sorting */ + +struct bset_sort_state { + mempool_t pool; + + unsigned int page_order; + unsigned int crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *state); +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order); +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state); +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state); +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state); +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, + struct bset_sort_state *state); + +static inline void bch_btree_sort(struct btree_keys *b, + struct bset_sort_state *state) +{ + bch_btree_sort_partial(b, 0, state); +} + +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); + +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \ + (unsigned int)(i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) +{ + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); +} + +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned int i); +bool __bch_cut_front(const struct bkey *where, struct bkey *k); +bool __bch_cut_back(const struct bkey *where, struct bkey *k); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +/* + * Pointer '*preceding_key_p' points to a memory object to store preceding + * key of k. If the preceding key does not exist, set '*preceding_key_p' to + * NULL. So the caller of preceding_key() needs to take care of memory + * which '*preceding_key_p' pointed to before calling preceding_key(). + * Currently the only caller of preceding_key() is bch_btree_insert_key(), + * and it points to an on-stack variable, so the memory release is handled + * by stackframe itself. + */ +static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p) +{ + if (KEY_INODE(k) || KEY_OFFSET(k)) { + (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0); + if (!(*preceding_key_p)->low) + (*preceding_key_p)->high--; + (*preceding_key_p)->low--; + } else { + (*preceding_key_p) = NULL; + } +} + +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) +{ + return b->ops->key_to_text(buf, size, k); +} + +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(r)); +} + +/* Keylists */ + +struct keylist { + union { + struct bkey *keys; + uint64_t *keys_p; + }; + union { + struct bkey *top; + uint64_t *top_p; + }; + + /* Enough room for btree_split's keys without realloc */ +#define KEYLIST_INLINE 16 + uint64_t inline_keys[KEYLIST_INLINE]; +}; + +static inline void bch_keylist_init(struct keylist *l) +{ + l->top_p = l->keys_p = l->inline_keys; +} + +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} + +static inline void bch_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +{ + bkey_copy(l->top, k); + bch_keylist_push(l); +} + +static inline bool bch_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline void bch_keylist_reset(struct keylist *l) +{ + l->top = l->keys; +} + +static inline void bch_keylist_free(struct keylist *l) +{ + if (l->keys_p != l->inline_keys) + kfree(l->keys_p); +} + +static inline size_t bch_keylist_nkeys(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch_keylist_bytes(struct keylist *l) +{ + return bch_keylist_nkeys(l) * sizeof(uint64_t); +} + +struct bkey *bch_keylist_pop(struct keylist *l); +void bch_keylist_pop_front(struct keylist *l); +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s); + +/* Debug stuff */ + +#ifdef CONFIG_BCACHE_DEBUG + +int __bch_count_data(struct btree_keys *b); +void __printf(2, 3) __bch_check_keys(struct btree_keys *b, + const char *fmt, + ...); +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); +void bch_dump_bucket(struct btree_keys *b); + +#else + +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __printf(2, 3) + __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); + +#endif + +static inline bool btree_keys_expensive_checks(struct btree_keys *b) +{ +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif +} + +static inline int bch_count_data(struct btree_keys *b) +{ + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; +} + +#define bch_check_keys(b, ...) \ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + +#endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 000000000..6a2f57ae0 --- /dev/null +++ b/drivers/md/bcache/btree.c @@ -0,0 +1,2800 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Todo: + * register_bcache: Return errors out to userspace correctly + * + * Writeback: don't undirty key until after a cache flush + * + * Create an iterator for key pointers + * + * On btree write error, mark bucket such that it won't be freed from the cache + * + * Journalling: + * Check for bad keys in replay + * Propagate barriers + * Refcount journal entries in journal_replay + * + * Garbage collection: + * Finish incremental gc + * Gc should free old UUIDs, data for invalid UUIDs + * + * Provide a way to list backing device UUIDs we have data cached for, and + * probably how long it's been since we've seen them, and a way to invalidate + * dirty data for devices that will never be attached again + * + * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so + * that based on that and how much dirty data we have we can keep writeback + * from being starved + * + * Add a tracepoint or somesuch to watch for writeback starvation + * + * When btree depth > 1 and splitting an interior node, we have to make sure + * alloc_bucket() cannot fail. This should be true but is not completely + * obvious. + * + * Plugging? + * + * If data write is less than hard sector size of ssd, round up offset in open + * bucket to the next whole sector + * + * Superblock needs to be fleshed out for multiple cache devices + * + * Add a sysfs tunable for the number of writeback IOs in flight + * + * Add a sysfs tunable for the number of open data buckets + * + * IO tracking: Can we track when one process is doing io on behalf of another? + * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 +#define MAX_GC_TIMES 100 +#define MIN_GC_NODES 100 +#define GC_SLEEP_MS 100 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +static struct workqueue_struct *btree_io_wq; + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); +} + +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->cache->sb)); + +} + +/* Btree key manipulation */ + +void bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = bset_bkey_last(i); + + crc = crc64_be(crc, data, end - data); + return crc ^ 0xffffffffffffffffULL; +} + +void bch_btree_node_read_done(struct btree *b) +{ + const char *err = "bad btree header"; + struct bset *i = btree_bset_first(b); + struct btree_iter *iter; + + /* + * c->fill_iter can allocate an iterator with more memory space + * than static MAX_BSETS. + * See the comment arount cache_set->fill_iter. + */ + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; + +#ifdef CONFIG_BCACHE_DEBUG + iter->b = &b->keys; +#endif + + if (!i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, block_bytes(b->c->cache)) > + btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(&b->c->cache->sb)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->keys.set[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); + + b->written += set_blocks(i, block_bytes(b->c->cache)); + } + + err = "corrupted btree"; + for (i = write_block(b); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); + i = ((void *) i) + block_bytes(b->c->cache)) + if (i->seq == b->keys.set[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); + + i = b->keys.set[0].data; + err = "short btree key"; + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->cache->sb)); +out: + mempool_free(iter, &b->c->fill_iter); + return; +err: + set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + bset_block_offset(b, i), i->keys); + goto out; +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + closure_put(cl); +} + +static void bch_btree_node_read(struct btree *b) +{ + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + bio->bi_opf = REQ_OP_READ | REQ_META; + + bch_bio_map(bio, b->keys.set[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); + + if (bio->bi_status) + set_btree_node_io_error(b); + + bch_bbio_free(bio, b->c); + + if (btree_node_io_error(b)) + goto err; + + bch_btree_node_read_done(b); + bch_time_stats_update(&b->c->btree_read_time, start_time); + + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %zu", + PTR_BUCKET_NR(b->c, &b->key, 0)); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up_allocators(b->c); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + w->prio_blocked = 0; + w->journal = NULL; +} + +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + +static void __btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + + closure_return_with_destructor(cl, btree_node_write_unlock); +} + +static void btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + bio_free_pages(b->bio); + __btree_node_write_done(cl); +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io); + + if (bio->bi_status) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) +{ + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = cl; + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); + b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; + bch_bio_map(b->bio, i); + + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); + + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + struct bio_vec *bv; + void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, b->bio, iter_all) { + memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + addr += PAGE_SIZE; + } + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_node_write_done, NULL); + } else { + /* + * No problem for multipage bvec since the bio is + * just allocated + */ + b->bio->bi_vcnt = 0; + bch_bio_map(b->bio, i); + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); + } +} + +void __bch_btree_node_write(struct btree *b, struct closure *parent) +{ + struct bset *i = btree_bset_last(b); + + lockdep_assert_held(&b->write_lock); + + trace_bcache_btree_write(b); + + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); + + cancel_delayed_work(&b->work); + + /* If caller isn't waiting for write, parent refcount is cache set */ + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + do_btree_node_write(b); + + atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, + &b->c->cache->btree_sectors_written); + + b->written += set_blocks(i, block_bytes(b->c->cache)); +} + +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned int nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); + + /* + * do verify if there was more than one set initially (i.e. we did a + * sort) and we sorted down to a single set: + */ + if (nsets && !b->keys.nsets) + bch_btree_verify(b); + + bch_btree_init_next(b); +} + +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); +} + +static void btree_node_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); + + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); +} + +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) +{ + struct bset *i = btree_bset_last(b); + struct btree_write *w = btree_current_write(b); + + lockdep_assert_held(&b->write_lock); + + BUG_ON(!b->written); + BUG_ON(!i->keys); + + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + + set_btree_node_dirty(b); + + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ + if (journal_ref) { + if (w->journal && + journal_pin_cmp(b->c, w->journal, journal_ref)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = journal_ref; + atomic_inc(w->journal); + } + } + + /* Force write if set is too big */ + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ + ? c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + BUG_ON(b->io_mutex.count != 1); + + bch_btree_keys_free(&b->keys); + + b->c->btree_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned int btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned int, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); + b->c = c; + sema_init(&b->io_mutex, 1); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, unsigned int min_order, bool flush) +{ + struct closure cl; + + closure_init_stack(&cl); + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); + + if (b->keys.page_order < min_order) + goto out_unlock; + + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; + + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); + } + +retry: + /* + * BTREE_NODE_dirty might be cleared in btree_flush_btree() by + * __bch_btree_node_write(). To avoid an extra flush, acquire + * b->write_lock before checking BTREE_NODE_dirty bit. + */ + mutex_lock(&b->write_lock); + /* + * If this btree node is selected in btree_flush_write() by journal + * code, delay and retry until the node is flushed by journal code + * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). + */ + if (btree_node_journal_flush(b)) { + pr_debug("bnode %p is flushing by journal, retry\n", b); + mutex_unlock(&b->write_lock); + udelay(1); + goto retry; + } + + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); + + /* wait for any in flight btree write */ + down(&b->io_mutex); + up(&b->io_mutex); + + return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; +} + +static unsigned long bch_mca_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + unsigned long freed = 0; + unsigned int btree_cache_used; + + if (c->shrinker_disabled) + return SHRINK_STOP; + + if (c->btree_cache_alloc_lock) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + nr /= c->btree_pages; + if (nr == 0) + nr = 1; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + btree_cache_used = c->btree_cache_used; + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { + if (nr <= 0) + goto out; + + if (!mca_reap(b, 0, false)) { + mca_data_free(b); + rw_unlock(true, b); + freed++; + } + nr--; + i++; + } + + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) + goto out; + + if (!mca_reap(b, 0, false)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + freed++; + } + + nr--; + i++; + } +out: + mutex_unlock(&c->bucket_lock); + return freed * c->btree_pages; +} + +static unsigned long bch_mca_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + + if (c->shrinker_disabled) + return 0; + + if (c->btree_cache_alloc_lock) + return 0; + + return mca_can_free(c) * c->btree_pages; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + /* + * This function is called by cache_set_free(), no I/O + * request on cache now, it is unnecessary to acquire + * b->write_lock before clearing BTREE_NODE_dirty anymore. + */ + if (btree_node_dirty(b)) { + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + } + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned int i; + + for (i = 0; i < mca_reserve(c); i++) + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL|__GFP_COMP, + ilog2(meta_bucket_pages(&c->cache->sb))); + if (!c->verify_ondisk) { + /* + * Don't worry about the mca_rereserve buckets + * allocated in previous for-loop, they will be + * handled properly in bch_cache_set_unregister(). + */ + return -ENOMEM; + } + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->keys.set->data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.count_objects = bch_mca_count; + c->shrink.scan_objects = bch_mca_scan; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) + pr_warn("bcache: %s: could not register shrinker\n", + __func__); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); + return -EINTR; + } + spin_unlock(&c->btree_cannibalize_lock); + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) +{ + struct btree *b; + + trace_bcache_btree_cache_cannibalize(c); + + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; + + WARN(1, "btree cache cannibalize failed\n"); + return ERR_PTR(-ENOMEM); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch_cannibalize_unlock(struct cache_set *c) +{ + spin_lock(&c->btree_cannibalize_lock); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); + } + spin_unlock(&c->btree_cannibalize_lock); +} + +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) +{ + struct btree *b; + + BUG_ON(current->bio_list); + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, btree_order(k), false)) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, 0, false)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->keys.set[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->keys.set->data) + goto err; +out: + BUG_ON(b->io_mutex.count != 1); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; + + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, op, k); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/* + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. + */ +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write, + struct btree *parent) +{ + int i = 0; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, op, k, level); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_node_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + + b->parent = parent; + + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); + } + + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); + + return b; +} + +static void btree_node_prefetch(struct btree *parent, struct bkey *k) +{ + struct btree *b; + + mutex_lock(&parent->c->bucket_lock); + b = mca_alloc(parent->c, NULL, k, parent->level - 1); + mutex_unlock(&parent->c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + b->parent = parent; + bch_btree_node_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b) +{ + trace_bcache_btree_node_free(b); + + BUG_ON(b == b->c->root); + +retry: + mutex_lock(&b->write_lock); + /* + * If the btree node is selected and flushing in btree_flush_write(), + * delay and retry until the BTREE_NODE_journal_flush bit cleared, + * then it is safe to free the btree node here. Otherwise this btree + * node will be in race condition. + */ + if (btree_node_journal_flush(b)) { + mutex_unlock(&b->write_lock); + pr_debug("bnode %p journal_flush set, retry\n", b); + udelay(1); + goto retry; + } + + if (btree_node_dirty(b)) { + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + } + + mutex_unlock(&b->write_lock); + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent) +{ + BKEY_PADDED(key) k; + struct btree *b; + + mutex_lock(&c->bucket_lock); +retry: + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) + goto err; + + bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); + if (IS_ERR(b)) + goto err_free; + + if (!b) { + cache_bug(c, + "Tried to allocate bucket that was in btree cache"); + goto retry; + } + + b->parent = parent; + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); + + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); + return b; +err_free: + bch_bucket_free(c, &k.key); +err: + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(c); + return b; +} + +static struct btree *bch_btree_node_alloc(struct cache_set *c, + struct btree_op *op, int level, + struct btree *parent) +{ + return __bch_btree_node_alloc(c, op, level, op != NULL, parent); +} + +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct btree_op *op) +{ + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); + + if (!IS_ERR(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); + } + + return n; +} + +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned int i; + + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(b->c->cache, + PTR_BUCKET(b->c, &b->key, i))); + + mutex_unlock(&b->c->bucket_lock); +} + +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca = c->cache; + unsigned int reserve = (c->root->level - b->level) * 2 + 1; + + mutex_lock(&c->bucket_lock); + + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } + + mutex_unlock(&c->bucket_lock); + + return mca_cannibalize_lock(b->c, op); +} + +/* Garbage collection */ + +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) +{ + uint8_t stale = 0; + unsigned int i; + struct bucket *g; + + /* + * ptr_invalid() can't return true for the keys that mark btree nodes as + * freed, but since ptr_bad() returns true we'll never actually use them + * for anything and thus we don't want mark their pointers here + */ + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; + + g = PTR_BUCKET(c, k, i); + + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); + + if (ptr_stale(c, k, i)) { + stale = max(stale, ptr_stale(c, k, i)); + continue; + } + + cache_bug_on(GC_MARK(g) && + (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), + c, "inconsistent ptrs: mark = %llu, level = %i", + GC_MARK(g), level); + + if (level) + SET_GC_MARK(g, GC_MARK_METADATA); + else if (KEY_DIRTY(k)) + SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); + + /* guard against overflow */ + SET_GC_SECTORS_USED(g, min_t(unsigned int, + GC_SECTORS_USED(g) + KEY_SIZE(k), + MAX_GC_SECTORS_USED)); + + BUG_ON(!GC_SECTORS_USED(g)); + } + + return stale; +} + +#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) + +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) +{ + uint8_t stale = 0; + unsigned int keys = 0, good_keys = 0; + struct bkey *k; + struct btree_iter iter; + struct bset_tree *t; + + gc->nodes++; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { + stale = max(stale, btree_mark_key(b, k)); + keys++; + + if (bch_ptr_bad(&b->keys, k)) + continue; + + gc->key_bytes += bkey_u64s(k); + gc->nkeys++; + good_keys++; + + gc->data += KEY_SIZE(k); + } + + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) + btree_bug_on(t->size && + bset_written(&b->keys, t) && + bkey_cmp(&b->key, &t->end) < 0, + b, "found short btree key in gc"); + + if (b->c->gc_always_rewrite) + return true; + + if (stale > 10) + return true; + + if ((keys - good_keys) * 2 > keys) + return true; + + return false; +} + +#define GC_MERGE_NODES 4U + +struct gc_merge_info { + struct btree *b; + unsigned int keys; +}; + +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned int i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; + struct closure cl; + struct bkey *k; + + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + + memset(new_nodes, 0, sizeof(new_nodes)); + closure_init_stack(&cl); + + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c->cache)) > blocks * (nodes - 1)) + return 0; + + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); + if (IS_ERR(new_nodes[i])) + goto out_nocoalesce; + } + + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); + struct bkey *k, *last = NULL; + + keys = 0; + + if (i > 1) { + for (k = n2->start; + k < bset_bkey_last(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), + block_bytes(b->c->cache)) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + n2->keys, + block_bytes(b->c->cache)) > + btree_blocks(new_nodes[i])) + goto out_unlock_nocoalesce; + + keys = n2->keys; + /* Take the key of the node we're getting rid of */ + last = &r->b->key; + } + + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > + btree_blocks(new_nodes[i])); + + if (last) + bkey_copy_key(&new_nodes[i]->key, last); + + memcpy(bset_bkey_last(n1), + n2->start, + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); + + n1->keys += keys; + r[i].keys = n1->keys; + + memmove(n2->start, + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); + + n2->keys -= keys; + + if (__bch_keylist_realloc(&keylist, + bkey_u64s(&new_nodes[i]->key))) + goto out_unlock_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(&keylist, &new_nodes[i]->key); + } + + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + + closure_sync(&cl); + + /* We emptied out this node */ + BUG_ON(btree_bset_first(new_nodes[0])->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + new_nodes[0] = NULL; + + for (i = 0; i < nodes; i++) { + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) + goto out_nocoalesce; + + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); + } + + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); + gc->nodes--; + + bch_keylist_free(&keylist); + + /* Invalidated our iterator */ + return -EINTR; + +out_unlock_nocoalesce: + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(&keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + bch_keylist_free(&keylist); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; +} + +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) +{ + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; + + n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; + } + + bch_btree_node_write_sync(n); + + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); + + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); + + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); + + btree_node_free(replace); + rw_unlock(true, n); + + /* Invalidated our iterator */ + return -EINTR; +} + +static unsigned int btree_gc_count_keys(struct btree *b) +{ + struct bkey *k; + struct btree_iter iter; + unsigned int ret = 0; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); + + return ret; +} + +static size_t btree_gc_min_nodes(struct cache_set *c) +{ + size_t min_nodes; + + /* + * Since incremental GC would stop 100ms when front + * side I/O comes, so when there are many btree nodes, + * if GC only processes constant (100) nodes each time, + * GC would last a long time, and the front side I/Os + * would run out of the buckets (since no new bucket + * can be allocated during GC), and be blocked again. + * So GC should not process constant nodes, but varied + * nodes according to the number of btree nodes, which + * realized by dividing GC into constant(100) times, + * so when there are many btree nodes, GC can process + * more nodes each time, otherwise, GC will process less + * nodes each time (but no less than MIN_GC_NODES) + */ + min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; + if (min_nodes < MIN_GC_NODES) + min_nodes = MIN_GC_NODES; + + return min_nodes; +} + + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + int ret = 0; + bool should_rewrite; + struct bkey *k; + struct btree_iter iter; + struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; + + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); + + while (1) { + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true, b); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, gc, r); + if (ret) + break; + } + + if (!last->b) + break; + + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) + break; + } + + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } + + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + mutex_lock(&last->b->write_lock); + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; + + if (atomic_read(&b->c->search_inflight) && + gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + gc->nodes_pre = gc->nodes; + ret = -EAGAIN; + break; + } + + if (need_resched()) { + ret = -EAGAIN; + break; + } + } + + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); + } + + return ret; +} + +static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + struct btree *n = NULL; + int ret = 0; + bool should_rewrite; + + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR(n)) { + bch_btree_node_write_sync(n); + + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); + + return -EINTR; + } + } + + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; + } + + bkey_copy_key(&b->c->gc_done, &b->key); + + return ret; +} + +static void btree_gc_start(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + + if (!c->gc_mark_valid) + return; + + mutex_lock(&c->bucket_lock); + + c->gc_mark_valid = 0; + c->gc_done = ZERO_KEY; + + ca = c->cache; + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + } + } + + mutex_unlock(&c->bucket_lock); +} + +static void bch_btree_gc_finish(struct cache_set *c) +{ + struct bucket *b; + struct cache *ca; + unsigned int i, j; + uint64_t *k; + + mutex_lock(&c->bucket_lock); + + set_gc_sectors(c); + c->gc_mark_valid = 1; + c->need_gc = 0; + + for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), + GC_MARK_METADATA); + + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->devices_max_used; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + + c->avail_nbuckets = 0; + + ca = c->cache; + ca->invalidate_needs_gc = 0; + + for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); + + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + c->avail_nbuckets++; + } + + mutex_unlock(&c->bucket_lock); +} + +static void bch_btree_gc(struct cache_set *c) +{ + int ret; + struct gc_stat stats; + struct closure writes; + struct btree_op op; + uint64_t start_time = local_clock(); + + trace_bcache_gc_start(c); + + memset(&stats, 0, sizeof(struct gc_stat)); + closure_init_stack(&writes); + bch_btree_op_init(&op, SHRT_MAX); + + btree_gc_start(c); + + /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ + do { + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); + cond_resched(); + + if (ret == -EAGAIN) + schedule_timeout_interruptible(msecs_to_jiffies + (GC_SLEEP_MS)); + else if (ret) + pr_warn("gc failed!\n"); + } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + bch_btree_gc_finish(c); + wake_up_allocators(c); + + bch_time_stats_update(&c->btree_gc_time, start_time); + + stats.key_bytes *= sizeof(uint64_t); + stats.data <<= 9; + bch_update_bucket_in_use(c, &stats); + memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); + + trace_bcache_gc_end(c); + + bch_moving_gc(c); +} + +static bool gc_should_run(struct cache_set *c) +{ + struct cache *ca = c->cache; + + if (ca->invalidate_needs_gc) + return true; + + if (atomic_read(&c->sectors_to_gc) < 0) + return true; + + return false; +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags) || + gc_should_run(c)); + + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + break; + + set_gc_sectors(c); + bch_btree_gc(c); + } + + wait_for_kthread_stop(); + return 0; +} + +int bch_gc_thread_start(struct cache_set *c) +{ + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); + return PTR_ERR_OR_ZERO(c->gc_thread); +} + +/* Initial partial gc */ + +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) +{ + int ret = 0; + struct bkey *k, *p = NULL; + struct btree_iter iter; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); + + bch_initial_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + bch_btree_iter_init(&b->keys, &iter, NULL); + + do { + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); + if (k) { + btree_node_prefetch(b, k); + /* + * initiallize c->gc_stats.nodes + * for incremental GC + */ + b->c->gc_stats.nodes++; + } + + if (p) + ret = bcache_btree(check_recurse, p, b, op); + + p = k; + } while (p && !ret); + } + + return ret; +} + + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op)->wait); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + +int bch_btree_check(struct cache_set *c) +{ + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state check_state; + + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + memset(&check_state, 0, sizeof(struct btree_check_state)); + check_state.c = c; + check_state.total_threads = bch_btree_chkthread_nr(); + check_state.key_idx = 0; + spin_lock_init(&check_state.idx_lock); + atomic_set(&check_state.started, 0); + atomic_set(&check_state.enough, 0); + init_waitqueue_head(&check_state.wait); + + rw_lock(0, c->root, c->root->level); + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state.enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state.total_threads; i++) { + /* fetch latest check_state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&check_state.enough)) + break; + + check_state.infos[i].result = 0; + check_state.infos[i].state = &check_state; + + check_state.infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state.infos[i], + "bch_btrchk[%d]", i); + if (IS_ERR(check_state.infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) + kthread_stop(check_state.infos[i].thread); + ret = -ENOMEM; + goto out; + } + atomic_inc(&check_state.started); + } + + /* + * Must wait for all threads to stop. + */ + wait_event(check_state.wait, atomic_read(&check_state.started) == 0); + + for (i = 0; i < check_state.total_threads; i++) { + if (check_state.infos[i].result) { + ret = check_state.infos[i].result; + goto out; + } + } + +out: + rw_unlock(0, c->root); + return ret; +} + +void bch_initial_gc_finish(struct cache_set *c) +{ + struct cache *ca = c->cache; + struct bucket *b; + + bch_btree_gc_finish(c); + + mutex_lock(&c->bucket_lock); + + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. + */ + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) + break; + + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); + } + } + + mutex_unlock(&c->bucket_lock); +} + +/* Btree insertion */ + +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned int status; + + BUG_ON(bkey_cmp(k, &b->key) > 0); + + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); + + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; +} + +static size_t insert_u64s_remaining(struct btree *b) +{ + long ret = bch_btree_keys_u64s_remaining(&b->keys); + + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; + + return max(ret, 0L); +} + +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool ret = false; + int oldsize = bch_count_data(&b->keys); + + while (!bch_keylist_empty(insert_keys)) { + struct bkey *k = insert_keys->keys; + + if (bkey_u64s(k) > insert_u64s_remaining(b)) + break; + + if (bkey_cmp(k, &b->key) <= 0) { + if (!b->level) + bkey_put(b->c, k); + + ret |= btree_insert_key(b, k, replace_key); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->keys); + + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->keys); + + ret |= btree_insert_key(b, &temp.key, replace_key); + break; + } else { + break; + } + } + + if (!ret) + op->insert_collision = true; + + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); + + BUG_ON(bch_count_data(&b->keys) < oldsize); + return ret; +} + +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool split; + struct btree *n1, *n2 = NULL, *n3 = NULL; + uint64_t start_time = local_clock(); + struct closure cl; + struct keylist parent_keys; + + closure_init_stack(&cl); + bch_keylist_init(&parent_keys); + + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } + + n1 = btree_node_alloc_replacement(b, op); + if (IS_ERR(n1)) + goto err; + + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; + + if (split) { + unsigned int keys = 0; + + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); + + n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); + if (IS_ERR(n2)) + goto err_free1; + + if (!b->parent) { + n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); + if (IS_ERR(n3)) + goto err_free2; + } + + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + + /* + * Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); + + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); + + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; + + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); + + bkey_copy_key(&n2->key, &b->key); + + bch_keylist_add(&parent_keys, &n2->key); + bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); + rw_unlock(true, n2); + } else { + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); + + mutex_lock(&n1->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + } + + bch_keylist_add(&parent_keys, &n1->key); + bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); + + if (n3) { + /* Depth increases, make a new root */ + mutex_lock(&n3->write_lock); + bkey_copy_key(&n3->key, &MAX_KEY); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); + bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); + + closure_sync(&cl); + bch_btree_set_root(n3); + rw_unlock(true, n3); + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + closure_sync(&cl); + bch_btree_set_root(n1); + } else { + /* Split a non root node */ + closure_sync(&cl); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); + + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); + } + + btree_node_free(b); + rw_unlock(true, n1); + + bch_time_stats_update(&b->c->btree_split_time, start_time); + + return 0; +err_free2: + bkey_put(b->c, &n2->key); + btree_node_free(n2); + rw_unlock(true, n2); +err_free1: + bkey_put(b->c, &n1->key); + btree_node_free(n1); + rw_unlock(true, n1); +err: + WARN(1, "bcache: btree split failed (level %u)", b->level); + + if (n3 == ERR_PTR(-EAGAIN) || + n2 == ERR_PTR(-EAGAIN) || + n1 == ERR_PTR(-EAGAIN)) + return -EAGAIN; + + return -ENOMEM; +} + +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key) +{ + struct closure cl; + + BUG_ON(b->level && replace_key); + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ + + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { + mutex_unlock(&b->write_lock); + goto split; + } + + BUG_ON(write_block(b) != btree_bset_last(b)); + + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } + + mutex_unlock(&b->write_lock); + + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + return 0; + else if (!ret) + return -EINTR; + return ret; + } +} + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; + + bch_keylist_init(&insert); + + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) { + op->lock = b->level; + goto out; + } + } + + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; +} + +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; + +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) +{ + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); + + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; +} + +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) +{ + struct btree_insert_op op; + int ret = 0; + + BUG_ON(current->bio_list); + BUG_ON(bch_keylist_empty(keys)); + + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; + + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } + + if (ret) { + struct bkey *k; + + pr_err("error %i\n", ret); + + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k); + } else if (op.op.insert_collision) + ret = -ESRCH; + + return ret; +} + +void bch_btree_set_root(struct btree *b) +{ + unsigned int i; + struct closure cl; + + closure_init_stack(&cl); + + trace_bcache_btree_set_root(b); + + BUG_ON(!b->written); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); + mutex_unlock(&b->c->bucket_lock); + + b->c->root = b; + + bch_journal_meta(b->c, &cl); + closure_sync(&cl); +} + +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad))) { + ret = bcache_btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); +} + +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); +} + +/* Keybuf code */ + +static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) +{ + /* Overlapping keys compare equal */ + if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) + return -1; + if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) + return 1; + return 0; +} + +static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, + struct keybuf_key *r) +{ + return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); +} + +struct refill { + struct btree_op op; + unsigned int nr_found; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; + +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; + + if (bkey_cmp(k, refill->end) > 0) { + ret = MAP_DONE; + goto out; + } + + if (!KEY_SIZE(k)) /* end key */ + goto out; + + if (refill->pred(buf, k)) { + struct keybuf_key *w; + + spin_lock(&buf->lock); + + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } + + w->private = NULL; + bkey_copy(&w->key, k); + + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + else + refill->nr_found++; + + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; + + spin_unlock(&buf->lock); + } +out: + buf->last_scanned = *k; + return ret; +} + +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred) +{ + struct bkey start = buf->last_scanned; + struct refill refill; + + cond_resched(); + + bch_btree_op_init(&refill.op, -1); + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); + + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); + + spin_lock(&buf->lock); + + if (!RB_EMPTY_ROOT(&buf->keys)) { + struct keybuf_key *w; + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + buf->start = START_KEY(&w->key); + + w = RB_LAST(&buf->keys, struct keybuf_key, node); + buf->end = w->key; + } else { + buf->start = MAX_KEY; + buf->end = MAX_KEY; + } + + spin_unlock(&buf->lock); +} + +static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + rb_erase(&w->node, &buf->keys); + array_free(&buf->freelist, w); +} + +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + spin_lock(&buf->lock); + __bch_keybuf_del(buf, w); + spin_unlock(&buf->lock); +} + +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end) +{ + bool ret = false; + struct keybuf_key *p, *w, s; + + s.key = *start; + + if (bkey_cmp(end, &buf->start) <= 0 || + bkey_cmp(start, &buf->end) >= 0) + return false; + + spin_lock(&buf->lock); + w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); + + while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { + p = w; + w = RB_NEXT(w, node); + + if (p->private) + ret = true; + else + __bch_keybuf_del(buf, p); + } + + spin_unlock(&buf->lock); + return ret; +} + +struct keybuf_key *bch_keybuf_next(struct keybuf *buf) +{ + struct keybuf_key *w; + + spin_lock(&buf->lock); + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + + while (w && w->private) + w = RB_NEXT(w, node); + + if (w) + w->private = ERR_PTR(-EINTR); + + spin_unlock(&buf->lock); + return w; +} + +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) +{ + struct keybuf_key *ret; + + while (1) { + ret = bch_keybuf_next(buf); + if (ret) + break; + + if (bkey_cmp(&buf->last_scanned, end) >= 0) { + pr_debug("scan finished\n"); + break; + } + + bch_refill_keybuf(c, buf, end, pred); + } + + return ret; +} + +void bch_keybuf_init(struct keybuf *buf) +{ + buf->last_scanned = MAX_KEY; + buf->keys = RB_ROOT; + + spin_lock_init(&buf->lock); + array_allocator_init(&buf->freelist); +} + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); +} + +int __init bch_btree_init(void) +{ + btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + if (!btree_io_wq) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 000000000..a2920bbfc --- /dev/null +++ b/drivers/md/bcache/btree.h @@ -0,0 +1,417 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_BTREE_H +#define _BCACHE_BTREE_H + +/* + * THE BTREE: + * + * At a high level, bcache's btree is relatively standard b+ tree. All keys and + * pointers are in the leaves; interior nodes only have pointers to the child + * nodes. + * + * In the interior nodes, a struct bkey always points to a child btree node, and + * the key is the highest key in the child node - except that the highest key in + * an interior node is always MAX_KEY. The size field refers to the size on disk + * of the child node - this would allow us to have variable sized btree nodes + * (handy for keeping the depth of the btree 1 by expanding just the root). + * + * Btree nodes are themselves log structured, but this is hidden fairly + * thoroughly. Btree nodes on disk will in practice have extents that overlap + * (because they were written at different times), but in memory we never have + * overlapping extents - when we read in a btree node from disk, the first thing + * we do is resort all the sets of keys with a mergesort, and in the same pass + * we check for overlapping extents and adjust them appropriately. + * + * struct btree_op is a central interface to the btree code. It's used for + * specifying read vs. write locking, and the embedded closure is used for + * waiting on IO or reserve memory. + * + * BTREE CACHE: + * + * Btree nodes are cached in memory; traversing the btree might require reading + * in btree nodes which is handled mostly transparently. + * + * bch_btree_node_get() looks up a btree node in the cache and reads it in from + * disk if necessary. This function is almost never called directly though - the + * btree() macro is used to get a btree node, call some function on it, and + * unlock the node after the function returns. + * + * The root is special cased - it's taken out of the cache's lru (thus pinning + * it in memory), so we can find the root of the btree by just dereferencing a + * pointer instead of looking it up in the cache. This makes locking a bit + * tricky, since the root pointer is protected by the lock in the btree node it + * points to - the btree_root() macro handles this. + * + * In various places we must be able to allocate memory for multiple btree nodes + * in order to make forward progress. To do this we use the btree cache itself + * as a reserve; if __get_free_pages() fails, we'll find a node in the btree + * cache we can reuse. We can't allow more than one thread to be doing this at a + * time, so there's a lock, implemented by a pointer to the btree_op closure - + * this allows the btree_root() macro to implicitly release this lock. + * + * BTREE IO: + * + * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles + * this. + * + * For writing, we have two btree_write structs embeddded in struct btree - one + * write in flight, and one being set up, and we toggle between them. + * + * Writing is done with a single function - bch_btree_write() really serves two + * different purposes and should be broken up into two different functions. When + * passing now = false, it merely indicates that the node is now dirty - calling + * it ensures that the dirty keys will be written at some point in the future. + * + * When passing now = true, bch_btree_write() causes a write to happen + * "immediately" (if there was already a write in flight, it'll cause the write + * to happen as soon as the previous write completes). It returns immediately + * though - but it takes a refcount on the closure in struct btree_op you passed + * to it, so a closure_sync() later can be used to wait for the write to + * complete. + * + * This is handy because btree_split() and garbage collection can issue writes + * in parallel, reducing the amount of time they have to hold write locks. + * + * LOCKING: + * + * When traversing the btree, we may need write locks starting at some level - + * inserting a key into the btree will typically only require a write lock on + * the leaf node. + * + * This is specified with the lock field in struct btree_op; lock = 0 means we + * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() + * checks this field and returns the node with the appropriate lock held. + * + * If, after traversing the btree, the insertion code discovers it has to split + * then it must restart from the root and take new locks - to do this it changes + * the lock field and returns -EINTR, which causes the btree_root() macro to + * loop. + * + * Handling cache misses require a different mechanism for upgrading to a write + * lock. We do cache lookups with only a read lock held, but if we get a cache + * miss and we wish to insert this data into the cache, we have to insert a + * placeholder key to detect races - otherwise, we could race with a write and + * overwrite the data that was just written to the cache with stale data from + * the backing device. + * + * For this we use a sequence number that write locks and unlocks increment - to + * insert the check key it unlocks the btree node and then takes a write lock, + * and fails if the sequence number doesn't match. + */ + +#include "bset.h" +#include "debug.h" + +struct btree_write { + atomic_t *journal; + + /* If btree_split() frees a btree node, it writes a new pointer to that + * btree node indicating it was freed; it takes a refcount on + * c->prio_blocked because we can't write the gens until the new + * pointer is on disk. This allows btree_write_endio() to release the + * refcount that btree_split() took. + */ + int prio_blocked; +}; + +struct btree { + /* Hottest entries first */ + struct hlist_node hash; + + /* Key/pointer for this btree node */ + BKEY_PADDED(key); + + unsigned long seq; + struct rw_semaphore lock; + struct cache_set *c; + struct btree *parent; + + struct mutex write_lock; + + unsigned long flags; + uint16_t written; /* would be nice to kill */ + uint8_t level; + + struct btree_keys keys; + + /* For outstanding btree writes, used as a lock - protects write_idx */ + struct closure io; + struct semaphore io_mutex; + + struct list_head list; + struct delayed_work work; + + struct btree_write writes[2]; + struct bio *bio; +}; + + + + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } + +enum btree_flags { + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, + BTREE_NODE_journal_flush, +}; + +BTREE_FLAG(io_error); +BTREE_FLAG(dirty); +BTREE_FLAG(write_idx); +BTREE_FLAG(journal_flush); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return b->keys.set->data; +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset_tree_last(&b->keys)->data; +} + +static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) +{ + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; +} + +static inline void set_gc_sectors(struct cache_set *c) +{ + atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16); +} + +void bkey_put(struct cache_set *c, struct bkey *k); + +/* Looping macros */ + +#define for_each_cached_btree(b, c, iter) \ + for (iter = 0; \ + iter < ARRAY_SIZE((c)->bucket_hash); \ + iter++) \ + hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) + +/* Recursing down the btree */ + +struct btree_op { + /* for waiting on btree reserve in btree_split() */ + wait_queue_entry_t wait; + + /* Btree level at which we start taking write locks */ + short lock; + + unsigned int insert_collision:1; +}; + +struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 12 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + init_wait(&op->wait); + op->lock = write_lock_level; +} + +static inline void rw_lock(bool w, struct btree *b, int level) +{ + w ? down_write_nested(&b->lock, level + 1) + : down_read_nested(&b->lock, level + 1); + if (w) + b->seq++; +} + +static inline void rw_unlock(bool w, struct btree *b) +{ + if (w) + b->seq++; + (w ? up_write : up_read)(&b->lock); +} + +void bch_btree_node_read_done(struct btree *b); +void __bch_btree_node_write(struct btree *b, struct closure *parent); +void bch_btree_node_write(struct btree *b, struct closure *parent); + +void bch_btree_set_root(struct btree *b); +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent); +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write, + struct btree *parent); + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key); +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key); + +int bch_gc_thread_start(struct cache_set *c); +void bch_initial_gc_finish(struct cache_set *c); +void bch_moving_gc(struct cache_set *c); +int bch_btree_check(struct cache_set *c); +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +void bch_cannibalize_unlock(struct cache_set *c); + +static inline void wake_up_gc(struct cache_set *c) +{ + wake_up(&c->gc_wait); +} + +static inline void force_wake_up_gc(struct cache_set *c) +{ + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * calling wake_up_gc() won't start gc thread if sectors_to_gc is + * not a nagetive value. + * Therefore sectors_to_gc is set to -1 here, before waking up + * gc thread by calling wake_up_gc(). Then gc_should_run() will + * give a chance to permit gc thread to run. "Give a chance" means + * before going into gc_should_run(), there is still possibility + * that c->sectors_to_gc being set to other positive value. So + * this routine won't 100% make sure gc thread will be woken up + * to run. + */ + atomic_set(&c->sectors_to_gc, -1); + wake_up_gc(c); +} + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define bcache_btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define bcache_btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + +#define MAP_DONE 0 +#define MAP_CONTINUE 1 + +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 + +typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b); +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, + struct bkey *k); +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); + +typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); + +void bch_keybuf_init(struct keybuf *buf); +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred); +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end); +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w); +struct keybuf_key *bch_keybuf_next(struct keybuf *buf); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred); +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); +#endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c new file mode 100644 index 000000000..d8d9394a6 --- /dev/null +++ b/drivers/md/bcache/closure.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include +#include +#include +#include + +#include "closure.h" + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + int r = flags & CLOSURE_REMAINING_MASK; + + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(cl); + + if (parent) + closure_put(parent); + } + } +} + +/* For clearing flags with the same atomic op as a put */ +void closure_sub(struct closure *cl, int v) +{ + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +} + +/* + * closure_put - decrement a closure's refcount + */ +void closure_put(struct closure *cl) +{ + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +} + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + closure_sub(cl, CLOSURE_WAITING + 1); + } +} + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. + * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &waitlist->list); + + return true; +} + +struct closure_syncer { + struct task_struct *task; + int done; +}; + +static void closure_sync_fn(struct closure *cl) +{ + struct closure_syncer *s = cl->s; + struct task_struct *p; + + rcu_read_lock(); + p = READ_ONCE(s->task); + s->done = 1; + wake_up_process(p); + rcu_read_unlock(); +} + +void __sched __closure_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + continue_at(cl, closure_sync_fn, NULL); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +static LIST_HEAD(closure_list); +static DEFINE_SPINLOCK(closure_list_lock); + +void closure_debug_create(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_ALIVE; + + spin_lock_irqsave(&closure_list_lock, flags); + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); +} + +void closure_debug_destroy(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_DEAD; + + spin_lock_irqsave(&closure_list_lock, flags); + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); +} + +static struct dentry *closure_debug; + +static int debug_show(struct seq_file *f, void *data) +{ + struct closure *cl; + + spin_lock_irq(&closure_list_lock); + + list_for_each_entry(cl, &closure_list, all) { + int r = atomic_read(&cl->remaining); + + seq_printf(f, "%p: %pS -> %pS p %p r %i ", + cl, (void *) cl->ip, cl->fn, cl->parent, + r & CLOSURE_REMAINING_MASK); + + seq_printf(f, "%s%s\n", + test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&cl->work)) ? "Q" : "", + r & CLOSURE_RUNNING ? "R" : ""); + + if (r & CLOSURE_WAITING) + seq_printf(f, " W %pS\n", + (void *) cl->waiting_on); + + seq_printf(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debug); + +void __init closure_debug_init(void) +{ + if (!IS_ERR_OR_NULL(bcache_debug)) + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ + closure_debug = debugfs_create_file( + "closures", 0400, bcache_debug, NULL, &debug_fops); +} +#endif + +MODULE_AUTHOR("Kent Overstreet "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h new file mode 100644 index 000000000..c88cdc4ae --- /dev/null +++ b/drivers/md/bcache/closure.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include +#include +#include +#include + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct closure *); +extern struct dentry *bcache_debug; + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 26), + CLOSURE_DESTRUCTOR = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct closure_syncer *s; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned int magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void __closure_sync(struct closure *cl); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + __closure_sync(cl); +} + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +void closure_debug_init(void); +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_init(void) {} +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic(); +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + memset(cl, 0, sizeof(struct closure)); + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + +/** + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + /* Memory barrier for the wait list */ + smp_mb(); + __closure_wake_up(list); +} + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. + */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(_cl); \ +} while (0) + +/** + * closure_return_with_destructor - finish execution of a closure, + * with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. + */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 000000000..7510d1c98 --- /dev/null +++ b/drivers/md/bcache/debug.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcache debug code + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include +#include +#include +#include +#include + +struct dentry *bcache_debug; + +#ifdef CONFIG_BCACHE_DEBUG + +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) * \ + block_bytes(b->c->cache)) + +void bch_btree_verify(struct btree *b) +{ + struct btree *v = b->c->verify_data; + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; + + if (!b->c->verify || !b->c->verify_ondisk) + return; + + down(&b->io_mutex); + mutex_lock(&b->c->verify_lock); + + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + v->keys.ops = b->keys.ops; + + bio = bch_bbio_alloc(b->c); + bio_set_dev(bio, b->c->cache->bdev); + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bio->bi_opf = REQ_OP_READ | REQ_META; + bch_bio_map(bio, sorted); + + submit_bio_wait(bio); + bch_bbio_free(bio, b->c); + + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->keys.set->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) bset_bkey_last(inmemory) - + (void *) inmemory->start)) { + struct bset *i; + unsigned int j; + + console_lock(); + + pr_err("*** in memory:\n"); + bch_dump_bset(&b->keys, inmemory, 0); + + pr_err("*** read back in:\n"); + bch_dump_bset(&v->keys, sorted, 0); + + for_each_written_bset(b, ondisk, i) { + unsigned int block = ((void *) i - (void *) ondisk) / + block_bytes(b->c->cache); + + pr_err("*** on disk block %u:\n", block); + bch_dump_bset(&b->keys, i, block); + } + + pr_err("*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c->cache)); + + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) + break; + + pr_err("b->written %u\n", b->written); + + console_unlock(); + panic("verify failed at %u\n", j); + } + + mutex_unlock(&b->c->verify_lock); + up(&b->io_mutex); +} + +void bch_data_verify(struct cached_dev *dc, struct bio *bio) +{ + unsigned int nr_segs = bio_segments(bio); + struct bio *check; + struct bio_vec bv, cbv; + struct bvec_iter iter, citer = { 0 }; + + check = bio_kmalloc(nr_segs, GFP_NOIO); + if (!check) + return; + bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, + REQ_OP_READ); + check->bi_iter.bi_sector = bio->bi_iter.bi_sector; + check->bi_iter.bi_size = bio->bi_iter.bi_size; + + bch_bio_map(check, NULL); + if (bch_bio_alloc_pages(check, GFP_NOIO)) + goto out_put; + + submit_bio_wait(check); + + citer.bi_size = UINT_MAX; + bio_for_each_segment(bv, bio, iter) { + void *p1 = bvec_kmap_local(&bv); + void *p2; + + cbv = bio_iter_iovec(check, citer); + p2 = bvec_kmap_local(&cbv); + + cache_set_err_on(memcmp(p1, p2, bv.bv_len), + dc->disk.c, + "verify failed at dev %pg sector %llu", + dc->bdev, + (uint64_t) bio->bi_iter.bi_sector); + + kunmap_local(p2); + kunmap_local(p1); + bio_advance_iter(check, &citer, bv.bv_len); + } + + bio_free_pages(check); +out_put: + bio_uninit(check); + kfree(check); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: cache set refcounting */ + +struct dump_iterator { + char buf[PAGE_SIZE]; + size_t bytes; + struct cache_set *c; + struct keybuf keys; +}; + +static bool dump_pred(struct keybuf *buf, struct bkey *k) +{ + return true; +} + +static ssize_t bch_dump_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iterator *i = file->private_data; + ssize_t ret = 0; + char kbuf[80]; + + while (size) { + struct keybuf_key *w; + unsigned int bytes = min(i->bytes, size); + + if (copy_to_user(buf, i->buf, bytes)) + return -EFAULT; + + ret += bytes; + buf += bytes; + size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + + if (i->bytes) + break; + + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); + if (!w) + break; + + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); + bch_keybuf_del(&i->keys, w); + } + + return ret; +} + +static int bch_dump_open(struct inode *inode, struct file *file) +{ + struct cache_set *c = inode->i_private; + struct dump_iterator *i; + + i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->c = c; + bch_keybuf_init(&i->keys); + i->keys.last_scanned = KEY(0, 0, 0); + + return 0; +} + +static int bch_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations cache_set_debug_ops = { + .owner = THIS_MODULE, + .open = bch_dump_open, + .read = bch_dump_read, + .release = bch_dump_release +}; + +void bch_debug_init_cache_set(struct cache_set *c) +{ + if (!IS_ERR_OR_NULL(bcache_debug)) { + char name[50]; + + snprintf(name, 50, "bcache-%pU", c->set_uuid); + c->debug = debugfs_create_file(name, 0400, bcache_debug, c, + &cache_set_debug_ops); + } +} + +#endif + +void bch_debug_exit(void) +{ + debugfs_remove_recursive(bcache_debug); +} + +void __init bch_debug_init(void) +{ + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ + bcache_debug = debugfs_create_dir("bcache", NULL); +} diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 000000000..fb3d4dff4 --- /dev/null +++ b/drivers/md/bcache/debug.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_DEBUG_H +#define _BCACHE_DEBUG_H + +struct bio; +struct cached_dev; +struct cache_set; + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *b); +void bch_data_verify(struct cached_dev *dc, struct bio *bio); + +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) + +#else /* DEBUG */ + +static inline void bch_btree_verify(struct btree *b) {} +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} + +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 + +#endif + +#ifdef CONFIG_DEBUG_FS +void bch_debug_init_cache_set(struct cache_set *c); +#else +static inline void bch_debug_init_cache_set(struct cache_set *c) {} +#endif + +#endif diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 000000000..d626ffcbe --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = c->cache; + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = c->cache; + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned int i = 0; + char *out = buf, *end = buf + size; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned int j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + pr_cont(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + + pr_cont(" bucket %zu", n); + if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets) + pr_cont(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + pr_cont(" %s\n", bch_ptr_status(b->c, k)); +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned int i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned int i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static void bch_subtract_dirty(struct bkey *k, + struct cache_set *c, + uint64_t offset, + int sectors) +{ + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); +} + +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + uint64_t old_offset; + unsigned int old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned int i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (!bch_bkey_equal_header(k, replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + bch_subtract_dirty(k, c, KEY_START(insert), + KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + + return __bch_extent_invalid(b->c, k); +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned int ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + (!GC_MARK(g) || + GC_MARK(g) == GC_MARK_METADATA || + (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned int i, stale; + char buf[80]; + + if (!KEY_PTRS(k) || + bch_extent_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) { + stale = ptr_stale(b->c, k, i); + + if (stale && KEY_DIRTY(k)) { + bch_extent_to_text(buf, sizeof(buf), k); + pr_info("stale dirty pointer, stale %u, key: %s\n", + stale, buf); + } + + btree_bug_on(stale > BUCKET_GC_GEN_MAX, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree_keys *bk, + struct bkey *l, + struct bkey *r) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned int i; + + if (key_merging_disabled(b->c)) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 000000000..4d667e05b --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k); +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k); +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000..634922c56 --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li + * + */ +#include "bcache_ondisk.h" +#include "bcache.h" +#include "features.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, + "large_bucket"}, + {0, 0, NULL }, +}; + +#define compose_feature_string(type) \ +({ \ + struct feature *f; \ + bool first = true; \ + \ + for (f = &feature_list[0]; f->compat != 0; f++) { \ + if (f->compat != BCH_FEATURE_ ## type) \ + continue; \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) { \ + if (first) { \ + out += snprintf(out, buf + size - out, \ + "["); \ + } else { \ + out += snprintf(out, buf + size - out, \ + " ["); \ + } \ + } else if (!first) { \ + out += snprintf(out, buf + size - out, " "); \ + } \ + \ + out += snprintf(out, buf + size - out, "%s", f->string);\ + \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) \ + out += snprintf(out, buf + size - out, "]"); \ + \ + first = false; \ + } \ + if (!first) \ + out += snprintf(out, buf + size - out, "\n"); \ +}) + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(RO_COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(INCOMPAT); + return out - buf; +} diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000..09161b89c --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include +#include + +#include "bcache_ondisk.h" + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +/* Feature set definition */ +/* Incompat feature set */ +/* 32bit bucket size, obsoleted */ +#define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 +/* real bucket size is (1 << bucket_size) */ +#define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 + +#define BCH_FEATURE_COMPAT_SUPP 0 +#define BCH_FEATURE_RO_COMPAT_SUPP 0 +#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); + +static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_compat & ~BCH_FEATURE_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_ro_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_ro_compat & ~BCH_FEATURE_RO_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_incompat_features(struct cache_sb *sb) +{ + return ((sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0); +} + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size); + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 000000000..020712c52 --- /dev/null +++ b/drivers/md/bcache/io.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "bset.h" +#include "debug.h" + +#include + +/* Bios with headers */ + +void bch_bbio_free(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + mempool_free(b, &c->bio_meta); +} + +struct bio *bch_bbio_alloc(struct cache_set *c) +{ + struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); + struct bio *bio = &b->bio; + + bio_init(bio, NULL, bio->bi_inline_vecs, + meta_bucket_pages(&c->cache->sb), 0); + + return bio; +} + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio_set_dev(bio, c->cache->bdev); + + b->submit_time_us = local_clock_us(); + closure_bio_submit(c, bio, bio->bi_private); +} + +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned int ptr) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + bch_bkey_copy_single_ptr(&b->key, k, ptr); + __bch_submit_bbio(bio, c); +} + +/* IO errors */ +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) +{ + unsigned int errors; + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + + /* + * Read-ahead requests on a degrading and recovering md raid + * (e.g. raid6) device might be failured immediately by md + * raid code, which is not a real hardware media failure. So + * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. + */ + if (bio->bi_opf & REQ_RAHEAD) { + pr_warn_ratelimited("%pg: Read-ahead I/O failed on backing device, ignore\n", + dc->bdev); + return; + } + + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%pg: IO error on backing device, unrecoverable\n", + dc->bdev); + else + bch_cached_dev_error(dc); +} + +void bch_count_io_errors(struct cache *ca, + blk_status_t error, + int is_read, + const char *m) +{ + /* + * The halflife of an error is: + * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh + */ + + if (ca->set->error_decay) { + unsigned int count = atomic_inc_return(&ca->io_count); + + while (count > ca->set->error_decay) { + unsigned int errors; + unsigned int old = count; + unsigned int new = count - ca->set->error_decay; + + /* + * First we subtract refresh from count; each time we + * successfully do so, we rescale the errors once: + */ + + count = atomic_cmpxchg(&ca->io_count, old, new); + + if (count == old) { + count = new; + + errors = atomic_read(&ca->io_errors); + do { + old = errors; + new = ((uint64_t) errors * 127) / 128; + errors = atomic_cmpxchg(&ca->io_errors, + old, new); + } while (old != errors); + } + } + } + + if (error) { + unsigned int errors = atomic_add_return(1 << IO_ERROR_SHIFT, + &ca->io_errors); + errors >>= IO_ERROR_SHIFT; + + if (errors < ca->set->error_limit) + pr_err("%pg: IO error on %s%s\n", + ca->bdev, m, + is_read ? ", recovering." : "."); + else + bch_cache_set_error(ca->set, + "%pg: too many IO errors %s\n", + ca->bdev, m); + } +} + +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct cache *ca = c->cache; + int is_read = (bio_data_dir(bio) == READ ? 1 : 0); + + unsigned int threshold = op_is_write(bio_op(bio)) + ? c->congested_write_threshold_us + : c->congested_read_threshold_us; + + if (threshold) { + unsigned int t = local_clock_us(); + int us = t - b->submit_time_us; + int congested = atomic_read(&c->congested); + + if (us > (int) threshold) { + int ms = us / 1024; + + c->congested_last_us = t; + + ms = min(ms, CONGESTED_MAX + congested); + atomic_sub(ms, &c->congested); + } else if (congested < 0) + atomic_inc(&c->congested); + } + + bch_count_io_errors(ca, error, is_read, m); +} + +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m) +{ + struct closure *cl = bio->bi_private; + + bch_bbio_count_io_errors(c, bio, error, m); + bio_put(bio); + closure_put(cl); +} diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000..c182c21de --- /dev/null +++ b/drivers/md/bcache/journal.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include + +/* + * Journal replay/recovery: + * + * This code is all driven from run_cache_set(); we first read the journal + * entries, do some other stuff, then we mark all the keys in the journal + * entries (same as garbage collection would), then we replay them - reinserting + * them into the cache in precisely the same order as they appear in the + * journal. + * + * We only journal keys that go in leaf nodes, which simplifies things quite a + * bit. + */ + +static void journal_read_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + closure_put(cl); +} + +static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio; + + struct journal_replay *i; + struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; + sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + + pr_debug("reading %u\n", bucket_index); + + while (offset < ca->sb.bucket_size) { +reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + + bio_reset(bio, ca->bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = bucket + offset; + bio->bi_iter.bi_size = len << 9; + + bio->bi_end_io = journal_read_endio; + bio->bi_private = &cl; + bch_bio_map(bio, data); + + closure_bio_submit(ca->set, bio, &cl); + closure_sync(&cl); + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ + + j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); + + if (j->magic != jset_magic(&ca->sb)) { + pr_debug("%u: bad magic\n", bucket_index); + return ret; + } + + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u\n", + bucket_index, bytes, offset); + return ret; + } + + if (bytes > len << 9) + goto reread; + + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u\n", + bucket_index, bytes, offset); + return ret; + } + + blocks = set_blocks(j, block_bytes(ca)); + + /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a candidate jset for + * further following checks. + */ + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); + if (i->j.seq >= j->last_seq) + break; + list_del(&i->list); + kfree(i); + } + + /* iterate list in reverse order (from latest jset) */ + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + + /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ + if (j->seq < i->j.last_seq) + goto next_set; + + /* + * 'where' points to first jset in list which + * is elder then j. + */ + if (j->seq > i->j.seq) { + where = &i->list; + goto add; + } + } + + where = list; +add: + i = kmalloc(offsetof(struct journal_replay, j) + + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + unsafe_memcpy(&i->j, j, bytes, + /* "bytes" was calculated by set_bytes() above */); + /* Add to the location after 'where' points to */ + list_add(&i->list, where); + ret = 1; + + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; +next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; + j = ((void *) j) + blocks * block_bytes(ca); + } + } + + return ret; +} + +int bch_journal_read(struct cache_set *c, struct list_head *list) +{ +#define read_bucket(b) \ + ({ \ + ret = journal_read_bucket(ca, list, b); \ + __set_bit(b, bitmap); \ + if (ret < 0) \ + return ret; \ + ret; \ + }) + + struct cache *ca = c->cache; + int ret = 0; + struct journal_device *ja = &ca->journal; + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); + unsigned int i, l, r, m; + uint64_t seq; + + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { + /* + * We must try the index l with ZERO first for + * correctness due to the scenario that the journal + * bucket is circular buffer which might have wrapped + */ + l = (i * 2654435769U) % ca->sb.njournal_buckets; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search\n"); + + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) + if (read_bucket(l)) + goto bsearch; + + /* no journal entries on this device? */ + if (l == ca->sb.njournal_buckets) + goto out; +bsearch: + BUG_ON(list_empty(list)); + + /* Binary search */ + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u\n", l, r); + + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + + m = (l + r) >> 1; + read_bucket(m); + + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } + + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u\n", + m, ca->sb.njournal_buckets); + l = m; + + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; + + if (l == m) + break; + + if (test_bit(l, bitmap)) + continue; + + if (!read_bucket(l)) + break; + } + + seq = 0; + + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; + + } + +out: + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + + return 0; +#undef read_bucket +} + +void bch_journal_mark(struct cache_set *c, struct list_head *list) +{ + atomic_t p = { 0 }; + struct bkey *k; + struct journal_replay *i; + struct journal *j = &c->journal; + uint64_t last = j->seq; + + /* + * journal.pin should never fill up - we never write a journal + * entry when it would fill up. But if for some reason it does, we + * iterate over the list in reverse order so that we can just skip that + * refcount instead of bugging. + */ + + list_for_each_entry_reverse(i, list, list) { + BUG_ON(last < i->j.seq); + i->pin = NULL; + + while (last-- != i->j.seq) + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + atomic_set(&fifo_front(&j->pin), 0); + } + + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + i->pin = &fifo_front(&j->pin); + atomic_set(i->pin, 1); + } + + for (k = i->j.start; + k < bset_bkey_last(&i->j); + k = bkey_next(k)) + if (!__bch_extent_invalid(c, k)) { + unsigned int j; + + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); + + bch_initial_mark_key(c, 0, k); + } + } +} + +static bool is_discard_enabled(struct cache_set *s) +{ + struct cache *ca = s->cache; + + if (ca->discard) + return true; + + return false; +} + +int bch_journal_replay(struct cache_set *s, struct list_head *list) +{ + int ret = 0, keys = 0, entries = 0; + struct bkey *k; + struct journal_replay *i = + list_entry(list->prev, struct journal_replay, list); + + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + + if (n != i->j.seq) { + if (n == start && is_discard_enabled(s)) + pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + else { + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; + } + } + + for (k = i->j.start; + k < bset_bkey_last(&i->j); + k = bkey_next(k)) { + trace_bcache_journal_replay_key(k); + + bch_keylist_init_single(&keylist, k); + + ret = bch_btree_insert(s, &keylist, i->pin, NULL); + if (ret) + goto err; + + BUG_ON(!bch_keylist_empty(&keylist)); + keys++; + + cond_resched(); + } + + if (i->pin) + atomic_dec(i->pin); + n = i->j.seq + 1; + entries++; + } + + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", + keys, entries, end); +err: + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kfree(i); + } + + return ret; +} + +void bch_journal_space_reserve(struct journal *j) +{ + j->do_reserve = true; +} + +/* Journalling */ + +static void btree_flush_write(struct cache_set *c) +{ + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, nr; + int ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; + + if (c->journal.btree_flushing) + return; + + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; + } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock); + + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + spin_unlock(&c->journal.lock); + goto out; + } + spin_unlock(&c->journal.lock); + + mask = c->journal.pin.mask; + nr = 0; + atomic_long_inc(&c->flush_write); + memset(btree_nodes, 0, sizeof(btree_nodes)); + + mutex_lock(&c->bucket_lock); + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exactly accurate value, just check whether the + * front pointer of c->journal.pin is changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and record in btree_nodes[] already. + */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!\n"); + + mutex_lock(&b->write_lock); + + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + continue; + } + + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the following for-loop. + */ + if (((btree_current_write(b)->journal - fifo_front_p) & + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } + + set_btree_node_journal_flush(b); + + mutex_unlock(&b->write_lock); + + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock too long time, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most. If there are more btree nodes reference + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. + */ + if (nr == BTREE_FLUSH_NR) + break; + } + mutex_unlock(&c->bucket_lock); + + for (i = 0; i < nr; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL\n", i); + continue; + } + + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); + continue; + } + + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: written by others\n", b); + continue; + } + + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: dirty bit cleaned by others\n", b); + continue; + } + + __bch_btree_node_write(b, NULL); + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + } + +out: + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); +} + +#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) + +static void journal_discard_endio(struct bio *bio) +{ + struct journal_device *ja = + container_of(bio, struct journal_device, discard_bio); + struct cache *ca = container_of(ja, struct cache, journal); + + atomic_set(&ja->discard_in_flight, DISCARD_DONE); + + closure_wake_up(&ca->set->journal.wait); + closure_put(&ca->set->cl); +} + +static void journal_discard_work(struct work_struct *work) +{ + struct journal_device *ja = + container_of(work, struct journal_device, discard_work); + + submit_bio(&ja->discard_bio); +} + +static void do_journal_discard(struct cache *ca) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->discard_bio; + + if (!ca->discard) { + ja->discard_idx = ja->last_idx; + return; + } + + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; + + case DISCARD_DONE: + ja->discard_idx = (ja->discard_idx + 1) % + ca->sb.njournal_buckets; + + atomic_set(&ja->discard_in_flight, DISCARD_READY); + fallthrough; + + case DISCARD_READY: + if (ja->discard_idx == ja->last_idx) + return; + + atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); + + bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio->bi_iter.bi_sector = bucket_to_sector(ca->set, + ca->sb.d[ja->discard_idx]); + bio->bi_iter.bi_size = bucket_bytes(ca); + bio->bi_end_io = journal_discard_endio; + + closure_get(&ca->set->cl); + INIT_WORK(&ja->discard_work, journal_discard_work); + queue_work(bch_journal_wq, &ja->discard_work); + } +} + +static unsigned int free_journal_buckets(struct cache_set *c) +{ + struct journal *j = &c->journal; + struct cache *ca = c->cache; + struct journal_device *ja = &c->cache->journal; + unsigned int n; + + /* In case njournal_buckets is not power of 2 */ + if (ja->cur_idx >= ja->discard_idx) + n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + else + n = ja->discard_idx - ja->cur_idx; + + if (n > (1 + j->do_reserve)) + return n - (1 + j->do_reserve); + + return 0; +} + +static void journal_reclaim(struct cache_set *c) +{ + struct bkey *k = &c->journal.key; + struct cache *ca = c->cache; + uint64_t last_seq; + struct journal_device *ja = &ca->journal; + atomic_t p __maybe_unused; + + atomic_long_inc(&c->reclaim); + + while (!atomic_read(&fifo_front(&c->journal.pin))) + fifo_pop(&c->journal.pin, p); + + last_seq = last_seq(&c->journal); + + /* Update last_idx */ + + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; + + do_journal_discard(ca); + + if (c->journal.blocks_free) + goto out; + + if (!free_journal_buckets(c)) + goto out; + + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); + SET_KEY_PTRS(k, 1); + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; + +out: + if (!journal_full(&c->journal)) + __closure_wake_up(&c->journal.wait); +} + +void bch_journal_next(struct journal *j) +{ + atomic_t p = { 1 }; + + j->cur = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for last_seq() to be calculated correctly + */ + BUG_ON(!fifo_push(&j->pin, p)); + atomic_set(&fifo_back(&j->pin), 1); + + j->cur->data->seq = ++j->seq; + j->cur->dirty = false; + j->cur->need_write = false; + j->cur->data->keys = 0; + + if (fifo_full(&j->pin)) + pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); +} + +static void journal_write_endio(struct bio *bio) +{ + struct journal_write *w = bio->bi_private; + + cache_set_err_on(bio->bi_status, w->c, "journal io error"); + closure_put(&w->c->journal.io); +} + +static void journal_write(struct closure *cl); + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct journal_write *w = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + __closure_wake_up(&w->wait); + continue_at_nobarrier(cl, journal_write, bch_journal_wq); +} + +static void journal_write_unlock(struct closure *cl) + __releases(&c->journal.lock) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + c->journal.io_in_flight = 0; + spin_unlock(&c->journal.lock); +} + +static void journal_write_unlocked(struct closure *cl) + __releases(c->journal.lock) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + struct cache *ca = c->cache; + struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * + ca->sb.block_size; + + struct bio *bio; + struct bio_list list; + + bio_list_init(&list); + + if (!w->need_write) { + closure_return_with_destructor(cl, journal_write_unlock); + return; + } else if (journal_full(&c->journal)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, journal_write, bch_journal_wq); + return; + } + + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); + + w->data->btree_level = c->root->level; + + bkey_copy(&w->data->btree_root, &c->root->key); + bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); + + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + w->data->magic = jset_magic(&ca->sb); + w->data->version = BCACHE_JSET_VERSION; + w->data->last_seq = last_seq(&c->journal); + w->data->csum = csum_set(w->data); + + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; + + atomic_long_add(sectors, &ca->meta_sectors_written); + + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); + bio->bi_iter.bi_size = sectors << 9; + + bio->bi_end_io = journal_write_endio; + bio->bi_private = w; + bch_bio_map(bio, w->data); + + trace_bcache_journal_write(bio, w->data->keys); + bio_list_add(&list, bio); + + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } + + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + + atomic_dec_bug(&fifo_back(&c->journal.pin)); + bch_journal_next(&c->journal); + journal_reclaim(c); + + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); + + continue_at(cl, journal_write_done, NULL); +} + +static void journal_write(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + spin_lock(&c->journal.lock); + journal_write_unlocked(cl); +} + +static void journal_try_write(struct cache_set *c) + __releases(c->journal.lock) +{ + struct closure *cl = &c->journal.io; + struct journal_write *w = c->journal.cur; + + w->need_write = true; + + if (!c->journal.io_in_flight) { + c->journal.io_in_flight = 1; + closure_call(cl, journal_write_unlocked, NULL, &c->cl); + } else { + spin_unlock(&c->journal.lock); + } +} + +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned int nkeys) + __acquires(&c->journal.lock) +{ + size_t sectors; + struct closure cl; + bool wait = false; + struct cache *ca = c->cache; + + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); + + while (1) { + struct journal_write *w = c->journal.cur; + + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(ca)) * ca->sb.block_size; + + if (sectors <= min_t(size_t, + c->journal.blocks_free * ca->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + + if (!journal_full(&c->journal)) { + if (wait) + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. + */ + BUG_ON(!w->data->keys); + + journal_try_write(c); /* unlocks */ + } else { + if (wait) + trace_bcache_journal_full(c); + + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + } + + closure_sync(&cl); + spin_lock(&c->journal.lock); + wait = true; + } +} + +static void journal_write_work(struct work_struct *work) +{ + struct cache_set *c = container_of(to_delayed_work(work), + struct cache_set, + journal.work); + spin_lock(&c->journal.lock); + if (c->journal.cur->dirty) + journal_try_write(c); + else + spin_unlock(&c->journal.lock); +} + +/* + * Entry point to the journalling code - bio_insert() and btree_invalidate() + * pass bch_journal() a list of keys to be journalled, and then + * bch_journal() hands those same keys off to btree_insert_async() + */ + +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) +{ + struct journal_write *w; + atomic_t *ret; + + /* No journaling if CACHE_SET_IO_DISABLE set already */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return NULL; + + if (!CACHE_SYNC(&c->cache->sb)) + return NULL; + + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); + + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); + + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); + + if (parent) { + closure_wait(&w->wait, parent); + journal_try_write(c); + } else if (!w->dirty) { + w->dirty = true; + queue_delayed_work(bch_flush_wq, &c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { + spin_unlock(&c->journal.lock); + } + + + return ret; +} + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; + + bch_keylist_init(&keys); + + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); +} + +void bch_journal_free(struct cache_set *c) +{ + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +} + +int bch_journal_alloc(struct cache_set *c) +{ + struct journal *j = &c->journal; + + spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; + + j->w[0].c = c; + j->w[1].c = c; + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 000000000..cd316b4a1 --- /dev/null +++ b/drivers/md/bcache/journal.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_JOURNAL_H +#define _BCACHE_JOURNAL_H + +/* + * THE JOURNAL: + * + * The journal is treated as a circular buffer of buckets - a journal entry + * never spans two buckets. This means (not implemented yet) we can resize the + * journal at runtime, and will be needed for bcache on raw flash support. + * + * Journal entries contain a list of keys, ordered by the time they were + * inserted; thus journal replay just has to reinsert the keys. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. + * + * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be + * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions + * from cache misses, which don't have to be journaled, and for writeback and + * moving gc we work around it by flushing the btree to disk before updating the + * gc information. But it is a potential issue with incremental garbage + * collection, and it's fragile. + * + * OPEN JOURNAL ENTRIES: + * + * Each journal entry contains, in the header, the sequence number of the last + * journal entry still open - i.e. that has keys that haven't been flushed to + * disk in the btree. + * + * We track this by maintaining a refcount for every open journal entry, in a + * fifo; each entry in the fifo corresponds to a particular journal + * entry/sequence number. When the refcount at the tail of the fifo goes to + * zero, we pop it off - thus, the size of the fifo tells us the number of open + * journal entries + * + * We take a refcount on a journal entry when we add some keys to a journal + * entry that we're going to insert (held by struct btree_op), and then when we + * insert those keys into the btree the btree write we're setting up takes a + * copy of that refcount (held by struct btree_write). That refcount is dropped + * when the btree write completes. + * + * A struct btree_write can only hold a refcount on a single journal entry, but + * might contain keys for many journal entries - we handle this by making sure + * it always has a refcount on the _oldest_ journal entry of all the journal + * entries it has keys for. + * + * JOURNAL RECLAIM: + * + * As mentioned previously, our fifo of refcounts tells us the number of open + * journal entries; from that and the current journal sequence number we compute + * last_seq - the oldest journal entry we still need. We write last_seq in each + * journal entry, and we also have to keep track of where it exists on disk so + * we don't overwrite it when we loop around the journal. + * + * To do that we track, for each journal bucket, the sequence number of the + * newest journal entry it contains - if we don't need that journal entry we + * don't need anything in that bucket anymore. From that we track the last + * journal bucket we still need; all this is tracked in struct journal_device + * and updated by journal_reclaim(). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + atomic_t *pin; + struct jset j; +}; + +/* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. + */ +struct journal_write { + struct jset *data; +#define JSET_BITS 3 + + struct cache_set *c; + struct closure_waitlist wait; + bool dirty; + bool need_write; +}; + +/* Embedded in struct cache_set */ +struct journal { + spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; + bool do_reserve; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; + int io_in_flight; + struct delayed_work work; + + /* Number of blocks free in the bucket(s) we're currently writing to */ + unsigned int blocks_free; + uint64_t seq; + DECLARE_FIFO(atomic_t, pin); + + BKEY_PADDED(key); + + struct journal_write w[2], *cur; +}; + +/* + * Embedded in struct cache. First three fields refer to the array of journal + * buckets, in cache_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + uint64_t seq[SB_JOURNAL_BUCKETS]; + + /* Journal bucket we're currently writing to */ + unsigned int cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + unsigned int last_idx; + + /* Next journal bucket to be discarded */ + unsigned int discard_idx; + +#define DISCARD_READY 0 +#define DISCARD_IN_FLIGHT 1 +#define DISCARD_DONE 2 + /* 1 - discard in flight, -1 - discard completed */ + atomic_t discard_in_flight; + + struct work_struct discard_work; + struct bio discard_bio; + struct bio_vec discard_bv; + + /* Bio for journal reads/writes to this device */ + struct bio bio; + struct bio_vec bv[8]; +}; + +#define BTREE_FLUSH_NR 8 + +#define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + +#define JOURNAL_PIN 20000 + +#define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) + +struct closure; +struct cache_set; +struct btree_op; +struct keylist; + +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent); +void bch_journal_next(struct journal *j); +void bch_journal_mark(struct cache_set *c, struct list_head *list); +void bch_journal_meta(struct cache_set *c, struct closure *cl); +int bch_journal_read(struct cache_set *c, struct list_head *list); +int bch_journal_replay(struct cache_set *c, struct list_head *list); + +void bch_journal_free(struct cache_set *c); +int bch_journal_alloc(struct cache_set *c); +void bch_journal_space_reserve(struct journal *j); + +#endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 000000000..99499d1f6 --- /dev/null +++ b/drivers/md/bcache/movinggc.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include + +struct moving_io { + struct closure cl; + struct keybuf_key *w; + struct data_insert_op op; + struct bbio bio; +}; + +static bool moving_pred(struct keybuf *buf, struct bkey *k) +{ + struct cache_set *c = container_of(buf, struct cache_set, + moving_gc_keys); + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + GC_MOVE(PTR_BUCKET(c, k, i))) + return true; + + return false; +} + +/* Moving GC - IO loop */ + +static void moving_io_destructor(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + + kfree(io); +} + +static void write_moving_finish(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct bio *bio = &io->bio.bio; + + bio_free_pages(bio); + + if (io->op.replace_collision) + trace_bcache_gc_copy_collision(&io->w->key); + + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); + + up(&io->op.c->moving_in_flight); + + closure_return_with_destructor(cl, moving_io_destructor); +} + +static void read_moving_endio(struct bio *bio) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct moving_io *io = container_of(bio->bi_private, + struct moving_io, cl); + + if (bio->bi_status) + io->op.status = bio->bi_status; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(io->op.c, &b->key, 0)) { + io->op.status = BLK_STS_IOERR; + } + + bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move"); +} + +static void moving_init(struct moving_io *io) +{ + struct bio *bio = &io->bio.bio; + + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); + bio_get(bio); + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_private = &io->cl; + bch_bio_map(bio, NULL); +} + +static void write_moving(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; + + if (!op->status) { + moving_init(io); + + io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; + + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); + + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; + + closure_call(&op->cl, bch_data_insert, NULL, cl); + } + + continue_at(cl, write_moving_finish, op->wq); +} + +static void read_moving_submit(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct bio *bio = &io->bio.bio; + + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); + + continue_at(cl, write_moving, io->op.wq); +} + +static void read_moving(struct cache_set *c) +{ + struct keybuf_key *w; + struct moving_io *io; + struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); + + /* XXX: if we error, background writeback could stall indefinitely */ + + while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); + if (!w) + break; + + if (ptr_stale(c, &w->key, 0)) { + bch_keybuf_del(&c->moving_gc_keys, w); + continue; + } + + io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->w = w; + io->op.inode = KEY_INODE(&w->key); + io->op.c = c; + io->op.wq = c->moving_gc_wq; + + moving_init(io); + bio = &io->bio.bio; + + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_end_io = read_moving_endio; + + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) + goto err; + + trace_bcache_gc_copy(&w->key); + + down(&c->moving_in_flight); + closure_call(&io->cl, read_moving_submit, NULL, &cl); + } + + if (0) { +err: if (!IS_ERR_OR_NULL(w->private)) + kfree(w->private); + + bch_keybuf_del(&c->moving_gc_keys, w); + } + + closure_sync(&cl); +} + +static bool bucket_cmp(struct bucket *l, struct bucket *r) +{ + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); +} + +static unsigned int bucket_heap_top(struct cache *ca) +{ + struct bucket *b; + + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; +} + +void bch_moving_gc(struct cache_set *c) +{ + struct cache *ca = c->cache; + struct bucket *b; + unsigned long sectors_to_move, reserve_sectors; + + if (!c->copy_gc_enabled) + return; + + mutex_lock(&c->bucket_lock); + + sectors_to_move = 0; + reserve_sectors = ca->sb.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) + continue; + + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); + } + } + + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); + } + + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); + + mutex_unlock(&c->bucket_lock); + + c->moving_gc_keys.last_scanned = ZERO_KEY; + + read_moving(c); +} + +void bch_moving_init_cache_set(struct cache_set *c) +{ + bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); +} diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 000000000..3427555b0 --- /dev/null +++ b/drivers/md/bcache/request.c @@ -0,0 +1,1345 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Main bcache entry point - handle a read or a write request and decide what to + * do with it; the make_request functions are called by the block layer. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" +#include "writeback.h" + +#include +#include +#include +#include + +#include + +#define CUTOFF_CACHE_ADD 95 +#define CUTOFF_CACHE_READA 90 + +struct kmem_cache *bch_search_cache; + +static void bch_data_insert_start(struct closure *cl); + +static unsigned int cache_mode(struct cached_dev *dc) +{ + return BDEV_CACHE_MODE(&dc->sb); +} + +static bool verify(struct cached_dev *dc) +{ + return dc->verify; +} + +static void bio_csum(struct bio *bio, struct bkey *k) +{ + struct bio_vec bv; + struct bvec_iter iter; + uint64_t csum = 0; + + bio_for_each_segment(bv, bio, iter) { + void *d = bvec_kmap_local(&bv); + + csum = crc64_be(csum, d, bv.bv_len); + kunmap_local(d); + } + + k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); +} + +/* Insert data into cache */ + +static void bch_data_insert_keys(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + atomic_t *journal_ref = NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; + int ret; + + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); + + ret = bch_btree_insert(op->c, &op->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + op->replace_collision = true; + } else if (ret) { + op->status = BLK_STS_RESOURCE; + op->insert_data_done = true; + } + + if (journal_ref) + atomic_dec_bug(journal_ref); + + if (!op->insert_data_done) { + continue_at(cl, bch_data_insert_start, op->wq); + return; + } + + bch_keylist_free(&op->insert_keys); + closure_return(cl); +} + +static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, + struct cache_set *c) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + + /* + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bch_data_insert_keys() will insert the keys created so far + * and finish the rest when the keylist is empty. + */ + if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset)) + return -ENOMEM; + + return __bch_keylist_realloc(l, u64s); +} + +static void bch_data_invalidate(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; + + pr_debug("invalidating %i sectors from %llu\n", + bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); + + while (bio_sectors(bio)) { + unsigned int sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); + + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) + goto out; + + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; + + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, + bio->bi_iter.bi_sector, + sectors)); + } + + op->insert_data_done = true; + /* get in bch_data_insert() */ + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, op->wq); +} + +static void bch_data_insert_error(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + /* + * Our data write just errored, which means we've got a bunch of keys to + * insert that point to data that wasn't successfully written. + * + * We don't have to insert those keys but we still have to invalidate + * that region of the cache - so, if we just strip off all the pointers + * from the keys we'll accomplish just that. + */ + + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; + + while (src != op->insert_keys.top) { + struct bkey *n = bkey_next(src); + + SET_KEY_PTRS(src, 0); + memmove(dst, src, bkey_bytes(src)); + + dst = bkey_next(dst); + src = n; + } + + op->insert_keys.top = dst; + + bch_data_insert_keys(cl); +} + +static void bch_data_insert_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + if (bio->bi_status) { + /* TODO: We could try to recover from this. */ + if (op->writeback) + op->status = bio->bi_status; + else if (!op->replace) + set_closure_fn(cl, bch_data_insert_error, op->wq); + else + set_closure_fn(cl, NULL, NULL); + } + + bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); +} + +static void bch_data_insert_start(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; + + if (op->bypass) + return bch_data_invalidate(cl); + + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + + /* + * Journal writes are marked REQ_PREFLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); + + do { + unsigned int i; + struct bkey *k; + struct bio_set *split = &op->c->bio_split; + + /* 1 for the device pointer and 1 for the chksum */ + if (bch_keylist_realloc(&op->insert_keys, + 3 + (op->csum ? 1 : 0), + op->c)) { + continue_at(cl, bch_data_insert_keys, op->wq); + return; + } + + k = op->insert_keys.top; + bkey_init(k); + SET_KEY_INODE(k, op->inode); + SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); + + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) + goto err; + + n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); + + n->bi_end_io = bch_data_insert_endio; + n->bi_private = cl; + + if (op->writeback) { + SET_KEY_DIRTY(k, true); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_GC_MARK(PTR_BUCKET(op->c, k, i), + GC_MARK_DIRTY); + } + + SET_KEY_CSUM(k, op->csum); + if (KEY_CSUM(k)) + bio_csum(n, k); + + trace_bcache_cache_insert(k); + bch_keylist_push(&op->insert_keys); + + bio_set_op_attrs(n, REQ_OP_WRITE, 0); + bch_submit_bbio(n, op->c, k, 0); + } while (n != bio); + + op->insert_data_done = true; + continue_at(cl, bch_data_insert_keys, op->wq); + return; +err: + /* bch_alloc_sectors() blocks if s->writeback = true */ + BUG_ON(op->writeback); + + /* + * But if it's not a writeback write we'd rather just bail out if + * there aren't any buckets ready to write to - it might take awhile and + * we might be starving btree writes for gc or something. + */ + + if (!op->replace) { + /* + * Writethrough write: We can't complete the write until we've + * updated the index. But we don't want to delay the write while + * we wait for buckets to be freed up, so just invalidate the + * rest of the write. + */ + op->bypass = true; + return bch_data_invalidate(cl); + } else { + /* + * From a cache miss, we can just insert the keys for the data + * we have written or bail out if we didn't do anything. + */ + op->insert_data_done = true; + bio_put(bio); + + if (!bch_keylist_empty(&op->insert_keys)) + continue_at(cl, bch_data_insert_keys, op->wq); + else + closure_return(cl); + } +} + +/** + * bch_data_insert - stick some data in the cache + * @cl: closure pointer. + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data had to be fragmented there will be multiple keys); after the + * data is written it calls bch_journal, and after the keys have been added to + * the next journal write they're inserted into the btree. + * + * It inserts the data in op->bio; bi_sector is used for the key offset, + * and op->inode is used for the key inode. + * + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. + */ +void bch_data_insert(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + trace_bcache_write(op->c, op->inode, op->bio, + op->writeback, op->bypass); + + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); + bch_data_insert_start(cl); +} + +/* + * Congested? Return 0 (not congested) or the limit (in sectors) + * beyond which we should bypass the cache due to congestion. + */ +unsigned int bch_get_congested(const struct cache_set *c) +{ + int i; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + i -= hweight32(get_random_u32()); + + return i > 0 ? i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned int mode = cache_mode(dc); + unsigned int sectors, congested; + struct task_struct *task = current; + struct io *i; + + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio_op(bio) == REQ_OP_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + op_is_write(bio_op(bio)))) + goto skip; + + /* + * If the bio is for read-ahead or background IO, bypass it or + * not depends on the following situations, + * - If the IO is for meta data, always cache it and no bypass + * - If the IO is not meta data, check dc->cache_reada_policy, + * BCH_CACHE_READA_ALL: cache it and not bypass + * BCH_CACHE_READA_META_ONLY: not cache it and bypass + * That is, read-ahead request for metadata always get cached + * (eg, for gfs2 or xfs). + */ + if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && + (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) + goto skip; + } + + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || + bio_sectors(bio) & (c->cache->sb.block_size - 1)) { + pr_debug("skipping unaligned io\n"); + goto skip; + } + + if (bypass_torture_test(dc)) { + if (prandom_u32_max(4) == 3) + goto skip; + else + goto rescale; + } + + congested = bch_get_congested(c); + if (!congested && !dc->sequential_cutoff) + goto rescale; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; +} + +/* Cache lookup */ + +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; + + struct block_device *orig_bdev; + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + +static void bch_cache_read_endio(struct bio *bio) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct closure *cl = bio->bi_private; + struct search *s = container_of(cl, struct search, cl); + + /* + * If the bucket was reused while our bio was in flight, we might have + * read the wrong data. Set s->error but not error so it doesn't get + * counted against the cache device, but we'll still reread the data + * from the backing device. + */ + + if (bio->bi_status) + s->iop.status = bio->bi_status; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.status = BLK_STS_IOERR; + } + + bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache"); +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; + unsigned int ptr; + + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->iop.inode || + KEY_START(k) > bio->bi_iter.bi_sector) { + unsigned int bio_sectors = bio_sectors(bio); + unsigned int sectors = KEY_INODE(k) == s->iop.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_iter.bi_sector) + : INT_MAX; + int ret = s->d->cache_miss(b, s, bio, sectors); + + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); + } + + if (!KEY_SIZE(k)) + return MAP_CONTINUE; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + + n = bio_next_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_iter.bi_sector), + GFP_NOIO, &s->d->bio_split); + + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); + + bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + __bch_submit_bbio(n, b->c); + return n == bio ? MAP_DONE : MAP_CONTINUE; +} + +static void cache_lookup(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, iop.cl); + struct bio *bio = &s->bio.bio; + struct cached_dev *dc; + int ret; + + bch_btree_op_init(&s->op, -1); + + ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); + if (ret == -EAGAIN) { + continue_at(cl, cache_lookup, bcache_wq); + return; + } + + /* + * We might meet err when searching the btree, If that happens, we will + * get negative ret, in this scenario we should not recover data from + * backing device (when cache device is dirty) because we don't know + * whether bkeys the read request covered are all clean. + * + * And after that happened, s->iop.status is still its initial value + * before we submit s->bio.bio + */ + if (ret < 0) { + BUG_ON(ret == -EINTR); + if (s->d && s->d->c && + !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) { + dc = container_of(s->d, struct cached_dev, disk); + if (dc && atomic_read(&dc->has_dirty)) + s->recoverable = false; + } + if (!s->iop.status) + s->iop.status = BLK_STS_IOERR; + } + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); + + s->iop.status = bio->bi_status; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + +static void backing_request_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, + struct cached_dev, disk); + /* + * If a bio has REQ_PREFLUSH for writeback mode, it is + * speically assembled in cached_dev_write() for a non-zero + * write request which has REQ_PREFLUSH. we don't set + * s->iop.status by this failure, the status will be decided + * by result of bch_data_insert() operation. + */ + if (unlikely(s->iop.writeback && + bio->bi_opf & REQ_PREFLUSH)) { + pr_err("Can't flush %pg: returned bi_status %i\n", + dc->bdev, bio->bi_status); + } else { + /* set to orig_bio->bi_status in bio_complete() */ + s->iop.status = bio->bi_status; + } + s->recoverable = false; + /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); + } + + bio_put(bio); + closure_put(cl); +} + +static void bio_complete(struct search *s) +{ + if (s->orig_bio) { + /* Count on bcache device */ + bio_end_io_acct_remapped(s->orig_bio, s->start_time, + s->orig_bdev); + trace_bcache_request_end(s->d, s->orig_bio); + s->orig_bio->bi_status = s->iop.status; + bio_endio(s->orig_bio); + s->orig_bio = NULL; + } +} + +static void do_bio_hook(struct search *s, + struct bio *orig_bio, + bio_end_io_t *end_io_fn) +{ + struct bio *bio = &s->bio.bio; + + bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO); + /* + * bi_end_io can be set separately somewhere else, e.g. the + * variants in, + * - cache_bio->bi_end_io from cached_dev_cache_miss() + * - n->bi_end_io from cache_lookup_fn() + */ + bio->bi_end_io = end_io_fn; + bio->bi_private = &s->cl; + + bio_cnt_set(bio, 3); +} + +static void search_free(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + atomic_dec(&s->iop.c->search_inflight); + + if (s->iop.bio) + bio_put(s->iop.bio); + + bio_complete(s); + closure_debug_destroy(cl); + mempool_free(s, &s->iop.c->search); +} + +static inline struct search *search_alloc(struct bio *bio, + struct bcache_device *d, struct block_device *orig_bdev, + unsigned long start_time) +{ + struct search *s; + + s = mempool_alloc(&d->c->search, GFP_NOIO); + + closure_init(&s->cl, NULL); + do_bio_hook(s, bio, request_endio); + atomic_inc(&d->c->search_inflight); + + s->orig_bio = bio; + s->cache_miss = NULL; + s->cache_missed = 0; + s->d = d; + s->recoverable = 1; + s->write = op_is_write(bio_op(bio)); + s->read_dirty_data = 0; + /* Count on the bcache device */ + s->orig_bdev = orig_bdev; + s->start_time = start_time; + s->iop.c = d->c; + s->iop.bio = NULL; + s->iop.inode = d->id; + s->iop.write_point = hash_long((unsigned long) current, 16); + s->iop.write_prio = 0; + s->iop.status = 0; + s->iop.flags = 0; + s->iop.flush_journal = op_is_flush(bio->bi_opf); + s->iop.wq = bcache_wq; + + return s; +} + +/* Cached devices */ + +static void cached_dev_bio_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + cached_dev_put(dc); + search_free(cl); +} + +/* Process reads */ + +static void cached_dev_read_error_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) + bio_free_pages(s->iop.bio); + + cached_dev_bio_complete(cl); +} + +static void cached_dev_read_error(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; + + /* + * If read request hit dirty data (s->read_dirty_data is true), + * then recovery a failed read request from cached device may + * get a stale data back. So read failure recovery is only + * permitted when read request hit clean data in cache device, + * or when cache read race happened. + */ + if (s->recoverable && !s->read_dirty_data) { + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); + + s->iop.status = 0; + do_bio_hook(s, s->orig_bio, backing_request_endio); + + /* XXX: invalidate cache */ + + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, bio, cl); + } + + continue_at(cl, cached_dev_read_error_done, NULL); +} + +static void cached_dev_cache_miss_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bcache_device *d = s->d; + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) + bio_free_pages(s->iop.bio); + + cached_dev_bio_complete(cl); + closure_put(&d->cl); +} + +static void cached_dev_read_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + /* + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. + * + * First, we copy the data we just read from cache_bio's bounce buffers + * to the buffers the original bio pointed to: + */ + + if (s->iop.bio) { + bio_reset(s->iop.bio, s->cache_miss->bi_bdev, REQ_OP_READ); + s->iop.bio->bi_iter.bi_sector = + s->cache_miss->bi_iter.bi_sector; + s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + bio_clone_blkg_association(s->iop.bio, s->cache_miss); + bch_bio_map(s->iop.bio, NULL); + + bio_copy_data(s->cache_miss, s->iop.bio); + + bio_put(s->cache_miss); + s->cache_miss = NULL; + } + + if (verify(dc) && s->recoverable && !s->read_dirty_data) + bch_data_verify(dc, s->orig_bio); + + closure_get(&dc->disk.cl); + bio_complete(s); + + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + } + + continue_at(cl, cached_dev_cache_miss_done, NULL); +} + +static void cached_dev_read_done_bh(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_missed, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass); + + if (s->iop.status) + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); + else if (s->iop.bio || verify(dc)) + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); + else + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); +} + +static int cached_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors) +{ + int ret = MAP_CONTINUE; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct bio *miss, *cache_bio; + unsigned int size_limit; + + s->cache_missed = 1; + + if (s->cache_miss || s->iop.bypass) { + miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); + ret = miss == bio ? MAP_DONE : MAP_CONTINUE; + goto out_submit; + } + + /* Limitation for valid replace key size and cache_bio bvecs number */ + size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS, + (1 << KEY_SIZE_BITS) - 1); + s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio)); + + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_iter.bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); + + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); + if (ret) + return ret; + + s->iop.replace = true; + + miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO, + &s->d->bio_split); + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = miss == bio ? MAP_DONE : -EINTR; + + cache_bio = bio_alloc_bioset(miss->bi_bdev, + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), + 0, GFP_NOWAIT, &dc->disk.bio_split); + if (!cache_bio) + goto out_submit; + + cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + + cache_bio->bi_end_io = backing_request_endio; + cache_bio->bi_private = &s->cl; + + bch_bio_map(cache_bio, NULL); + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + goto out_put; + + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, cache_bio, &s->cl); + + return ret; +out_put: + bio_put(cache_bio); +out_submit: + miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + return ret; +} + +static void cached_dev_read(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + continue_at(cl, cached_dev_read_done_bh, NULL); +} + +/* Process writes */ + +static void cached_dev_write_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + up_read_non_owner(&dc->writeback_lock); + cached_dev_bio_complete(cl); +} + +static void cached_dev_write(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); + + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); + + down_read_non_owner(&dc->writeback_lock); + if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->iop.bypass = false; + s->iop.writeback = true; + } + + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. + */ + if (bio_op(bio) == REQ_OP_DISCARD) + s->iop.bypass = true; + + if (should_writeback(dc, s->orig_bio, + cache_mode(dc), + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; + } + + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); + + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(dc->bdev)) + goto insert_data; + + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; + + if (bio->bi_opf & REQ_PREFLUSH) { + /* + * Also need to send a flush to the backing + * device. + */ + struct bio *flush; + + flush = bio_alloc_bioset(bio->bi_bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &dc->disk.bio_split); + if (!flush) { + s->iop.status = BLK_STS_RESOURCE; + goto insert_data; + } + flush->bi_end_io = backing_request_endio; + flush->bi_private = cl; + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, flush, cl); + } + } else { + s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &dc->disk.bio_split); + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + } + +insert_data: + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); +} + +static void cached_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; + + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + + continue_at(cl, cached_dev_bio_complete, NULL); +} + +struct detached_dev_io_private { + struct bcache_device *d; + unsigned long start_time; + bio_end_io_t *bi_end_io; + void *bi_private; + struct block_device *orig_bdev; +}; + +static void detached_dev_end_io(struct bio *bio) +{ + struct detached_dev_io_private *ddip; + + ddip = bio->bi_private; + bio->bi_end_io = ddip->bi_end_io; + bio->bi_private = ddip->bi_private; + + /* Count on the bcache device */ + bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); + + if (bio->bi_status) { + struct cached_dev *dc = container_of(ddip->d, + struct cached_dev, disk); + /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); + } + + kfree(ddip); + bio->bi_end_io(bio); +} + +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, + struct block_device *orig_bdev, unsigned long start_time) +{ + struct detached_dev_io_private *ddip; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + /* + * no need to call closure_get(&dc->disk.cl), + * because upper layer had already opened bcache device, + * which would call closure_get(&dc->disk.cl) + */ + ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + if (!ddip) { + bio->bi_status = BLK_STS_RESOURCE; + bio->bi_end_io(bio); + return; + } + + ddip->d = d; + /* Count on the bcache device */ + ddip->orig_bdev = orig_bdev; + ddip->start_time = start_time; + ddip->bi_end_io = bio->bi_end_io; + ddip->bi_private = bio->bi_private; + bio->bi_end_io = detached_dev_end_io; + bio->bi_private = ddip; + + if ((bio_op(bio) == REQ_OP_DISCARD) && + !bdev_max_discard_sectors(dc->bdev)) + bio->bi_end_io(bio); + else + submit_bio_noacct(bio); +} + +static void quit_max_writeback_rate(struct cache_set *c, + struct cached_dev *this_dc) +{ + int i; + struct bcache_device *d; + struct cached_dev *dc; + + /* + * mutex bch_register_lock may compete with other parallel requesters, + * or attach/detach operations on other backing device. Waiting to + * the mutex lock may increase I/O request latency for seconds or more. + * To avoid such situation, if mutext_trylock() failed, only writeback + * rate of current cached device is set to 1, and __update_write_back() + * will decide writeback rate of other cached devices (remember now + * c->idle_counter is 0 already). + */ + if (mutex_trylock(&bch_register_lock)) { + for (i = 0; i < c->devices_max_used; i++) { + if (!c->devices[i]) + continue; + + if (UUID_FLASH_ONLY(&c->uuids[i])) + continue; + + d = c->devices[i]; + dc = container_of(d, struct cached_dev, disk); + /* + * set writeback rate to default minimum value, + * then let update_writeback_rate() to decide the + * upcoming rate. + */ + atomic_long_set(&dc->writeback_rate.rate, 1); + } + mutex_unlock(&bch_register_lock); + } else + atomic_long_set(&this_dc->writeback_rate.rate, 1); +} + +/* Cached devices - read & write stuff */ + +void cached_dev_submit_bio(struct bio *bio) +{ + struct search *s; + struct block_device *orig_bdev = bio->bi_bdev; + struct bcache_device *d = orig_bdev->bd_disk->private_data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + unsigned long start_time; + int rw = bio_data_dir(bio); + + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || + dc->io_disable)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + + if (likely(d->c)) { + if (atomic_read(&d->c->idle_counter)) + atomic_set(&d->c->idle_counter, 0); + /* + * If at_max_writeback_rate of cache set is true and new I/O + * comes, quit max writeback rate of all cached devices + * attached to this cache set, and set at_max_writeback_rate + * to false. + */ + if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { + atomic_set(&d->c->at_max_writeback_rate, 0); + quit_max_writeback_rate(d->c, dc); + } + } + + start_time = bio_start_io_acct(bio); + + bio_set_dev(bio, dc->bdev); + bio->bi_iter.bi_sector += dc->sb.data_offset; + + if (cached_dev_get(dc)) { + s = search_alloc(bio, d, orig_bdev, start_time); + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under + * submit_bio_noacct + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + } else { + s->iop.bypass = check_should_bypass(dc, bio); + + if (rw) + cached_dev_write(dc, s); + else + cached_dev_read(dc, s); + } + } else + /* I/O request sent to backing device */ + detached_dev_do_request(d, bio, orig_bdev, start_time); +} + +static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + if (dc->io_disable) + return -EIO; + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); +} + +void bch_cached_dev_request_init(struct cached_dev *dc) +{ + dc->disk.cache_miss = cached_dev_cache_miss; + dc->disk.ioctl = cached_dev_ioctl; +} + +/* Flash backed devices */ + +static int flash_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors) +{ + unsigned int bytes = min(sectors, bio_sectors(bio)) << 9; + + swap(bio->bi_iter.bi_size, bytes); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, bytes); + + bio_advance(bio, bytes); + + if (!bio->bi_iter.bi_size) + return MAP_DONE; + + return MAP_CONTINUE; +} + +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + + continue_at(cl, search_free, NULL); +} + +void flash_dev_submit_bio(struct bio *bio) +{ + struct search *s; + struct closure *cl; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + + if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + + s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); + cl = &s->cl; + bio = &s->bio.bio; + + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under submit_bio_noacct + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); + return; + } else if (bio_data_dir(bio)) { + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, + &KEY(d->id, bio->bi_iter.bi_sector, 0), + &KEY(d->id, bio_end_sector(bio), 0)); + + s->iop.bypass = (bio_op(bio) == REQ_OP_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; + + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + } else { + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + } + + continue_at(cl, search_free, NULL); +} + +static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} + +void bch_flash_dev_request_init(struct bcache_device *d) +{ + d->cache_miss = flash_dev_cache_miss; + d->ioctl = flash_dev_ioctl; +} + +void bch_request_exit(void) +{ + kmem_cache_destroy(bch_search_cache); +} + +int __init bch_request_init(void) +{ + bch_search_cache = KMEM_CACHE(search, 0); + if (!bch_search_cache) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h new file mode 100644 index 000000000..38ab4856e --- /dev/null +++ b/drivers/md/bcache/request.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_REQUEST_H_ +#define _BCACHE_REQUEST_H_ + +struct data_insert_op { + struct closure cl; + struct cache_set *c; + struct bio *bio; + struct workqueue_struct *wq; + + unsigned int inode; + uint16_t write_point; + uint16_t write_prio; + blk_status_t status; + + union { + uint16_t flags; + + struct { + unsigned int bypass:1; + unsigned int writeback:1; + unsigned int flush_journal:1; + unsigned int csum:1; + + unsigned int replace:1; + unsigned int replace_collision:1; + + unsigned int insert_data_done:1; + }; + }; + + struct keylist insert_keys; + BKEY_PADDED(replace_key); +}; + +unsigned int bch_get_congested(const struct cache_set *c); +void bch_data_insert(struct closure *cl); + +void bch_cached_dev_request_init(struct cached_dev *dc); +void cached_dev_submit_bio(struct bio *bio); + +void bch_flash_dev_request_init(struct bcache_device *d); +void flash_dev_submit_bio(struct bio *bio); + +extern struct kmem_cache *bch_search_cache; + +#endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c new file mode 100644 index 000000000..68b022160 --- /dev/null +++ b/drivers/md/bcache/stats.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache stats code + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "stats.h" +#include "btree.h" +#include "sysfs.h" + +/* + * We keep absolute totals of various statistics, and addionally a set of three + * rolling averages. + * + * Every so often, a timer goes off and rescales the rolling averages. + * accounting_rescale[] is how many times the timer has to go off before we + * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, + * and one day. + * + * accounting_delay is how often the timer goes off - 22 times in 5 minutes, + * and accounting_weight is what we use to rescale: + * + * pow(31 / 32, 22) ~= 1/2 + * + * So that we don't have to increment each set of numbers every time we (say) + * get a cache hit, we increment a single atomic_t in acc->collector, and when + * the rescale function runs it resets the atomic counter to 0 and adds its + * old value to each of the exported numbers. + * + * To reduce rounding error, the numbers in struct cache_stats are all + * stored left shifted by 16, and scaled back in the sysfs show() function. + */ + +static const unsigned int DAY_RESCALE = 288; +static const unsigned int HOUR_RESCALE = 12; +static const unsigned int FIVE_MINUTE_RESCALE = 1; +static const unsigned int accounting_delay = (HZ * 300) / 22; +static const unsigned int accounting_weight = 32; + +/* sysfs reading/writing */ + +read_attribute(cache_hits); +read_attribute(cache_misses); +read_attribute(cache_bypass_hits); +read_attribute(cache_bypass_misses); +read_attribute(cache_hit_ratio); +read_attribute(cache_miss_collisions); +read_attribute(bypassed); + +SHOW(bch_stats) +{ + struct cache_stats *s = + container_of(kobj, struct cache_stats, kobj); +#define var(stat) (s->stat >> 16) + var_print(cache_hits); + var_print(cache_misses); + var_print(cache_bypass_hits); + var_print(cache_bypass_misses); + + sysfs_print(cache_hit_ratio, + DIV_SAFE(var(cache_hits) * 100, + var(cache_hits) + var(cache_misses))); + + var_print(cache_miss_collisions); + sysfs_hprint(bypassed, var(sectors_bypassed) << 9); +#undef var + return 0; +} + +STORE(bch_stats) +{ + return size; +} + +static void bch_stats_release(struct kobject *k) +{ +} + +static struct attribute *bch_stats_attrs[] = { + &sysfs_cache_hits, + &sysfs_cache_misses, + &sysfs_cache_bypass_hits, + &sysfs_cache_bypass_misses, + &sysfs_cache_hit_ratio, + &sysfs_cache_miss_collisions, + &sysfs_bypassed, + NULL +}; +ATTRIBUTE_GROUPS(bch_stats); +static KTYPE(bch_stats); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent) +{ + int ret = kobject_add(&acc->total.kobj, parent, + "stats_total"); + ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, + "stats_five_minute"); + ret = ret ?: kobject_add(&acc->hour.kobj, parent, + "stats_hour"); + ret = ret ?: kobject_add(&acc->day.kobj, parent, + "stats_day"); + return ret; +} + +void bch_cache_accounting_clear(struct cache_accounting *acc) +{ + acc->total.cache_hits = 0; + acc->total.cache_misses = 0; + acc->total.cache_bypass_hits = 0; + acc->total.cache_bypass_misses = 0; + acc->total.cache_miss_collisions = 0; + acc->total.sectors_bypassed = 0; +} + +void bch_cache_accounting_destroy(struct cache_accounting *acc) +{ + kobject_put(&acc->total.kobj); + kobject_put(&acc->five_minute.kobj); + kobject_put(&acc->hour.kobj); + kobject_put(&acc->day.kobj); + + atomic_set(&acc->closing, 1); + if (del_timer_sync(&acc->timer)) + closure_return(&acc->cl); +} + +/* EWMA scaling */ + +static void scale_stat(unsigned long *stat) +{ + *stat = ewma_add(*stat, 0, accounting_weight, 0); +} + +static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) +{ + if (++stats->rescale == rescale_at) { + stats->rescale = 0; + scale_stat(&stats->cache_hits); + scale_stat(&stats->cache_misses); + scale_stat(&stats->cache_bypass_hits); + scale_stat(&stats->cache_bypass_misses); + scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->sectors_bypassed); + } +} + +static void scale_accounting(struct timer_list *t) +{ + struct cache_accounting *acc = from_timer(acc, t, timer); + +#define move_stat(name) do { \ + unsigned int t = atomic_xchg(&acc->collector.name, 0); \ + t <<= 16; \ + acc->five_minute.name += t; \ + acc->hour.name += t; \ + acc->day.name += t; \ + acc->total.name += t; \ +} while (0) + + move_stat(cache_hits); + move_stat(cache_misses); + move_stat(cache_bypass_hits); + move_stat(cache_bypass_misses); + move_stat(cache_miss_collisions); + move_stat(sectors_bypassed); + + scale_stats(&acc->total, 0); + scale_stats(&acc->day, DAY_RESCALE); + scale_stats(&acc->hour, HOUR_RESCALE); + scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); + + acc->timer.expires += accounting_delay; + + if (!atomic_read(&acc->closing)) + add_timer(&acc->timer); + else + closure_return(&acc->cl); +} + +static void mark_cache_stats(struct cache_stat_collector *stats, + bool hit, bool bypass) +{ + if (!bypass) + if (hit) + atomic_inc(&stats->cache_hits); + else + atomic_inc(&stats->cache_misses); + else + if (hit) + atomic_inc(&stats->cache_bypass_hits); + else + atomic_inc(&stats->cache_bypass_misses); +} + +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + mark_cache_stats(&dc->accounting.collector, hit, bypass); + mark_cache_stats(&c->accounting.collector, hit, bypass); +} + +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + atomic_inc(&dc->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); +} + +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) +{ + atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); +} + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + timer_setup(&acc->timer, scale_accounting, 0); + acc->timer.expires = jiffies + accounting_delay; + add_timer(&acc->timer); +} diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 000000000..bd3afc856 --- /dev/null +++ b/drivers/md/bcache/stats.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_STATS_H_ +#define _BCACHE_STATS_H_ + +struct cache_stat_collector { + atomic_t cache_hits; + atomic_t cache_misses; + atomic_t cache_bypass_hits; + atomic_t cache_bypass_misses; + atomic_t cache_miss_collisions; + atomic_t sectors_bypassed; +}; + +struct cache_stats { + struct kobject kobj; + + unsigned long cache_hits; + unsigned long cache_misses; + unsigned long cache_bypass_hits; + unsigned long cache_bypass_misses; + unsigned long cache_readaheads; + unsigned long cache_miss_collisions; + unsigned long sectors_bypassed; + + unsigned int rescale; +}; + +struct cache_accounting { + struct closure cl; + struct timer_list timer; + atomic_t closing; + + struct cache_stat_collector collector; + + struct cache_stats total; + struct cache_stats five_minute; + struct cache_stats hour; + struct cache_stats day; +}; + +struct cache_set; +struct cached_dev; +struct bcache_device; + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent); + +void bch_cache_accounting_clear(struct cache_accounting *acc); + +void bch_cache_accounting_destroy(struct cache_accounting *acc); + +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass); +void bch_mark_cache_miss_collision(struct cache_set *c, + struct bcache_device *d); +void bch_mark_sectors_bypassed(struct cache_set *c, + struct cached_dev *dc, + int sectors); + +#endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 000000000..70e5bd896 --- /dev/null +++ b/drivers/md/bcache/super.c @@ -0,0 +1,2941 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "request.h" +#include "writeback.h" +#include "features.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int bch_cutoff_writeback; +unsigned int bch_cutoff_writeback_sync; + +static const char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 +}; + +static const char invalid_uuid[] = { + 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, + 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 +}; + +static struct kobject *bcache_kobj; +struct mutex bch_register_lock; +bool bcache_is_reboot; +LIST_HEAD(bch_cache_sets); +static LIST_HEAD(uncached_devices); + +static int bcache_major; +static DEFINE_IDA(bcache_device_idx); +static wait_queue_head_t unregister_wait; +struct workqueue_struct *bcache_wq; +struct workqueue_struct *bch_flush_wq; +struct workqueue_struct *bch_journal_wq; + + +#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) + +/* Superblock */ + +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + if (bch_has_feature_large_bucket(sb)) { + unsigned int max, order; + + max = sizeof(unsigned int) * BITS_PER_BYTE - 1; + order = le16_to_cpu(s->bucket_size); + /* + * bcache tool will make sure the overflow won't + * happen, an error message here is enough. + */ + if (order > max) + pr_err("Bucket size (1 << %u) overflows\n", + order); + bucket_size = 1 << order; + } else if (bch_has_feature_obso_large_bucket(sb)) { + bucket_size += + le16_to_cpu(s->obso_bucket_size_hi) << 16; + } + } + + return bucket_size; +} + +static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, + struct cache_sb_disk *s) +{ + const char *err; + unsigned int i; + + sb->first_bucket= le16_to_cpu(s->first_bucket); + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->bucket_size = get_bucket_size(sb, s); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block size (not power of 2)"; + if (!is_power_of_2(sb->block_size)) + goto err; + + err = "Bad block size (larger than page size)"; + if (sb->block_size > PAGE_SECTORS) + goto err; + + err = "Bad bucket size (not power of 2)"; + if (!is_power_of_2(sb->bucket_size)) + goto err; + + err = "Bad bucket size (smaller than page size)"; + if (sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + err = NULL; +err: + return err; +} + + +static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + struct cache_sb_disk **res) +{ + const char *err; + struct cache_sb_disk *s; + struct page *page; + unsigned int i; + + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(page)) + return "IO error"; + s = page_address(page) + offset_in_page(SB_OFFSET); + + sb->offset = le64_to_cpu(s->offset); + sb->version = le64_to_cpu(s->version); + + memcpy(sb->magic, s->magic, 16); + memcpy(sb->uuid, s->uuid, 16); + memcpy(sb->set_uuid, s->set_uuid, 16); + memcpy(sb->label, s->label, SB_LABEL_SIZE); + + sb->flags = le64_to_cpu(s->flags); + sb->seq = le64_to_cpu(s->seq); + sb->last_mount = le32_to_cpu(s->last_mount); + sb->keys = le16_to_cpu(s->keys); + + for (i = 0; i < SB_JOURNAL_BUCKETS; i++) + sb->d[i] = le64_to_cpu(s->d[i]); + + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n", + sb->version, sb->flags, sb->seq, sb->keys); + + err = "Not a bcache superblock (bad offset)"; + if (sb->offset != SB_SECTOR) + goto err; + + err = "Not a bcache superblock (bad magic)"; + if (memcmp(sb->magic, bcache_magic, 16)) + goto err; + + err = "Bad checksum"; + if (s->csum != csum_set(s)) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->uuid, 16)) + goto err; + + sb->block_size = le16_to_cpu(s->block_size); + + err = "Superblock block size smaller than device block size"; + if (sb->block_size << 9 < bdev_logical_block_size(bdev)) + goto err; + + switch (sb->version) { + case BCACHE_SB_VERSION_BDEV: + sb->data_offset = BDEV_DATA_START_DEFAULT; + break; + case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: + sb->data_offset = le64_to_cpu(s->data_offset); + + err = "Bad data offset"; + if (sb->data_offset < BDEV_DATA_START_DEFAULT) + goto err; + + break; + case BCACHE_SB_VERSION_CDEV: + case BCACHE_SB_VERSION_CDEV_WITH_UUID: + err = read_super_common(sb, bdev, s); + if (err) + goto err; + break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + + /* Check incompatible features */ + err = "Unsupported compatible feature found"; + if (bch_has_unknown_compat_features(sb)) + goto err; + + err = "Unsupported read-only compatible feature found"; + if (bch_has_unknown_ro_compat_features(sb)) + goto err; + + err = "Unsupported incompatible feature found"; + if (bch_has_unknown_incompat_features(sb)) + goto err; + + err = read_super_common(sb, bdev, s); + if (err) + goto err; + break; + default: + err = "Unsupported superblock version"; + goto err; + } + + sb->last_mount = (u32)ktime_get_real_seconds(); + *res = s; + return NULL; +err: + put_page(page); + return err; +} + +static void write_bdev_super_endio(struct bio *bio) +{ + struct cached_dev *dc = bio->bi_private; + + if (bio->bi_status) + bch_count_backing_io_errors(dc, bio); + + closure_put(&dc->sb_write); +} + +static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, + struct bio *bio) +{ + unsigned int i; + + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; + bio->bi_iter.bi_sector = SB_SECTOR; + __bio_add_page(bio, virt_to_page(out), SB_SIZE, + offset_in_page(out)); + + out->offset = cpu_to_le64(sb->offset); + + memcpy(out->uuid, sb->uuid, 16); + memcpy(out->set_uuid, sb->set_uuid, 16); + memcpy(out->label, sb->label, SB_LABEL_SIZE); + + out->flags = cpu_to_le64(sb->flags); + out->seq = cpu_to_le64(sb->seq); + + out->last_mount = cpu_to_le32(sb->last_mount); + out->first_bucket = cpu_to_le16(sb->first_bucket); + out->keys = cpu_to_le16(sb->keys); + + for (i = 0; i < sb->keys; i++) + out->d[i] = cpu_to_le64(sb->d[i]); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); + out->csum = csum_set(out); + + pr_debug("ver %llu, flags %llu, seq %llu\n", + sb->version, sb->flags, sb->seq); + + submit_bio(bio); +} + +static void bch_write_bdev_super_unlock(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + + up(&dc->sb_write_mutex); +} + +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) +{ + struct closure *cl = &dc->sb_write; + struct bio *bio = &dc->sb_bio; + + down(&dc->sb_write_mutex); + closure_init(cl, parent); + + bio_init(bio, dc->bdev, dc->sb_bv, 1, 0); + bio->bi_end_io = write_bdev_super_endio; + bio->bi_private = dc; + + closure_get(cl); + /* I/O request sent to backing device */ + __write_super(&dc->sb, dc->sb_disk, bio); + + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); +} + +static void write_super_endio(struct bio *bio) +{ + struct cache *ca = bio->bi_private; + + /* is_read = 0 */ + bch_count_io_errors(ca, bio->bi_status, 0, + "writing superblock"); + closure_put(&ca->set->sb_write); +} + +static void bcache_write_super_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, sb_write); + + up(&c->sb_write_mutex); +} + +void bcache_write_super(struct cache_set *c) +{ + struct closure *cl = &c->sb_write; + struct cache *ca = c->cache; + struct bio *bio = &ca->sb_bio; + unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + + down(&c->sb_write_mutex); + closure_init(cl, &c->cl); + + ca->sb.seq++; + + if (ca->sb.version < version) + ca->sb.version = version; + + bio_init(bio, ca->bdev, ca->sb_bv, 1, 0); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + + closure_get(cl); + __write_super(&ca->sb, ca->sb_disk, bio); + + closure_return_with_destructor(cl, bcache_write_super_unlock); +} + +/* UUID io */ + +static void uuid_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + cache_set_err_on(bio->bi_status, c, "accessing uuids"); + bch_bbio_free(bio, c); + closure_put(cl); +} + +static void uuid_io_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + up(&c->uuid_write_mutex); +} + +static void uuid_io(struct cache_set *c, blk_opf_t opf, struct bkey *k, + struct closure *parent) +{ + struct closure *cl = &c->uuid_write; + struct uuid_entry *u; + unsigned int i; + char buf[80]; + + BUG_ON(!parent); + down(&c->uuid_write_mutex); + closure_init(cl, parent); + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bio *bio = bch_bbio_alloc(c); + + bio->bi_opf = opf | REQ_SYNC | REQ_META; + bio->bi_iter.bi_size = KEY_SIZE(k) << 9; + + bio->bi_end_io = uuid_endio; + bio->bi_private = cl; + bch_bio_map(bio, c->uuids); + + bch_submit_bbio(bio, c, k, i); + + if ((opf & REQ_OP_MASK) != REQ_OP_WRITE) + break; + } + + bch_extent_to_text(buf, sizeof(buf), k); + pr_debug("%s UUIDs at %s\n", (opf & REQ_OP_MASK) == REQ_OP_WRITE ? + "wrote" : "read", buf); + + for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) + if (!bch_is_zero(u->uuid, 16)) + pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n", + u - c->uuids, u->uuid, u->label, + u->first_reg, u->last_reg, u->invalidated); + + closure_return_with_destructor(cl, uuid_io_unlock); +} + +static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) +{ + struct bkey *k = &j->uuid_bucket; + + if (__bch_btree_ptr_invalid(c, k)) + return "bad uuid pointer"; + + bkey_copy(&c->uuid_bucket, k); + uuid_io(c, REQ_OP_READ, k, cl); + + if (j->version < BCACHE_JSET_VERSION_UUIDv1) { + struct uuid_entry_v0 *u0 = (void *) c->uuids; + struct uuid_entry *u1 = (void *) c->uuids; + int i; + + closure_sync(cl); + + /* + * Since the new uuid entry is bigger than the old, we have to + * convert starting at the highest memory address and work down + * in order to do it in place + */ + + for (i = c->nr_uuids - 1; + i >= 0; + --i) { + memcpy(u1[i].uuid, u0[i].uuid, 16); + memcpy(u1[i].label, u0[i].label, 32); + + u1[i].first_reg = u0[i].first_reg; + u1[i].last_reg = u0[i].last_reg; + u1[i].invalidated = u0[i].invalidated; + + u1[i].flags = 0; + u1[i].sectors = 0; + } + } + + return NULL; +} + +static int __uuid_write(struct cache_set *c) +{ + BKEY_PADDED(key) k; + struct closure cl; + struct cache *ca = c->cache; + unsigned int size; + + closure_init_stack(&cl); + lockdep_assert_held(&bch_register_lock); + + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) + return 1; + + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; + SET_KEY_SIZE(&k.key, size); + uuid_io(c, REQ_OP_WRITE, &k.key, &cl); + closure_sync(&cl); + + /* Only one bucket used for uuid write */ + atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written); + + bkey_copy(&c->uuid_bucket, &k.key); + bkey_put(c, &k.key); + return 0; +} + +int bch_uuid_write(struct cache_set *c) +{ + int ret = __uuid_write(c); + + if (!ret) + bch_journal_meta(c, NULL); + + return ret; +} + +static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) +{ + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids; u++) + if (!memcmp(u->uuid, uuid, 16)) + return u; + + return NULL; +} + +static struct uuid_entry *uuid_find_empty(struct cache_set *c) +{ + static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + + return uuid_find(c, zero_uuid); +} + +/* + * Bucket priorities/gens: + * + * For each bucket, we store on disk its + * 8 bit gen + * 16 bit priority + * + * See alloc.c for an explanation of the gen. The priority is used to implement + * lru (and in the future other) cache replacement policies; for most purposes + * it's just an opaque integer. + * + * The gens and the priorities don't have a whole lot to do with each other, and + * it's actually the gens that must be written out at specific times - it's no + * big deal if the priorities don't get written, if we lose them we just reuse + * buckets in suboptimal order. + * + * On disk they're stored in a packed array, and in as many buckets are required + * to fit them all. The buckets we use to store them form a list; the journal + * header points to the first bucket, the first bucket points to the second + * bucket, et cetera. + * + * This code is used by the allocation code; periodically (whenever it runs out + * of buckets to allocate from) the allocation code will invalidate some + * buckets, but it can't use those buckets until their new gens are safely on + * disk. + */ + +static void prio_endio(struct bio *bio) +{ + struct cache *ca = bio->bi_private; + + cache_set_err_on(bio->bi_status, ca->set, "accessing priorities"); + bch_bbio_free(bio, ca->set); + closure_put(&ca->prio); +} + +static void prio_io(struct cache *ca, uint64_t bucket, blk_opf_t opf) +{ + struct closure *cl = &ca->prio; + struct bio *bio = bch_bbio_alloc(ca->set); + + closure_init_stack(cl); + + bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; + bio_set_dev(bio, ca->bdev); + bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb); + + bio->bi_end_io = prio_endio; + bio->bi_private = ca; + bio->bi_opf = opf | REQ_SYNC | REQ_META; + bch_bio_map(bio, ca->disk_buckets); + + closure_bio_submit(ca->set, bio, &ca->prio); + closure_sync(cl); +} + +int bch_prio_write(struct cache *ca, bool wait) +{ + int i; + struct bucket *b; + struct closure cl; + + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + + closure_init_stack(&cl); + + lockdep_assert_held(&ca->set->bucket_lock); + + ca->disk_buckets->seq++; + + atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + long bucket; + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data; + struct bucket_disk *end = d + prios_per_bucket(ca); + + for (b = ca->buckets + i * prios_per_bucket(ca); + b < ca->buckets + ca->sb.nbuckets && d < end; + b++, d++) { + d->prio = cpu_to_le16(b->prio); + d->gen = b->gen; + } + + p->next_bucket = ca->prio_buckets[i + 1]; + p->magic = pset_magic(&ca->sb); + p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8); + + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); + BUG_ON(bucket == -1); + + mutex_unlock(&ca->set->bucket_lock); + prio_io(ca, bucket, REQ_OP_WRITE); + mutex_lock(&ca->set->bucket_lock); + + ca->prio_buckets[i] = bucket; + atomic_dec_bug(&ca->buckets[bucket].pin); + } + + mutex_unlock(&ca->set->bucket_lock); + + bch_journal_meta(ca->set, &cl); + closure_sync(&cl); + + mutex_lock(&ca->set->bucket_lock); + + /* + * Don't want the old priorities to get garbage collected until after we + * finish writing the new ones, and they're journalled + */ + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } + return 0; +} + +static int prio_read(struct cache *ca, uint64_t bucket) +{ + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket *b; + unsigned int bucket_nr = 0; + int ret = -EIO; + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; + b++, d++) { + if (d == end) { + ca->prio_buckets[bucket_nr] = bucket; + ca->prio_last_buckets[bucket_nr] = bucket; + bucket_nr++; + + prio_io(ca, bucket, REQ_OP_READ); + + if (p->csum != + bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) { + pr_warn("bad csum reading priorities\n"); + goto out; + } + + if (p->magic != pset_magic(&ca->sb)) { + pr_warn("bad magic reading priorities\n"); + goto out; + } + + bucket = p->next_bucket; + d = p->data; + } + + b->prio = le16_to_cpu(d->prio); + b->gen = b->last_gc = d->gen; + } + + ret = 0; +out: + return ret; +} + +/* Bcache device */ + +static int open_dev(struct block_device *b, fmode_t mode) +{ + struct bcache_device *d = b->bd_disk->private_data; + + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) + return -ENXIO; + + closure_get(&d->cl); + return 0; +} + +static void release_dev(struct gendisk *b, fmode_t mode) +{ + struct bcache_device *d = b->private_data; + + closure_put(&d->cl); +} + +static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct bcache_device *d = b->bd_disk->private_data; + + return d->ioctl(d, mode, cmd, arg); +} + +static const struct block_device_operations bcache_cached_ops = { + .submit_bio = cached_dev_submit_bio, + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +static const struct block_device_operations bcache_flash_ops = { + .submit_bio = flash_dev_submit_bio, + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +void bcache_device_stop(struct bcache_device *d) +{ + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + /* + * closure_fn set to + * - cached device: cached_dev_flush() + * - flash dev: flash_dev_flush() + */ + closure_queue(&d->cl); +} + +static void bcache_device_unlink(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + struct cache *ca = d->c->cache; + + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + bd_unlink_disk_holder(ca->bdev, d->disk); + } +} + +static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + const char *name) +{ + struct cache *ca = c->cache; + int ret; + + bd_link_disk_holder(ca->bdev, d->disk); + + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + + ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); + if (ret < 0) + pr_err("Couldn't create device -> cache set symlink\n"); + + ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); + if (ret < 0) + pr_err("Couldn't create cache set -> device symlink\n"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); +} + +static void bcache_device_detach(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + atomic_dec(&d->c->attached_dev_nr); + + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { + struct uuid_entry *u = d->c->uuids + d->id; + + SET_UUID_FLASH_ONLY(u, 0); + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); + bch_uuid_write(d->c); + } + + bcache_device_unlink(d); + + d->c->devices[d->id] = NULL; + closure_put(&d->c->caching); + d->c = NULL; +} + +static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, + unsigned int id) +{ + d->id = id; + d->c = c; + c->devices[id] = d; + + if (id >= c->devices_max_used) + c->devices_max_used = id + 1; + + closure_get(&c->caching); +} + +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + +static void bcache_device_free(struct bcache_device *d) +{ + struct gendisk *disk = d->disk; + + lockdep_assert_held(&bch_register_lock); + + if (disk) + pr_info("%s stopped\n", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped\n"); + + if (d->c) + bcache_device_detach(d); + + if (disk) { + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(disk->first_minor)); + put_disk(disk); + } + + bioset_exit(&d->bio_split); + kvfree(d->full_dirty_stripes); + kvfree(d->stripe_sectors_dirty); + + closure_debug_destroy(&d->cl); +} + +static int bcache_device_init(struct bcache_device *d, unsigned int block_size, + sector_t sectors, struct block_device *cached_bdev, + const struct block_device_operations *ops) +{ + struct request_queue *q; + const size_t max_stripes = min_t(size_t, INT_MAX, + SIZE_MAX / sizeof(atomic_t)); + uint64_t n; + int idx; + + if (!d->stripe_size) + d->stripe_size = 1 << 31; + else if (d->stripe_size < BCH_MIN_STRIPE_SZ) + d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size); + + n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); + if (!n || n > max_stripes) { + pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n", + n); + return -ENOMEM; + } + d->nr_stripes = n; + + n = d->nr_stripes * sizeof(atomic_t); + d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL); + if (!d->stripe_sectors_dirty) + return -ENOMEM; + + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); + if (!d->full_dirty_stripes) + goto out_free_stripe_sectors_dirty; + + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + goto out_free_full_dirty_stripes; + + if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto out_ida_remove; + + d->disk = blk_alloc_disk(NUMA_NO_NODE); + if (!d->disk) + goto out_bioset_exit; + + set_capacity(d->disk, sectors); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); + + d->disk->major = bcache_major; + d->disk->first_minor = idx_to_first_minor(idx); + d->disk->minors = BCACHE_MINORS; + d->disk->fops = ops; + d->disk->private_data = d; + + q = d->disk->queue; + q->limits.max_hw_sectors = UINT_MAX; + q->limits.max_sectors = UINT_MAX; + q->limits.max_segment_size = UINT_MAX; + q->limits.max_segments = BIO_MAX_VECS; + blk_queue_max_discard_sectors(q, UINT_MAX); + q->limits.discard_granularity = 512; + q->limits.io_min = block_size; + q->limits.logical_block_size = block_size; + q->limits.physical_block_size = block_size; + + if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) { + /* + * This should only happen with BCACHE_SB_VERSION_BDEV. + * Block/page size is checked for BCACHE_SB_VERSION_CDEV. + */ + pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", + d->disk->disk_name, q->limits.logical_block_size, + PAGE_SIZE, bdev_logical_block_size(cached_bdev)); + + /* This also adjusts physical block size/min io size if needed */ + blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev)); + } + + blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); + + blk_queue_write_cache(q, true, true); + + return 0; + +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: + ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); + return -ENOMEM; + +} + +/* Cached device */ + +static void calc_cached_dev_sectors(struct cache_set *c) +{ + uint64_t sectors = 0; + struct cached_dev *dc; + + list_for_each_entry(dc, &c->cached_devs, list) + sectors += bdev_nr_sectors(dc->bdev); + + c->cached_dev_sectors = sectors; +} + +#define BACKING_DEV_OFFLINE_TIMEOUT 5 +static int cached_dev_status_update(void *arg) +{ + struct cached_dev *dc = arg; + struct request_queue *q; + + /* + * If this delayed worker is stopping outside, directly quit here. + * dc->io_disable might be set via sysfs interface, so check it + * here too. + */ + while (!kthread_should_stop() && !dc->io_disable) { + q = bdev_get_queue(dc->bdev); + if (blk_queue_dying(q)) + dc->offline_seconds++; + else + dc->offline_seconds = 0; + + if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { + pr_err("%pg: device offline for %d seconds\n", + dc->bdev, + BACKING_DEV_OFFLINE_TIMEOUT); + pr_err("%s: disable I/O request due to backing device offline\n", + dc->disk.name); + dc->io_disable = true; + /* let others know earlier that io_disable is true */ + smp_mb(); + bcache_device_stop(&dc->disk); + break; + } + schedule_timeout_interruptible(HZ); + } + + wait_for_kthread_stop(); + return 0; +} + + +int bch_cached_dev_run(struct cached_dev *dc) +{ + int ret = 0; + struct bcache_device *d = &dc->disk; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); + char *env[] = { + "DRIVER=bcache", + kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), + kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""), + NULL, + }; + + if (dc->io_disable) { + pr_err("I/O disabled on cached dev %pg\n", dc->bdev); + ret = -EIO; + goto out; + } + + if (atomic_xchg(&dc->running, 1)) { + pr_info("cached dev %pg is running already\n", dc->bdev); + ret = -EBUSY; + goto out; + } + + if (!d->c && + BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { + struct closure cl; + + closure_init_stack(&cl); + + SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } + + ret = add_disk(d->disk); + if (ret) + goto out; + bd_link_disk_holder(dc->bdev, dc->disk.disk); + /* + * won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent + */ + kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, + &d->kobj, "bcache")) { + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n"); + ret = -ENOMEM; + goto out; + } + + dc->status_update_thread = kthread_run(cached_dev_status_update, + dc, "bcache_status_update"); + if (IS_ERR(dc->status_update_thread)) { + pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n"); + } + +out: + kfree(env[1]); + kfree(env[2]); + kfree(buf); + return ret; +} + +/* + * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed + * work dc->writeback_rate_update is running. Wait until the routine + * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to + * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out + * seconds, give up waiting here and continue to cancel it too. + */ +static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) +{ + int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; + + do { + if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, + &dc->disk.flags)) + break; + time_out--; + schedule_timeout_interruptible(1); + } while (time_out > 0); + + if (time_out == 0) + pr_warn("give up waiting for dc->writeback_write_update to quit\n"); + + cancel_delayed_work_sync(&dc->writeback_rate_update); +} + +static void cached_dev_detach_finish(struct work_struct *w) +{ + struct cached_dev *dc = container_of(w, struct cached_dev, detach); + struct cache_set *c = dc->disk.c; + + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); + BUG_ON(refcount_read(&dc->count)); + + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { + kthread_stop(dc->writeback_thread); + dc->writeback_thread = NULL; + } + + mutex_lock(&bch_register_lock); + + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); + calc_cached_dev_sectors(c); + + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); + + mutex_unlock(&bch_register_lock); + + pr_info("Caching disabled for %pg\n", dc->bdev); + + /* Drop ref we took in cached_dev_detach() */ + closure_put(&dc->disk.cl); +} + +void bch_cached_dev_detach(struct cached_dev *dc) +{ + lockdep_assert_held(&bch_register_lock); + + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return; + + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + return; + + /* + * Block the device from being closed and freed until we're finished + * detaching + */ + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); + + cached_dev_put(dc); +} + +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid) +{ + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); + struct uuid_entry *u; + struct cached_dev *exist_dc, *t; + int ret = 0; + + if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16))) + return -ENOENT; + + if (dc->disk.c) { + pr_err("Can't attach %pg: already attached\n", dc->bdev); + return -EINVAL; + } + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) { + pr_err("Can't attach %pg: shutting down\n", dc->bdev); + return -EINVAL; + } + + if (dc->sb.block_size < c->cache->sb.block_size) { + /* Will die */ + pr_err("Couldn't attach %pg: block size less than set's block size\n", + dc->bdev); + return -EINVAL; + } + + /* Check whether already attached */ + list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) { + if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) { + pr_err("Tried to attach %pg but duplicate UUID already attached\n", + dc->bdev); + + return -EINVAL; + } + } + + u = uuid_find(c, dc->sb.uuid); + + if (u && + (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || + BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); + u = NULL; + } + + if (!u) { + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + pr_err("Couldn't find uuid for %pg in set\n", dc->bdev); + return -ENOENT; + } + + u = uuid_find_empty(c); + if (!u) { + pr_err("Not caching %pg, no room for UUID\n", dc->bdev); + return -EINVAL; + } + } + + /* + * Deadlocks since we're called via sysfs... + * sysfs_remove_file(&dc->kobj, &sysfs_attach); + */ + + if (bch_is_zero(u->uuid, 16)) { + struct closure cl; + + closure_init_stack(&cl); + + memcpy(u->uuid, dc->sb.uuid, 16); + memcpy(u->label, dc->sb.label, SB_LABEL_SIZE); + u->first_reg = u->last_reg = rtime; + bch_uuid_write(c); + + memcpy(dc->sb.set_uuid, c->set_uuid, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } else { + u->last_reg = rtime; + bch_uuid_write(c); + } + + bcache_device_attach(&dc->disk, c, u - c->uuids); + list_move(&dc->list, &c->cached_devs); + calc_cached_dev_sectors(c); + + /* + * dc->c must be set before dc->count != 0 - paired with the mb in + * cached_dev_get() + */ + smp_wmb(); + refcount_set(&dc->count, 1); + + /* Block writeback thread, but spawn it */ + down_write(&dc->writeback_lock); + if (bch_cached_dev_writeback_start(dc)) { + up_write(&dc->writeback_lock); + pr_err("Couldn't start writeback facilities for %s\n", + dc->disk.disk->disk_name); + return -ENOMEM; + } + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + atomic_set(&dc->has_dirty, 1); + bch_writeback_queue(dc); + } + + bch_sectors_dirty_init(&dc->disk); + + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); + /* + * bch_register_lock is held, bcache_device_stop() is not + * able to be directly called. The kthread and kworker + * created previously in bch_cached_dev_writeback_start() + * have to be stopped manually here. + */ + kthread_stop(dc->writeback_thread); + cancel_writeback_rate_update_dwork(dc); + pr_err("Couldn't run cached device %pg\n", dc->bdev); + return ret; + } + + bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); + + if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(dc->disk.disk, 1); + } + + /* Allow the writeback thread to proceed */ + up_write(&dc->writeback_lock); + + pr_info("Caching %pg as %s on set %pU\n", + dc->bdev, + dc->disk.disk->disk_name, + dc->disk.c->set_uuid); + return 0; +} + +/* when dc->disk.kobj released */ +void bch_cached_dev_release(struct kobject *kobj) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + kfree(dc); + module_put(THIS_MODULE); +} + +static void cached_dev_free(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + kthread_stop(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->status_update_thread)) + kthread_stop(dc->status_update_thread); + + mutex_lock(&bch_register_lock); + + if (atomic_read(&dc->running)) { + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + del_gendisk(dc->disk.disk); + } + bcache_device_free(&dc->disk); + list_del(&dc->list); + + mutex_unlock(&bch_register_lock); + + if (dc->sb_disk) + put_page(virt_to_page(dc->sb_disk)); + + if (!IS_ERR_OR_NULL(dc->bdev)) + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + wake_up(&unregister_wait); + + kobject_put(&dc->disk.kobj); +} + +static void cached_dev_flush(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + struct bcache_device *d = &dc->disk; + + mutex_lock(&bch_register_lock); + bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); + + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + + continue_at(cl, cached_dev_free, system_wq); +} + +static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) +{ + int ret; + struct io *io; + struct request_queue *q = bdev_get_queue(dc->bdev); + + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + sema_init(&dc->sb_write_mutex, 1); + INIT_LIST_HEAD(&dc->io_lru); + spin_lock_init(&dc->io_lock); + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); + + dc->sequential_cutoff = 4 << 20; + + for (io = dc->io; io < dc->io + RECENT_IO; io++) { + list_add(&io->lru, &dc->io_lru); + hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); + } + + dc->disk.stripe_size = q->limits.io_opt >> 9; + + if (dc->disk.stripe_size) + dc->partial_stripes_expensive = + q->limits.raid_partial_stripes_expensive; + + ret = bcache_device_init(&dc->disk, block_size, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, + dc->bdev, &bcache_cached_ops); + if (ret) + return ret; + + blk_queue_io_opt(dc->disk.disk->queue, + max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); + + atomic_set(&dc->io_errors, 0); + dc->io_disable = false; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; + /* default to auto */ + dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); + return 0; +} + +/* Cached device - bcache superblock */ + +static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, + struct block_device *bdev, + struct cached_dev *dc) +{ + const char *err = "cannot allocate memory"; + struct cache_set *c; + int ret = -ENOMEM; + + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); + dc->bdev = bdev; + dc->bdev->bd_holder = dc; + dc->sb_disk = sb_disk; + + if (cached_dev_init(dc, sb->block_size << 9)) + goto err; + + err = "error creating kobject"; + if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) + goto err; + if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) + goto err; + + pr_info("registered backing device %pg\n", dc->bdev); + + list_add(&dc->list, &uncached_devices); + /* attach to a matched cache set if it exists */ + list_for_each_entry(c, &bch_cache_sets, list) + bch_cached_dev_attach(dc, c, NULL); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { + err = "failed to run cached device"; + ret = bch_cached_dev_run(dc); + if (ret) + goto err; + } + + return 0; +err: + pr_notice("error %pg: %s\n", dc->bdev, err); + bcache_device_stop(&dc->disk); + return ret; +} + +/* Flash only volumes */ + +/* When d->kobj released */ +void bch_flash_dev_release(struct kobject *kobj) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + kfree(d); +} + +static void flash_dev_free(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + + mutex_lock(&bch_register_lock); + atomic_long_sub(bcache_dev_sectors_dirty(d), + &d->c->flash_dev_dirty_sectors); + del_gendisk(d->disk); + bcache_device_free(d); + mutex_unlock(&bch_register_lock); + kobject_put(&d->kobj); +} + +static void flash_dev_flush(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + + mutex_lock(&bch_register_lock); + bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); + kobject_del(&d->kobj); + continue_at(cl, flash_dev_free, system_wq); +} + +static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +{ + int err = -ENOMEM; + struct bcache_device *d = kzalloc(sizeof(struct bcache_device), + GFP_KERNEL); + if (!d) + goto err_ret; + + closure_init(&d->cl, NULL); + set_closure_fn(&d->cl, flash_dev_flush, system_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + + if (bcache_device_init(d, block_bytes(c->cache), u->sectors, + NULL, &bcache_flash_ops)) + goto err; + + bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); + bch_flash_dev_request_init(d); + err = add_disk(d->disk); + if (err) + goto err; + + err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"); + if (err) + goto err; + + bcache_device_link(d, c, "volume"); + + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(d->disk, 1); + } + + return 0; +err: + kobject_put(&d->kobj); +err_ret: + return err; +} + +static int flash_devs_run(struct cache_set *c) +{ + int ret = 0; + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids && !ret; + u++) + if (UUID_FLASH_ONLY(u)) + ret = flash_dev_run(c, u); + + return ret; +} + +int bch_flash_dev_create(struct cache_set *c, uint64_t size) +{ + struct uuid_entry *u; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return -EINTR; + + if (!test_bit(CACHE_SET_RUNNING, &c->flags)) + return -EPERM; + + u = uuid_find_empty(c); + if (!u) { + pr_err("Can't create volume, no room for UUID\n"); + return -EINVAL; + } + + get_random_bytes(u->uuid, 16); + memset(u->label, 0, 32); + u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds()); + + SET_UUID_FLASH_ONLY(u, 1); + u->sectors = size >> 9; + + bch_uuid_write(c); + + return flash_dev_run(c, u); +} + +bool bch_cached_dev_error(struct cached_dev *dc) +{ + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return false; + + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); + + pr_err("stop %s: too many IO errors on backing device %pg\n", + dc->disk.disk->disk_name, dc->bdev); + + bcache_device_stop(&dc->disk); + return true; +} + +/* Cache set */ + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + + if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) + pr_info("CACHE_SET_IO_DISABLE already set\n"); + + /* + * XXX: we can be called from atomic context + * acquire_console_sem(); + */ + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error on %pU: %pV, disabling caching\n", + c->set_uuid, &vaf); + + va_end(args); + + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + + bch_cache_set_unregister(c); + return true; +} + +/* When c->kobj released */ +void bch_cache_set_release(struct kobject *kobj) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + kfree(c); + module_put(THIS_MODULE); +} + +static void cache_set_free(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, cl); + struct cache *ca; + + debugfs_remove(c->debug); + + bch_open_buckets_free(c); + bch_btree_cache_free(c); + bch_journal_free(c); + + mutex_lock(&bch_register_lock); + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + + ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; + kobject_put(&ca->kobj); + } + + + if (c->moving_gc_wq) + destroy_workqueue(c->moving_gc_wq); + bioset_exit(&c->bio_split); + mempool_exit(&c->fill_iter); + mempool_exit(&c->bio_meta); + mempool_exit(&c->search); + kfree(c->devices); + + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + pr_info("Cache set %pU unregistered\n", c->set_uuid); + wake_up(&unregister_wait); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void cache_set_flush(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cache *ca = c->cache; + struct btree *b; + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->internal); + kobject_del(&c->kobj); + + if (!IS_ERR_OR_NULL(c->gc_thread)) + kthread_stop(c->gc_thread); + + if (!IS_ERR(c->root)) + list_add(&c->root->list, &c->btree_cache); + + /* + * Avoid flushing cached nodes if cache set is retiring + * due to too many I/O errors detected. + */ + if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } + + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); + + if (c->journal.cur) { + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + } + + closure_return(cl); +} + +/* + * This function is only called when CACHE_SET_IO_DISABLE is set, which means + * cache set is unregistering due to too many I/O errors. In this condition, + * the bcache device might be stopped, it depends on stop_when_cache_set_failed + * value and whether the broken cache has dirty data: + * + * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device + * BCH_CACHED_STOP_AUTO 0 NO + * BCH_CACHED_STOP_AUTO 1 YES + * BCH_CACHED_DEV_STOP_ALWAYS 0 YES + * BCH_CACHED_DEV_STOP_ALWAYS 1 YES + * + * The expected behavior is, if stop_when_cache_set_failed is configured to + * "auto" via sysfs interface, the bcache device will not be stopped if the + * backing device is clean on the broken cache device. + */ +static void conditional_stop_bcache_device(struct cache_set *c, + struct bcache_device *d, + struct cached_dev *dc) +{ + if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { + pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n", + d->disk->disk_name, c->set_uuid); + bcache_device_stop(d); + } else if (atomic_read(&dc->has_dirty)) { + /* + * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO + * and dc->has_dirty == 1 + */ + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n", + d->disk->disk_name); + /* + * There might be a small time gap that cache set is + * released but bcache device is not. Inside this time + * gap, regular I/O requests will directly go into + * backing device as no cache set attached to. This + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due + * to a broken cache device, dc->io_disable should be + * explicitly set to true. + */ + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); + bcache_device_stop(d); + } else { + /* + * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO + * and dc->has_dirty == 0 + */ + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n", + d->disk->disk_name); + } +} + +static void __cache_set_unregister(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc; + struct bcache_device *d; + size_t i; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->devices_max_used; i++) { + d = c->devices[i]; + if (!d) + continue; + + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(d, struct cached_dev, disk); + bch_cached_dev_detach(dc); + if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + conditional_stop_bcache_device(c, d, dc); + } else { + bcache_device_stop(d); + } + } + + mutex_unlock(&bch_register_lock); + + continue_at(cl, cache_set_flush, system_wq); +} + +void bch_cache_set_stop(struct cache_set *c) +{ + if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + /* closure_fn set to __cache_set_unregister() */ + closure_queue(&c->caching); +} + +void bch_cache_set_unregister(struct cache_set *c) +{ + set_bit(CACHE_SET_UNREGISTERING, &c->flags); + bch_cache_set_stop(c); +} + +#define alloc_meta_bucket_pages(gfp, sb) \ + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) +{ + int iter_size; + struct cache *ca = container_of(sb, struct cache, sb); + struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + + if (!c) + return NULL; + + __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); + set_closure_fn(&c->cl, cache_set_free, system_wq); + + closure_init(&c->caching, &c->cl); + set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + + /* Maybe create continue_at_noreturn() and use it here? */ + closure_set_stopped(&c->cl); + closure_put(&c->cl); + + kobject_init(&c->kobj, &bch_cache_set_ktype); + kobject_init(&c->internal, &bch_cache_set_internal_ktype); + + bch_cache_accounting_init(&c->accounting, &c->cl); + + memcpy(c->set_uuid, sb->set_uuid, 16); + + c->cache = ca; + c->cache->set = c; + c->bucket_bits = ilog2(sb->bucket_size); + c->block_bits = ilog2(sb->block_size); + c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry); + c->devices_max_used = 0; + atomic_set(&c->attached_dev_nr, 0); + c->btree_pages = meta_bucket_pages(sb); + if (c->btree_pages > BTREE_MAX_PAGES) + c->btree_pages = max_t(int, c->btree_pages / 4, + BTREE_MAX_PAGES); + + sema_init(&c->sb_write_mutex, 1); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); + init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); + sema_init(&c->uuid_write_mutex, 1); + + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + + bch_moving_init_cache_set(c); + + INIT_LIST_HEAD(&c->list); + INIT_LIST_HEAD(&c->cached_devs); + INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); + INIT_LIST_HEAD(&c->data_buckets); + + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * + sizeof(struct btree_iter_set); + + c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); + if (!c->devices) + goto err; + + if (mempool_init_slab_pool(&c->search, 32, bch_search_cache)) + goto err; + + if (mempool_init_kmalloc_pool(&c->bio_meta, 2, + sizeof(struct bbio) + + sizeof(struct bio_vec) * meta_bucket_pages(sb))) + goto err; + + if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) + goto err; + + if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_RESCUER)) + goto err; + + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); + if (!c->uuids) + goto err; + + c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + if (!c->moving_gc_wq) + goto err; + + if (bch_journal_alloc(c)) + goto err; + + if (bch_btree_cache_alloc(c)) + goto err; + + if (bch_open_buckets_alloc(c)) + goto err; + + if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) + goto err; + + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; + WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + return c; +err: + bch_cache_set_unregister(c); + return NULL; +} + +static int run_cache_set(struct cache_set *c) +{ + const char *err = "cannot allocate memory"; + struct cached_dev *dc, *t; + struct cache *ca = c->cache; + struct closure cl; + LIST_HEAD(journal); + struct journal_replay *l; + + closure_init_stack(&cl); + + c->nbuckets = ca->sb.nbuckets; + set_gc_sectors(c); + + if (CACHE_SYNC(&c->cache->sb)) { + struct bkey *k; + struct jset *j; + + err = "cannot allocate memory for journal"; + if (bch_journal_read(c, &journal)) + goto err; + + pr_debug("btree_journal_read() done\n"); + + err = "no journal entries found"; + if (list_empty(&journal)) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + err = "IO error reading priorities"; + if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) + goto err; + + /* + * If prio_read() fails it'll call cache_set_error and we'll + * tear everything down right away, but if we perhaps checked + * sooner we could avoid journal replay. + */ + + k = &j->btree_root; + + err = "bad btree root"; + if (__bch_btree_ptr_invalid(c, k)) + goto err; + + err = "error reading btree root"; + c->root = bch_btree_node_get(c, NULL, k, + j->btree_level, + true, NULL); + if (IS_ERR(c->root)) + goto err; + + list_del_init(&c->root->list); + rw_unlock(true, c->root); + + err = uuid_read(c, j, &cl); + if (err) + goto err; + + err = "error in recovery"; + if (bch_btree_check(c)) + goto err; + + bch_journal_mark(c, &journal); + bch_initial_gc_finish(c); + pr_debug("btree_check() done\n"); + + /* + * bcache_journal_next() can't happen sooner, or + * btree_gc_finish() will give spurious errors about last_gc > + * gc_gen - this is a hack but oh well. + */ + bch_journal_next(&c->journal); + + err = "error starting allocator thread"; + if (bch_cache_allocator_start(ca)) + goto err; + + /* + * First place it's safe to allocate: btree_check() and + * btree_gc_finish() have to run before we have buckets to + * allocate, and bch_bucket_alloc_set() might cause a journal + * entry to be written so bcache_journal_next() has to be called + * first. + * + * If the uuids were in the old format we have to rewrite them + * before the next journal entry is written: + */ + if (j->version < BCACHE_JSET_VERSION_UUID) + __uuid_write(c); + + err = "bcache: replay journal failed"; + if (bch_journal_replay(c, &journal)) + goto err; + } else { + unsigned int j; + + pr_notice("invalidating existing data\n"); + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); + + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; + + bch_initial_gc_finish(c); + + err = "error starting allocator thread"; + if (bch_cache_allocator_start(ca)) + goto err; + + mutex_lock(&c->bucket_lock); + bch_prio_write(ca, true); + mutex_unlock(&c->bucket_lock); + + err = "cannot allocate new UUID bucket"; + if (__uuid_write(c)) + goto err; + + err = "cannot allocate new btree root"; + c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); + if (IS_ERR(c->root)) + goto err; + + mutex_lock(&c->root->write_lock); + bkey_copy_key(&c->root->key, &MAX_KEY); + bch_btree_node_write(c->root, &cl); + mutex_unlock(&c->root->write_lock); + + bch_btree_set_root(c->root); + rw_unlock(true, c->root); + + /* + * We don't want to write the first journal entry until + * everything is set up - fortunately journal entries won't be + * written until the SET_CACHE_SYNC() here: + */ + SET_CACHE_SYNC(&c->cache->sb, true); + + bch_journal_next(&c->journal); + bch_journal_meta(c, &cl); + } + + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + + closure_sync(&cl); + c->cache->sb.last_mount = (u32)ktime_get_real_seconds(); + bcache_write_super(c); + + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) + pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n"); + + list_for_each_entry_safe(dc, t, &uncached_devices, list) + bch_cached_dev_attach(dc, c, NULL); + + flash_devs_run(c); + + bch_journal_space_reserve(&c->journal); + set_bit(CACHE_SET_RUNNING, &c->flags); + return 0; +err: + while (!list_empty(&journal)) { + l = list_first_entry(&journal, struct journal_replay, list); + list_del(&l->list); + kfree(l); + } + + closure_sync(&cl); + + bch_cache_set_error(c, "%s", err); + + return -EIO; +} + +static const char *register_cache_set(struct cache *ca) +{ + char buf[12]; + const char *err = "cannot allocate memory"; + struct cache_set *c; + + list_for_each_entry(c, &bch_cache_sets, list) + if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache) + return "duplicate cache set member"; + + goto found; + } + + c = bch_cache_set_alloc(&ca->sb); + if (!c) + return err; + + err = "error creating kobject"; + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) || + kobject_add(&c->internal, &c->kobj, "internal")) + goto err; + + if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) + goto err; + + bch_debug_init_cache_set(c); + + list_add(&c->list, &bch_cache_sets); +found: + sprintf(buf, "cache%i", ca->sb.nr_this_dev); + if (sysfs_create_link(&ca->kobj, &c->kobj, "set") || + sysfs_create_link(&c->kobj, &ca->kobj, buf)) + goto err; + + kobject_get(&ca->kobj); + ca->set = c; + ca->set->cache = ca; + + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; + + return NULL; +err: + bch_cache_set_unregister(c); + return err; +} + +/* Cache device */ + +/* When ca->kobj released */ +void bch_cache_release(struct kobject *kobj) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + unsigned int i; + + if (ca->set) { + BUG_ON(ca->set->cache != ca); + ca->set->cache = NULL; + } + + free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); + kfree(ca->prio_buckets); + vfree(ca->buckets); + + free_heap(&ca->heap); + free_fifo(&ca->free_inc); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); + + if (ca->sb_disk) + put_page(virt_to_page(ca->sb_disk)); + + if (!IS_ERR_OR_NULL(ca->bdev)) + blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + kfree(ca); + module_put(THIS_MODULE); +} + +static int cache_alloc(struct cache *ca) +{ + size_t free; + size_t btree_buckets; + struct bucket *b; + int ret = -ENOMEM; + const char *err = NULL; + + __module_get(THIS_MODULE); + kobject_init(&ca->kobj, &bch_cache_ktype); + + bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + + /* + * when ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split, + * so bucket of RESERVE_BTREE type is needed, + * the worst situation is all journal buckets are valid journal, + * and all the keys need to replay, + * so the number of RESERVE_BTREE type buckets should be as much + * as journal buckets + */ + btree_buckets = ca->sb.njournal_buckets ?: 8; + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; + if (!free) { + ret = -EPERM; + err = "ca->sb.nbuckets is too small"; + goto err_free; + } + + if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, + GFP_KERNEL)) { + err = "ca->free[RESERVE_BTREE] alloc failed"; + goto err_btree_alloc; + } + + if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), + GFP_KERNEL)) { + err = "ca->free[RESERVE_PRIO] alloc failed"; + goto err_prio_alloc; + } + + if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) { + err = "ca->free[RESERVE_MOVINGGC] alloc failed"; + goto err_movinggc_alloc; + } + + if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) { + err = "ca->free[RESERVE_NONE] alloc failed"; + goto err_none_alloc; + } + + if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) { + err = "ca->free_inc alloc failed"; + goto err_free_inc_alloc; + } + + if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) { + err = "ca->heap alloc failed"; + goto err_heap_alloc; + } + + ca->buckets = vzalloc(array_size(sizeof(struct bucket), + ca->sb.nbuckets)); + if (!ca->buckets) { + err = "ca->buckets alloc failed"; + goto err_buckets_alloc; + } + + ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t), + prio_buckets(ca), 2), + GFP_KERNEL); + if (!ca->prio_buckets) { + err = "ca->prio_buckets alloc failed"; + goto err_prio_buckets_alloc; + } + + ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb); + if (!ca->disk_buckets) { + err = "ca->disk_buckets alloc failed"; + goto err_disk_buckets_alloc; + } + + ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); + + for_each_bucket(b, ca) + atomic_set(&b->pin, 0); + return 0; + +err_disk_buckets_alloc: + kfree(ca->prio_buckets); +err_prio_buckets_alloc: + vfree(ca->buckets); +err_buckets_alloc: + free_heap(&ca->heap); +err_heap_alloc: + free_fifo(&ca->free_inc); +err_free_inc_alloc: + free_fifo(&ca->free[RESERVE_NONE]); +err_none_alloc: + free_fifo(&ca->free[RESERVE_MOVINGGC]); +err_movinggc_alloc: + free_fifo(&ca->free[RESERVE_PRIO]); +err_prio_alloc: + free_fifo(&ca->free[RESERVE_BTREE]); +err_btree_alloc: +err_free: + module_put(THIS_MODULE); + if (err) + pr_notice("error %pg: %s\n", ca->bdev, err); + return ret; +} + +static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, + struct block_device *bdev, struct cache *ca) +{ + const char *err = NULL; /* must be set for any error case */ + int ret = 0; + + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); + ca->bdev = bdev; + ca->bdev->bd_holder = ca; + ca->sb_disk = sb_disk; + + if (bdev_max_discard_sectors((bdev))) + ca->discard = CACHE_DISCARD(&ca->sb); + + ret = cache_alloc(ca); + if (ret != 0) { + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call blkdev_put() to bdev in bch_cache_release(). So we + * explicitly call blkdev_put() here. + */ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + if (ret == -ENOMEM) + err = "cache_alloc(): -ENOMEM"; + else if (ret == -EPERM) + err = "cache_alloc(): cache device is too small"; + else + err = "cache_alloc(): unknown error"; + goto err; + } + + if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { + err = "error calling kobject_add"; + ret = -ENOMEM; + goto out; + } + + mutex_lock(&bch_register_lock); + err = register_cache_set(ca); + mutex_unlock(&bch_register_lock); + + if (err) { + ret = -ENODEV; + goto out; + } + + pr_info("registered cache device %pg\n", ca->bdev); + +out: + kobject_put(&ca->kobj); + +err: + if (err) + pr_notice("error %pg: %s\n", ca->bdev, err); + + return ret; +} + +/* Global interfaces/init */ + +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); + +kobj_attribute_write(register, register_bcache); +kobj_attribute_write(register_quiet, register_bcache); +kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); + +static bool bch_is_open_backing(dev_t dev) +{ + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev->bd_dev == dev) + return true; + list_for_each_entry_safe(dc, t, &uncached_devices, list) + if (dc->bdev->bd_dev == dev) + return true; + return false; +} + +static bool bch_is_open_cache(dev_t dev) +{ + struct cache_set *c, *tc; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + struct cache *ca = c->cache; + + if (ca->bdev->bd_dev == dev) + return true; + } + + return false; +} + +static bool bch_is_open(dev_t dev) +{ + return bch_is_open_cache(dev) || bch_is_open_backing(dev); +} + +struct async_reg_args { + struct delayed_work reg_work; + char *path; + struct cache_sb *sb; + struct cache_sb_disk *sb_disk; + struct block_device *bdev; +}; + +static void register_bdev_worker(struct work_struct *work) +{ + int fail = false; + struct async_reg_args *args = + container_of(work, struct async_reg_args, reg_work.work); + struct cached_dev *dc; + + dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) { + fail = true; + put_page(virt_to_page(args->sb_disk)); + blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + goto out; + } + + mutex_lock(&bch_register_lock); + if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) + fail = true; + mutex_unlock(&bch_register_lock); + +out: + if (fail) + pr_info("error %s: fail to register backing device\n", + args->path); + kfree(args->sb); + kfree(args->path); + kfree(args); + module_put(THIS_MODULE); +} + +static void register_cache_worker(struct work_struct *work) +{ + int fail = false; + struct async_reg_args *args = + container_of(work, struct async_reg_args, reg_work.work); + struct cache *ca; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) { + fail = true; + put_page(virt_to_page(args->sb_disk)); + blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + goto out; + } + + /* blkdev_put() will be called in bch_cache_release() */ + if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) + fail = true; + +out: + if (fail) + pr_info("error %s: fail to register cache device\n", + args->path); + kfree(args->sb); + kfree(args->path); + kfree(args); + module_put(THIS_MODULE); +} + +static void register_device_async(struct async_reg_args *args) +{ + if (SB_IS_BDEV(args->sb)) + INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker); + else + INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); + + /* 10 jiffies is enough for a delay */ + queue_delayed_work(system_wq, &args->reg_work, 10); +} + +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) +{ + const char *err; + char *path = NULL; + struct cache_sb *sb; + struct cache_sb_disk *sb_disk; + struct block_device *bdev; + ssize_t ret; + bool async_registration = false; + +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION + async_registration = true; +#endif + + ret = -EBUSY; + err = "failed to reference bcache module"; + if (!try_module_get(THIS_MODULE)) + goto out; + + /* For latest state of bcache_is_reboot */ + smp_mb(); + err = "bcache is in reboot"; + if (bcache_is_reboot) + goto out_module_put; + + ret = -ENOMEM; + err = "cannot allocate memory"; + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto out_module_put; + + sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) + goto out_free_path; + + ret = -EINVAL; + err = "failed to open device"; + bdev = blkdev_get_by_path(strim(path), + FMODE_READ|FMODE_WRITE|FMODE_EXCL, + sb); + if (IS_ERR(bdev)) { + if (bdev == ERR_PTR(-EBUSY)) { + dev_t dev; + + mutex_lock(&bch_register_lock); + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) + err = "device already registered"; + else + err = "device busy"; + mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto done; + } + goto out_free_sb; + } + + err = "failed to set blocksize"; + if (set_blocksize(bdev, 4096)) + goto out_blkdev_put; + + err = read_super(sb, bdev, &sb_disk); + if (err) + goto out_blkdev_put; + + err = "failed to register device"; + + if (async_registration) { + /* register in asynchronous way */ + struct async_reg_args *args = + kzalloc(sizeof(struct async_reg_args), GFP_KERNEL); + + if (!args) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + + args->path = path; + args->sb = sb; + args->sb_disk = sb_disk; + args->bdev = bdev; + register_device_async(args); + /* No wait and returns to user space */ + goto async_done; + } + + if (SB_IS_BDEV(sb)) { + struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + + if (!dc) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + + mutex_lock(&bch_register_lock); + ret = register_bdev(sb, sb_disk, bdev, dc); + mutex_unlock(&bch_register_lock); + /* blkdev_put() will be called in cached_dev_free() */ + if (ret < 0) + goto out_free_sb; + } else { + struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + + /* blkdev_put() will be called in bch_cache_release() */ + ret = register_cache(sb, sb_disk, bdev, ca); + if (ret) + goto out_free_sb; + } + +done: + kfree(sb); + kfree(path); + module_put(THIS_MODULE); +async_done: + return size; + +out_put_sb_page: + put_page(virt_to_page(sb_disk)); +out_blkdev_put: + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: + kfree(sb); +out_free_path: + kfree(path); + path = NULL; +out_module_put: + module_put(THIS_MODULE); +out: + pr_info("error %s: %s\n", path?path:"", err); + return ret; +} + + +struct pdev { + struct list_head list; + struct cached_dev *dc; +}; + +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, + size_t size) +{ + LIST_HEAD(pending_devs); + ssize_t ret = size; + struct cached_dev *dc, *tdc; + struct pdev *pdev, *tpdev; + struct cache_set *c, *tc; + + mutex_lock(&bch_register_lock); + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { + pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); + if (!pdev) + break; + pdev->dc = dc; + list_add(&pdev->list, &pending_devs); + } + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + char *set_uuid = c->set_uuid; + + if (!memcmp(pdev_set_uuid, set_uuid, 16)) { + list_del(&pdev->list); + kfree(pdev); + break; + } + } + } + mutex_unlock(&bch_register_lock); + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + pr_info("delete pdev %p\n", pdev); + list_del(&pdev->list); + bcache_device_stop(&pdev->dc->disk); + kfree(pdev); + } + + return ret; +} + +static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + if (bcache_is_reboot) + return NOTIFY_DONE; + + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { + DEFINE_WAIT(wait); + unsigned long start = jiffies; + bool stopped = false; + + struct cache_set *c, *tc; + struct cached_dev *dc, *tdc; + + mutex_lock(&bch_register_lock); + + if (bcache_is_reboot) + goto out; + + /* New registration is rejected since now */ + bcache_is_reboot = true; + /* + * Make registering caller (if there is) on other CPU + * core know bcache_is_reboot set to true earlier + */ + smp_mb(); + + if (list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + + mutex_unlock(&bch_register_lock); + + pr_info("Stopping all devices:\n"); + + /* + * The reason bch_register_lock is not held to call + * bch_cache_set_stop() and bcache_device_stop() is to + * avoid potential deadlock during reboot, because cache + * set or bcache device stopping process will acquire + * bch_register_lock too. + * + * We are safe here because bcache_is_reboot sets to + * true already, register_bcache() will reject new + * registration now. bcache_is_reboot also makes sure + * bcache_reboot() won't be re-entered on by other thread, + * so there is no race in following list iteration by + * list_for_each_entry_safe(). + */ + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + + + /* + * Give an early chance for other kthreads and + * kworkers to stop themselves + */ + schedule(); + + /* What's a condition variable? */ + while (1) { + long timeout = start + 10 * HZ - jiffies; + + mutex_lock(&bch_register_lock); + stopped = list_empty(&bch_cache_sets) && + list_empty(&uncached_devices); + + if (timeout < 0 || stopped) + break; + + prepare_to_wait(&unregister_wait, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&bch_register_lock); + schedule_timeout(timeout); + } + + finish_wait(&unregister_wait, &wait); + + if (stopped) + pr_info("All devices stopped\n"); + else + pr_notice("Timeout waiting for devices to be closed\n"); +out: + mutex_unlock(&bch_register_lock); + } + + return NOTIFY_DONE; +} + +static struct notifier_block reboot = { + .notifier_call = bcache_reboot, + .priority = INT_MAX, /* before any real devices */ +}; + +static void bcache_exit(void) +{ + bch_debug_exit(); + bch_request_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) + destroy_workqueue(bcache_wq); + if (bch_journal_wq) + destroy_workqueue(bch_journal_wq); + if (bch_flush_wq) + destroy_workqueue(bch_flush_wq); + bch_btree_exit(); + + if (bcache_major) + unregister_blkdev(bcache_major, "bcache"); + unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); +} + +/* Check and fixup module parameters */ +static void check_module_parameters(void) +{ + if (bch_cutoff_writeback_sync == 0) + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; + else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n", + bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; + } + + if (bch_cutoff_writeback == 0) + bch_cutoff_writeback = CUTOFF_WRITEBACK; + else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { + pr_warn("set bch_cutoff_writeback (%u) to max value %u\n", + bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); + bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; + } + + if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { + pr_warn("set bch_cutoff_writeback (%u) to %u\n", + bch_cutoff_writeback, bch_cutoff_writeback_sync); + bch_cutoff_writeback = bch_cutoff_writeback_sync; + } +} + +static int __init bcache_init(void) +{ + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, + &ksysfs_pendings_cleanup.attr, + NULL + }; + + check_module_parameters(); + + mutex_init(&bch_register_lock); + init_waitqueue_head(&unregister_wait); + register_reboot_notifier(&reboot); + + bcache_major = register_blkdev(0, "bcache"); + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); + return bcache_major; + } + + if (bch_btree_init()) + goto err; + + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; + + /* + * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: + * + * 1. It used `system_wq` before which also does no memory reclaim. + * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and + * reduced throughput can be observed. + * + * We still want to user our own queue to not congest the `system_wq`. + */ + bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + if (!bch_flush_wq) + goto err; + + bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + if (!bch_journal_wq) + goto err; + + bcache_kobj = kobject_create_and_add("bcache", fs_kobj); + if (!bcache_kobj) + goto err; + + if (bch_request_init() || + sysfs_create_files(bcache_kobj, files)) + goto err; + + bch_debug_init(); + closure_debug_init(); + + bcache_is_reboot = false; + + return 0; +err: + bcache_exit(); + return -ENOMEM; +} + +/* + * Module hooks + */ +module_exit(bcache_exit); +module_init(bcache_init); + +module_param(bch_cutoff_writeback, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback"); + +module_param(bch_cutoff_writeback_sync, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback"); + +MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); +MODULE_AUTHOR("Kent Overstreet "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 000000000..025fe6479 --- /dev/null +++ b/drivers/md/bcache/sysfs.c @@ -0,0 +1,1204 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "sysfs.h" +#include "btree.h" +#include "request.h" +#include "writeback.h" +#include "features.h" + +#include +#include +#include + +extern bool bcache_is_reboot; + +/* Default is 0 ("writethrough") */ +static const char * const bch_cache_modes[] = { + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +static const char * const bch_reada_cache_policies[] = { + "all", + "meta-only", + NULL +}; + +/* Default is 0 ("auto") */ +static const char * const bch_stop_on_failure_modes[] = { + "auto", + "always", + NULL +}; + +static const char * const cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + +write_attribute(attach); +write_attribute(detach); +write_attribute(unregister); +write_attribute(stop); +write_attribute(clear_stats); +write_attribute(trigger_gc); +write_attribute(prune_cache); +write_attribute(flash_vol_create); + +read_attribute(bucket_size); +read_attribute(block_size); +read_attribute(nbuckets); +read_attribute(tree_depth); +read_attribute(root_usage_percent); +read_attribute(priority_stats); +read_attribute(btree_cache_size); +read_attribute(btree_cache_max_chain); +read_attribute(cache_available_percent); +read_attribute(written); +read_attribute(btree_written); +read_attribute(metadata_written); +read_attribute(active_journal_entries); +read_attribute(backing_dev_name); +read_attribute(backing_dev_uuid); + +sysfs_time_stats_attribute(btree_gc, sec, ms); +sysfs_time_stats_attribute(btree_split, sec, us); +sysfs_time_stats_attribute(btree_sort, ms, us); +sysfs_time_stats_attribute(btree_read, ms, us); + +read_attribute(btree_nodes); +read_attribute(btree_used_percent); +read_attribute(average_key_size); +read_attribute(dirty_data); +read_attribute(bset_tree_stats); +read_attribute(feature_compat); +read_attribute(feature_ro_compat); +read_attribute(feature_incompat); + +read_attribute(state); +read_attribute(cache_read_races); +read_attribute(reclaim); +read_attribute(reclaimed_journal_buckets); +read_attribute(flush_write); +read_attribute(writeback_keys_done); +read_attribute(writeback_keys_failed); +read_attribute(io_errors); +read_attribute(congested); +read_attribute(cutoff_writeback); +read_attribute(cutoff_writeback_sync); +rw_attribute(congested_read_threshold_us); +rw_attribute(congested_write_threshold_us); + +rw_attribute(sequential_cutoff); +rw_attribute(data_csum); +rw_attribute(cache_mode); +rw_attribute(readahead_cache_policy); +rw_attribute(stop_when_cache_set_failed); +rw_attribute(writeback_metadata); +rw_attribute(writeback_running); +rw_attribute(writeback_percent); +rw_attribute(writeback_delay); +rw_attribute(writeback_rate); +rw_attribute(writeback_consider_fragment); + +rw_attribute(writeback_rate_update_seconds); +rw_attribute(writeback_rate_i_term_inverse); +rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_fp_term_low); +rw_attribute(writeback_rate_fp_term_mid); +rw_attribute(writeback_rate_fp_term_high); +rw_attribute(writeback_rate_minimum); +read_attribute(writeback_rate_debug); + +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + +rw_attribute(synchronous); +rw_attribute(journal_delay_ms); +rw_attribute(io_disable); +rw_attribute(discard); +rw_attribute(running); +rw_attribute(label); +rw_attribute(errors); +rw_attribute(io_error_limit); +rw_attribute(io_error_halflife); +rw_attribute(verify); +rw_attribute(bypass_torture_test); +rw_attribute(key_merging_disabled); +rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); +rw_attribute(cache_replacement_policy); +rw_attribute(btree_shrinker_disabled); +rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); +rw_attribute(gc_after_writeback); +rw_attribute(size); + +static ssize_t bch_snprint_string_list(char *buf, + size_t size, + const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + for (i = 0; list[i]; i++) + out += scnprintf(out, buf + size - out, + i == selected ? "[%s] " : "%s ", list[i]); + + out[-1] = '\n'; + return out - buf; +} + +SHOW(__bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + char const *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + int wb = dc->writeback_running; + +#define var(stat) (dc->stat) + + if (attr == &sysfs_cache_mode) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_cache_modes, + BDEV_CACHE_MODE(&dc->sb)); + + if (attr == &sysfs_readahead_cache_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_reada_cache_policies, + dc->cache_readahead_policy); + + if (attr == &sysfs_stop_when_cache_set_failed) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_stop_on_failure_modes, + dc->stop_when_cache_set_failed); + + + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); + var_printf(writeback_metadata, "%i"); + var_printf(writeback_running, "%i"); + var_printf(writeback_consider_fragment, "%i"); + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_hprint(writeback_rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); + sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors)); + sysfs_printf(io_error_limit, "%i", dc->error_limit); + sysfs_printf(io_disable, "%i", dc->io_disable); + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_i_term_inverse); + var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_fp_term_low); + var_print(writeback_rate_fp_term_mid); + var_print(writeback_rate_fp_term_high); + var_print(writeback_rate_minimum); + + if (attr == &sysfs_writeback_rate_debug) { + char rate[20]; + char dirty[20]; + char target[20]; + char proportional[20]; + char integral[20]; + char change[20]; + s64 next_io; + + /* + * Except for dirty and target, other values should + * be 0 if writeback is not running. + */ + bch_hprint(rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 + : 0); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional, + wb ? dc->writeback_rate_proportional << 9 : 0); + bch_hprint(integral, + wb ? dc->writeback_rate_integral_scaled << 9 : 0); + bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0); + next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(), + NSEC_PER_MSEC) : 0; + + return sprintf(buf, + "rate:\t\t%s/sec\n" + "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" + "integral:\t%s\n" + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + integral, change, next_io); + } + + sysfs_hprint(dirty_data, + bcache_dev_sectors_dirty(&dc->disk) << 9); + + sysfs_hprint(stripe_size, ((uint64_t)dc->disk.stripe_size) << 9); + var_printf(partial_stripes_expensive, "%u"); + + var_hprint(sequential_cutoff); + + sysfs_print(running, atomic_read(&dc->running)); + sysfs_print(state, states[BDEV_STATE(&dc->sb)]); + + if (attr == &sysfs_label) { + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + + if (attr == &sysfs_backing_dev_name) { + snprintf(buf, BDEVNAME_SIZE + 1, "%pg", dc->bdev); + strcat(buf, "\n"); + return strlen(buf); + } + + if (attr == &sysfs_backing_dev_uuid) { + /* convert binary uuid into 36-byte string plus '\0' */ + snprintf(buf, 36+1, "%pU", dc->sb.uuid); + strcat(buf, "\n"); + return strlen(buf); + } + +#undef var + return 0; +} +SHOW_LOCKED(bch_cached_dev) + +STORE(__cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + ssize_t v; + struct cache_set *c; + struct kobj_uevent_env *env; + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + +#define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) +#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) + + sysfs_strtoul(data_csum, dc->disk.data_csum); + d_strtoul(verify); + sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); + sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); + sysfs_strtoul_bool(writeback_running, dc->writeback_running); + sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment); + sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); + + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, + 0, bch_cutoff_writeback); + + if (attr == &sysfs_writeback_rate) { + ssize_t ret; + long int v = atomic_long_read(&dc->writeback_rate.rate); + + ret = strtoul_safe_clamp(buf, v, 1, INT_MAX); + + if (!ret) { + atomic_long_set(&dc->writeback_rate.rate, v); + ret = size; + } + + return ret; + } + + sysfs_strtoul_clamp(writeback_rate_update_seconds, + dc->writeback_rate_update_seconds, + 1, WRITEBACK_RATE_UPDATE_SECS_MAX); + sysfs_strtoul_clamp(writeback_rate_i_term_inverse, + dc->writeback_rate_i_term_inverse, + 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_p_term_inverse, + dc->writeback_rate_p_term_inverse, + 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_fp_term_low, + dc->writeback_rate_fp_term_low, + 1, dc->writeback_rate_fp_term_mid - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_low + 1, + dc->writeback_rate_fp_term_high - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_mid + 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_minimum, + dc->writeback_rate_minimum, + 1, UINT_MAX); + + sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); + + if (attr == &sysfs_io_disable) { + int v = strtoul_or_return(buf); + + dc->io_disable = v ? 1 : 0; + } + + sysfs_strtoul_clamp(sequential_cutoff, + dc->sequential_cutoff, + 0, UINT_MAX); + + if (attr == &sysfs_clear_stats) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && + strtoul_or_return(buf)) { + v = bch_cached_dev_run(dc); + if (v) + return v; + } + + if (attr == &sysfs_cache_mode) { + v = __sysfs_match_string(bch_cache_modes, -1, buf); + if (v < 0) + return v; + + if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) { + SET_BDEV_CACHE_MODE(&dc->sb, v); + bch_write_bdev_super(dc, NULL); + } + } + + if (attr == &sysfs_readahead_cache_policy) { + v = __sysfs_match_string(bch_reada_cache_policies, -1, buf); + if (v < 0) + return v; + + if ((unsigned int) v != dc->cache_readahead_policy) + dc->cache_readahead_policy = v; + } + + if (attr == &sysfs_stop_when_cache_set_failed) { + v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); + if (v < 0) + return v; + + dc->stop_when_cache_set_failed = v; + } + + if (attr == &sysfs_label) { + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; + bch_write_bdev_super(dc, NULL); + if (dc->disk.c) { + memcpy(dc->disk.c->uuids[dc->disk.id].label, + buf, SB_LABEL_SIZE); + bch_uuid_write(dc->disk.c); + } + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + add_uevent_var(env, "DRIVER=bcache"); + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid); + add_uevent_var(env, "CACHED_LABEL=%s", buf); + kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj, + KOBJ_CHANGE, + env->envp); + kfree(env); + } + + if (attr == &sysfs_attach) { + uint8_t set_uuid[16]; + + if (bch_parse_uuid(buf, set_uuid) < 16) + return -EINVAL; + + v = -ENOENT; + list_for_each_entry(c, &bch_cache_sets, list) { + v = bch_cached_dev_attach(dc, c, set_uuid); + if (!v) + return size; + } + if (v == -ENOENT) + pr_err("Can't attach %s: cache set not found\n", buf); + return v; + } + + if (attr == &sysfs_detach && dc->disk.c) + bch_cached_dev_detach(dc); + + if (attr == &sysfs_stop) + bcache_device_stop(&dc->disk); + + return size; +} + +STORE(bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + + if (attr == &sysfs_writeback_running) { + /* dc->writeback_running changed in __cached_dev_store() */ + if (IS_ERR_OR_NULL(dc->writeback_thread)) { + /* + * reject setting it to 1 via sysfs if writeback + * kthread is not created yet. + */ + if (dc->writeback_running) { + dc->writeback_running = false; + pr_err("%s: failed to run non-existent writeback thread\n", + dc->disk.disk->disk_name); + } + } else + /* + * writeback kthread will check if dc->writeback_running + * is true or false. + */ + bch_writeback_queue(dc); + } + + /* + * Only set BCACHE_DEV_WB_RUNNING when cached device attached to + * a cache set, otherwise it doesn't make sense. + */ + if (attr == &sysfs_writeback_percent) + if ((dc->disk.c != NULL) && + (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))) + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); + return size; +} + +static struct attribute *bch_cached_dev_attrs[] = { + &sysfs_attach, + &sysfs_detach, + &sysfs_stop, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_cache_mode, + &sysfs_readahead_cache_policy, + &sysfs_stop_when_cache_set_failed, + &sysfs_writeback_metadata, + &sysfs_writeback_running, + &sysfs_writeback_delay, + &sysfs_writeback_percent, + &sysfs_writeback_rate, + &sysfs_writeback_consider_fragment, + &sysfs_writeback_rate_update_seconds, + &sysfs_writeback_rate_i_term_inverse, + &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_fp_term_low, + &sysfs_writeback_rate_fp_term_mid, + &sysfs_writeback_rate_fp_term_high, + &sysfs_writeback_rate_minimum, + &sysfs_writeback_rate_debug, + &sysfs_io_errors, + &sysfs_io_error_limit, + &sysfs_io_disable, + &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, + &sysfs_sequential_cutoff, + &sysfs_clear_stats, + &sysfs_running, + &sysfs_state, + &sysfs_label, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_bypass_torture_test, +#endif + &sysfs_backing_dev_name, + &sysfs_backing_dev_uuid, + NULL +}; +ATTRIBUTE_GROUPS(bch_cached_dev); +KTYPE(bch_cached_dev); + +SHOW(bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + sysfs_printf(data_csum, "%i", d->data_csum); + sysfs_hprint(size, u->sectors << 9); + + if (attr == &sysfs_label) { + memcpy(buf, u->label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + + return 0; +} + +STORE(__bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { + uint64_t v; + + strtoi_h_or_return(buf, v); + + u->sectors = v >> 9; + bch_uuid_write(d->c); + set_capacity(d->disk, u->sectors); + } + + if (attr == &sysfs_label) { + memcpy(u->label, buf, SB_LABEL_SIZE); + bch_uuid_write(d->c); + } + + if (attr == &sysfs_unregister) { + set_bit(BCACHE_DEV_DETACHING, &d->flags); + bcache_device_stop(d); + } + + return size; +} +STORE_LOCKED(bch_flash_dev) + +static struct attribute *bch_flash_dev_attrs[] = { + &sysfs_unregister, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_label, + &sysfs_size, + NULL +}; +ATTRIBUTE_GROUPS(bch_flash_dev); +KTYPE(bch_flash_dev); + +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) +{ + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); + + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); + + return MAP_CONTINUE; +} + +static int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + +static unsigned int bch_root_usage(struct cache_set *c) +{ + unsigned int bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; + + goto lock_root; + + do { + rw_unlock(false, b); +lock_root: + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); + + rw_unlock(false, b); + + return (bytes * 100) / btree_bytes(c); +} + +static size_t bch_cache_size(struct cache_set *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->keys.page_order + PAGE_SHIFT); + + mutex_unlock(&c->bucket_lock); + return ret; +} + +static unsigned int bch_cache_max_chain(struct cache_set *c) +{ + unsigned int ret = 0; + struct hlist_head *h; + + mutex_lock(&c->bucket_lock); + + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned int i = 0; + struct hlist_node *p; + + hlist_for_each(p, h) + i++; + + ret = max(ret, i); + } + + mutex_unlock(&c->bucket_lock); + return ret; +} + +static unsigned int bch_btree_used(struct cache_set *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} + +static unsigned int bch_average_key_size(struct cache_set *c) +{ + return c->gc_stats.nkeys + ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} + +SHOW(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb)); + sysfs_print(journal_delay_ms, c->journal_delay_ms); + sysfs_hprint(bucket_size, bucket_bytes(c->cache)); + sysfs_hprint(block_size, block_bytes(c->cache)); + sysfs_print(tree_depth, c->root->level); + sysfs_print(root_usage_percent, bch_root_usage(c)); + + sysfs_hprint(btree_cache_size, bch_cache_size(c)); + sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); + sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); + + sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); + sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); + sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); + + sysfs_print(btree_used_percent, bch_btree_used(c)); + sysfs_print(btree_nodes, c->gc_stats.nodes); + sysfs_hprint(average_key_size, bch_average_key_size(c)); + + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + + sysfs_print(reclaimed_journal_buckets, + atomic_long_read(&c->reclaimed_journal_buckets)); + + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, + atomic_long_read(&c->writeback_keys_failed)); + + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + + /* See count_io_errors for why 88 */ + sysfs_print(io_error_halflife, c->error_decay * 88); + sysfs_print(io_error_limit, c->error_limit); + + sysfs_hprint(congested, + ((uint64_t) bch_get_congested(c)) << 9); + sysfs_print(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_print(congested_write_threshold_us, + c->congested_write_threshold_us); + + sysfs_print(cutoff_writeback, bch_cutoff_writeback); + sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync); + + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); + sysfs_printf(verify, "%i", c->verify); + sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); + sysfs_printf(io_disable, "%i", + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); + + if (attr == &sysfs_feature_compat) + return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_ro_compat) + return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_incompat) + return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE); + + return 0; +} +SHOW_LOCKED(bch_cache_set) + +STORE(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + ssize_t v; + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + + if (attr == &sysfs_stop) + bch_cache_set_stop(c); + + if (attr == &sysfs_synchronous) { + bool sync = strtoul_or_return(buf); + + if (sync != CACHE_SYNC(&c->cache->sb)) { + SET_CACHE_SYNC(&c->cache->sb, sync); + bcache_write_super(c); + } + } + + if (attr == &sysfs_flash_vol_create) { + int r; + uint64_t v; + + strtoi_h_or_return(buf, v); + + r = bch_flash_dev_create(c, v); + if (r) + return r; + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&c->writeback_keys_done, 0); + atomic_long_set(&c->writeback_keys_failed, 0); + + memset(&c->gc_stats, 0, sizeof(struct gc_stat)); + bch_cache_accounting_clear(&c->accounting); + } + + if (attr == &sysfs_trigger_gc) + force_wake_up_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->shrink.scan_objects(&c->shrink, &sc); + } + + sysfs_strtoul_clamp(congested_read_threshold_us, + c->congested_read_threshold_us, + 0, UINT_MAX); + sysfs_strtoul_clamp(congested_write_threshold_us, + c->congested_write_threshold_us, + 0, UINT_MAX); + + if (attr == &sysfs_errors) { + v = __sysfs_match_string(error_actions, -1, buf); + if (v < 0) + return v; + + c->on_error = v; + } + + sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX); + + /* See count_io_errors() for why 88 */ + if (attr == &sysfs_io_error_halflife) { + unsigned long v = 0; + ssize_t ret; + + ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX); + if (!ret) { + c->error_decay = v / 88; + return size; + } + return ret; + } + + if (attr == &sysfs_io_disable) { + v = strtoul_or_return(buf); + if (v) { + if (test_and_set_bit(CACHE_SET_IO_DISABLE, + &c->flags)) + pr_warn("CACHE_SET_IO_DISABLE already set\n"); + } else { + if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, + &c->flags)) + pr_warn("CACHE_SET_IO_DISABLE already cleared\n"); + } + } + + sysfs_strtoul_clamp(journal_delay_ms, + c->journal_delay_ms, + 0, USHRT_MAX); + sysfs_strtoul_bool(verify, c->verify); + sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); + sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + + /* + * write gc_after_writeback here may overwrite an already set + * BCH_DO_AUTO_GC, it doesn't matter because this flag will be + * set in next chance. + */ + sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1); + + return size; +} +STORE_LOCKED(bch_cache_set) + +SHOW(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + + return bch_cache_set_show(&c->kobj, attr, buf); +} + +STORE(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + + return bch_cache_set_store(&c->kobj, attr, buf, size); +} + +static void bch_cache_set_internal_release(struct kobject *k) +{ +} + +static struct attribute *bch_cache_set_attrs[] = { + &sysfs_unregister, + &sysfs_stop, + &sysfs_synchronous, + &sysfs_journal_delay_ms, + &sysfs_flash_vol_create, + + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_tree_depth, + &sysfs_root_usage_percent, + &sysfs_btree_cache_size, + &sysfs_cache_available_percent, + + &sysfs_average_key_size, + + &sysfs_errors, + &sysfs_io_error_limit, + &sysfs_io_error_halflife, + &sysfs_congested, + &sysfs_congested_read_threshold_us, + &sysfs_congested_write_threshold_us, + &sysfs_clear_stats, + NULL +}; +ATTRIBUTE_GROUPS(bch_cache_set); +KTYPE(bch_cache_set); + +static struct attribute *bch_cache_set_internal_attrs[] = { + &sysfs_active_journal_entries, + + sysfs_time_stats_attribute_list(btree_gc, sec, ms) + sysfs_time_stats_attribute_list(btree_split, sec, us) + sysfs_time_stats_attribute_list(btree_sort, ms, us) + sysfs_time_stats_attribute_list(btree_read, ms, us) + + &sysfs_btree_nodes, + &sysfs_btree_used_percent, + &sysfs_btree_cache_max_chain, + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_reclaimed_journal_buckets, + &sysfs_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + + &sysfs_trigger_gc, + &sysfs_prune_cache, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, +#endif + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, + &sysfs_gc_after_writeback, + &sysfs_io_disable, + &sysfs_cutoff_writeback, + &sysfs_cutoff_writeback_sync, + &sysfs_feature_compat, + &sysfs_feature_ro_compat, + &sysfs_feature_incompat, + NULL +}; +ATTRIBUTE_GROUPS(bch_cache_set_internal); +KTYPE(bch_cache_set_internal); + +static int __bch_cache_cmp(const void *l, const void *r) +{ + cond_resched(); + return *((uint16_t *)r) - *((uint16_t *)l); +} + +SHOW(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + sysfs_hprint(bucket_size, bucket_bytes(ca)); + sysfs_hprint(block_size, block_bytes(ca)); + sysfs_print(nbuckets, ca->sb.nbuckets); + sysfs_print(discard, ca->discard); + sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); + sysfs_hprint(btree_written, + atomic_long_read(&ca->btree_sectors_written) << 9); + sysfs_hprint(metadata_written, + (atomic_long_read(&ca->meta_sectors_written) + + atomic_long_read(&ca->btree_sectors_written)) << 9); + + sysfs_print(io_errors, + atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); + + if (attr == &sysfs_cache_replacement_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + cache_replacement_policies, + CACHE_REPLACEMENT(&ca->sb)); + + if (attr == &sysfs_priority_stats) { + struct bucket *b; + size_t n = ca->sb.nbuckets, i; + size_t unused = 0, available = 0, dirty = 0, meta = 0; + uint64_t sum = 0; + /* Compute 31 quantiles */ + uint16_t q[31], *p, *cached; + ssize_t ret; + + cached = p = vmalloc(array_size(sizeof(uint16_t), + ca->sb.nbuckets)); + if (!p) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + unused++; + if (GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + if (GC_MARK(b) == GC_MARK_DIRTY) + dirty++; + if (GC_MARK(b) == GC_MARK_METADATA) + meta++; + } + + for (i = ca->sb.first_bucket; i < n; i++) + p[i] = ca->buckets[i].prio; + mutex_unlock(&ca->set->bucket_lock); + + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); + + while (n && + !cached[n - 1]) + --n; + + while (cached < p + n && + *cached == BTREE_PRIO) { + cached++; + n--; + } + + for (i = 0; i < n; i++) + sum += INITIAL_PRIO - cached[i]; + + if (n) + sum = div64_u64(sum, n); + + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / + (ARRAY_SIZE(q) + 1)]; + + vfree(p); + + ret = scnprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + ret--; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + + return ret; + } + + return 0; +} +SHOW_LOCKED(bch_cache) + +STORE(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + ssize_t v; + + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + if (bdev_max_discard_sectors(ca->bdev)) + ca->discard = v; + + if (v != CACHE_DISCARD(&ca->sb)) { + SET_CACHE_DISCARD(&ca->sb, v); + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_cache_replacement_policy) { + v = __sysfs_match_string(cache_replacement_policies, -1, buf); + if (v < 0) + return v; + + if ((unsigned int) v != CACHE_REPLACEMENT(&ca->sb)) { + mutex_lock(&ca->set->bucket_lock); + SET_CACHE_REPLACEMENT(&ca->sb, v); + mutex_unlock(&ca->set->bucket_lock); + + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&ca->sectors_written, 0); + atomic_long_set(&ca->btree_sectors_written, 0); + atomic_long_set(&ca->meta_sectors_written, 0); + atomic_set(&ca->io_count, 0); + atomic_set(&ca->io_errors, 0); + } + + return size; +} +STORE_LOCKED(bch_cache) + +static struct attribute *bch_cache_attrs[] = { + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_nbuckets, + &sysfs_priority_stats, + &sysfs_discard, + &sysfs_written, + &sysfs_btree_written, + &sysfs_metadata_written, + &sysfs_io_errors, + &sysfs_clear_stats, + &sysfs_cache_replacement_policy, + NULL +}; +ATTRIBUTE_GROUPS(bch_cache); +KTYPE(bch_cache); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 000000000..a2ff6447b --- /dev/null +++ b/drivers/md/bcache/sysfs.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_SYSFS_H_ +#define _BCACHE_SYSFS_H_ + +#define KTYPE(type) \ +struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = type ## _show, \ + .store = type ## _store \ + }), \ + .default_groups = type ## _groups \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define SHOW_LOCKED(fn) \ +SHOW(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _show(kobj, attr, buf); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define STORE_LOCKED(fn) \ +STORE(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _store(kobj, attr, buf, size); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) + +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ + return sysfs_emit(buf, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return sysfs_emit(buf, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned int) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? "%s\n" : "%i\n", var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) { \ + ssize_t ret = bch_hprint(buf, val); \ + strcat(buf, "\n"); \ + return ret + 1; \ + } \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_bool(file, var) \ +do { \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = strtoul_or_return(buf); \ + \ + var = v ? 1 : 0; \ + return size; \ + } \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = 0; \ + ssize_t ret; \ + ret = strtoul_safe_clamp(buf, v, min, max); \ + if (!ret) { \ + var = v; \ + return size; \ + } \ + return ret; \ + } \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoi_h_or_return(cp, v) \ +do { \ + int _r = strtoi_h(cp, &v); \ + if (_r) \ + return _r; \ +} while (0) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +#endif /* _BCACHE_SYSFS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 000000000..a9a73f560 --- /dev/null +++ b/drivers/md/bcache/trace.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcache.h" +#include "btree.h" + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 000000000..ae380bc39 --- /dev/null +++ b/drivers/md/bcache/util.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" + +#define simple_strtoint(c, end, base) simple_strtol(c, end, base) +#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) + +#define STRTO_H(name, type) \ +int bch_ ## name ## _h(const char *cp, type *res) \ +{ \ + int u = 0; \ + char *e; \ + type i = simple_ ## name(cp, &e, 10); \ + \ + switch (tolower(*e)) { \ + default: \ + return -EINVAL; \ + case 'y': \ + case 'z': \ + u++; \ + fallthrough; \ + case 'e': \ + u++; \ + fallthrough; \ + case 'p': \ + u++; \ + fallthrough; \ + case 't': \ + u++; \ + fallthrough; \ + case 'g': \ + u++; \ + fallthrough; \ + case 'm': \ + u++; \ + fallthrough; \ + case 'k': \ + u++; \ + if (e++ == cp) \ + return -EINVAL; \ + fallthrough; \ + case '\n': \ + case '\0': \ + if (*e == '\n') \ + e++; \ + } \ + \ + if (*e) \ + return -EINVAL; \ + \ + while (u--) { \ + if ((type) ~0 > 0 && \ + (type) ~0 / 1024 <= i) \ + return -EINVAL; \ + if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ + (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ + return -EINVAL; \ + i *= 1024; \ + } \ + \ + *res = i; \ + return 0; \ +} \ + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) + +/** + * bch_hprint - formats @v to human readable string for sysfs. + * @buf: the (at least 8 byte) buffer to format the result into. + * @v: signed 64 bit integer + * + * Returns the number of bytes used by format. + */ +ssize_t bch_hprint(char *buf, int64_t v) +{ + static const char units[] = "?kMGTPEZY"; + int u = 0, t; + + uint64_t q; + + if (v < 0) + q = -v; + else + q = v; + + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; + + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); +} + +bool bch_is_zero(const char *p, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +int bch_parse_uuid(const char *s, char *uuid) +{ + size_t i, j, x; + + memset(uuid, 0, 16); + + for (i = 0, j = 0; + i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; + i++) { + x = s[i] | 32; + + switch (x) { + case '0'...'9': + x -= '0'; + break; + case 'a'...'f': + x -= 'a' - 10; + break; + default: + continue; + } + + if (!(j & 1)) + x <<= 4; + uuid[j++ >> 1] |= x; + } + return i; +} + +void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) +{ + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) + ? now - start_time : 0; + last = time_after64(now, stats->last) + ? now - stats->last : 0; + + stats->max_duration = max(stats->max_duration, duration); + + if (stats->last) { + ewma_add(stats->average_duration, duration, 8, 8); + + if (stats->average_frequency) + ewma_add(stats->average_frequency, last, 8, 8); + else + stats->average_frequency = last << 8; + } else { + stats->average_duration = duration << 8; + } + + stats->last = now ?: 1; + + spin_unlock(&stats->lock); +} + +/** + * bch_next_delay() - update ratelimiting statistics and calculate next delay + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units + * + * Increment @d by the amount of work done, and return how long to delay in + * jiffies until the next time to do some work. + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) +{ + uint64_t now = local_clock(); + + d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); + + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; + + return time_after64(d->next, now) + ? div_u64(d->next - now, NSEC_PER_SEC / HZ) + : 0; +} + +/* + * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly, + * the preferred way is bio_add_page, but in this case, bch_bio_map() + * supposes that the bvec table is empty, so it is safe to access + * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is + * supported. + */ +void bch_bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_iter.bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? offset_in_page(base) : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} + +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + /* + * This is called on freshly new bio, so it is safe to access the + * bvec table directly. + */ + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 000000000..6f3cb7c92 --- /dev/null +++ b/drivers/md/bcache/util.h @@ -0,0 +1,562 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHE_UTIL_H +#define _BCACHE_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "closure.h" + +struct closure; + +#ifdef CONFIG_BCACHE_DEBUG + +#define EBUG_ON(cond) BUG_ON(cond) +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) + +#else /* DEBUG */ + +#define EBUG_ON(cond) do { if (cond) do {} while (0); } while (0) +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) + +#endif + +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + +#define init_heap(heap, _size, gfp) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ + (heap)->data; \ +}) + +#define free_heap(heap) \ +do { \ + kvfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + +#define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ + type *data; \ + } name + +#define fifo_for_each(c, fifo, iter) \ + for (iter = (fifo)->front; \ + c = (fifo)->data[iter], iter != (fifo)->back; \ + iter = (iter + 1) & (fifo)->mask) + +#define __init_fifo(fifo, gfp) \ +({ \ + size_t _allocated_size, _bytes; \ + BUG_ON(!(fifo)->size); \ + \ + _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ + _bytes = _allocated_size * sizeof(*(fifo)->data); \ + \ + (fifo)->mask = _allocated_size - 1; \ + (fifo)->front = (fifo)->back = 0; \ + \ + (fifo)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ + (fifo)->data; \ +}) + +#define init_fifo_exact(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + __init_fifo(fifo, gfp); \ +}) + +#define init_fifo(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + if ((fifo)->size > 4) \ + (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ + __init_fifo(fifo, gfp); \ +}) + +#define free_fifo(fifo) \ +do { \ + kvfree((fifo)->data); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) (!fifo_used(fifo)) +#define fifo_full(fifo) (!fifo_free(fifo)) + +#define fifo_front(fifo) ((fifo)->data[(fifo)->front]) +#define fifo_back(fifo) \ + ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) + +#define fifo_push_back(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + (fifo)->data[(fifo)->back++] = (i); \ + (fifo)->back &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + (i) = (fifo)->data[(fifo)->front++]; \ + (fifo)->front &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_push_front(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + --(fifo)->front; \ + (fifo)->front &= (fifo)->mask; \ + (fifo)->data[(fifo)->front] = (i); \ + } \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + --(fifo)->back; \ + (fifo)->back &= (fifo)->mask; \ + (i) = (fifo)->data[(fifo)->back] \ + } \ + _r; \ +}) + +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +/* + * Simple array based allocator - preallocates a number of elements and you can + * never allocate more than that, also has no locking. + * + * Handy because if you know you only need a fixed number of elements you don't + * have to worry about memory allocation failure, and sometimes a mempool isn't + * what you want. + * + * We treat the free elements as entries in a singly linked list, and the + * freelist as a stack - allocating and freeing push and pop off the freelist. + */ + +#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ + struct { \ + type *freelist; \ + type data[size]; \ + } name + +#define array_alloc(array) \ +({ \ + typeof((array)->freelist) _ret = (array)->freelist; \ + \ + if (_ret) \ + (array)->freelist = *((typeof((array)->freelist) *) _ret);\ + \ + _ret; \ +}) + +#define array_free(array, ptr) \ +do { \ + typeof((array)->freelist) _ptr = ptr; \ + \ + *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ + (array)->freelist = _ptr; \ +} while (0) + +#define array_allocator_init(array) \ +do { \ + typeof((array)->freelist) _i; \ + \ + BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ + (array)->freelist = NULL; \ + \ + for (_i = (array)->data; \ + _i < (array)->data + ARRAY_SIZE((array)->data); \ + _i++) \ + array_free(array, _i); \ +} while (0) + +#define array_freelist_empty(array) ((array)->freelist == NULL) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int bch_strtoint_h(const char *cp, int *res); +int bch_strtouint_h(const char *cp, unsigned int *res); +int bch_strtoll_h(const char *cp, long long *res); +int bch_strtoull_h(const char *cp, unsigned long long *res); + +static inline int bch_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtoint_h(cp, (int *) res); +#else + return bch_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtouint_h(cp, (unsigned int *) res); +#else + return bch_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + (__builtin_types_compatible_p(typeof(*res), int) \ + ? bch_strtoint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long) \ + ? bch_strtol_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long long) \ + ? bch_strtoll_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned int) \ + ? bch_strtouint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long) \ + ? bch_strtoul_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ + ? bch_strtoull_h(cp, (void *) res) : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +ssize_t bch_hprint(char *buf, int64_t v); + +bool bch_is_zero(const char *p, size_t n); +int bch_parse_uuid(const char *s, char *uuid); + +struct time_stats { + spinlock_t lock; + /* + * all fields are in nanoseconds, averages are ewmas stored left shifted + * by 8 + */ + uint64_t max_duration; + uint64_t average_duration; + uint64_t average_frequency; + uint64_t last; +}; + +void bch_time_stats_update(struct time_stats *stats, uint64_t time); + +static inline unsigned int local_clock_us(void) +{ + return local_clock() >> 10; +} + +#define NSEC_PER_ns 1L +#define NSEC_PER_us NSEC_PER_USEC +#define NSEC_PER_ms NSEC_PER_MSEC +#define NSEC_PER_sec NSEC_PER_SEC + +#define __print_time_stat(stats, name, stat, units) \ + sysfs_print(name ## _ ## stat ## _ ## units, \ + div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) + +#define sysfs_print_time_stats(stats, name, \ + frequency_units, \ + duration_units) \ +do { \ + __print_time_stat(stats, name, \ + average_frequency, frequency_units); \ + __print_time_stat(stats, name, \ + average_duration, duration_units); \ + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ + div_u64((stats)->max_duration, \ + NSEC_PER_ ## duration_units)); \ + \ + sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ + ? div_s64(local_clock() - (stats)->last, \ + NSEC_PER_ ## frequency_units) \ + : -1LL); \ +} while (0) + +#define sysfs_time_stats_attribute(name, \ + frequency_units, \ + duration_units) \ +read_attribute(name ## _average_frequency_ ## frequency_units); \ +read_attribute(name ## _average_duration_ ## duration_units); \ +read_attribute(name ## _max_duration_ ## duration_units); \ +read_attribute(name ## _last_ ## frequency_units) + +#define sysfs_time_stats_attribute_list(name, \ + frequency_units, \ + duration_units) \ +&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ +&sysfs_ ## name ## _average_duration_ ## duration_units, \ +&sysfs_ ## name ## _max_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_ ## frequency_units, + +#define ewma_add(ewma, val, weight, factor) \ +({ \ + (ewma) *= (weight) - 1; \ + (ewma) += (val) << factor; \ + (ewma) /= (weight); \ + (ewma) >> factor; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + uint64_t next; + + /* + * Rate at which we want to do work, in units per second + * The units here correspond to the units passed to bch_next_delay() + */ + atomic_long_t rate; +}; + +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? &(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +static inline uint64_t bch_crc64(const void *p, size_t len) +{ + uint64_t crc = 0xffffffffffffffffULL; + + crc = crc64_be(crc, p, len); + return crc ^ 0xffffffffffffffffULL; +} + +/* + * A stepwise-linear pseudo-exponential. This returns 1 << (x >> + * frac_bits), with the less-significant bits filled in by linear + * interpolation. + * + * This can also be interpreted as a floating-point number format, + * where the low frac_bits are the mantissa (with implicit leading + * 1 bit), and the more significant bits are the exponent. + * The return value is 1.mantissa * 2^exponent. + * + * The way this is used, fract_bits is 6 and the largest possible + * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc), + * so the maximum output is 0x1fc00. + */ +static inline unsigned int fract_exp_two(unsigned int x, + unsigned int fract_bits) +{ + unsigned int mantissa = 1 << fract_bits; /* Implicit bit */ + + mantissa += x & (mantissa - 1); + x >>= fract_bits; /* The exponent */ + /* Largest intermediate value 0x7f0000 */ + return mantissa << x >> fract_bits; +} + +void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); + +#endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 000000000..18c6e0d28 --- /dev/null +++ b/drivers/md/bcache/writeback.c @@ -0,0 +1,1100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * background writeback - scan btree for dirty data and write it to the backing + * device + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "writeback.h" + +#include +#include +#include +#include + +static void update_gc_after_writeback(struct cache_set *c) +{ + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) + return; + + c->gc_after_writeback |= BCH_DO_AUTO_GC; +} + +/* Rate limiting */ +static uint64_t __calc_target_rate(struct cached_dev *dc) +{ + struct cache_set *c = dc->disk.c; + + /* + * This is the size of the cache, minus the amount used for + * flash-only devices + */ + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - + atomic_long_read(&c->flash_dev_dirty_sectors); + + /* + * Unfortunately there is no control of global dirty data. If the + * user states that they want 10% dirty data in the cache, and has, + * e.g., 5 backing volumes of equal size, we try and ensure each + * backing volume uses about 2% of the cache for dirty data. + */ + uint32_t bdev_share = + div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + c->cached_dev_sectors); + + uint64_t cache_dirty_target = + div_u64(cache_sectors * dc->writeback_percent, 100); + + /* Ensure each backing dev gets at least one dirty share */ + if (bdev_share < 1) + bdev_share = 1; + + return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; +} + +static void __update_writeback_rate(struct cached_dev *dc) +{ + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. + */ + int64_t target = __calc_target_rate(dc); + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled; + uint32_t new_rate; + + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. + */ + struct cache_set *c = dc->disk.c; + + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = (int64_t)dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = (int64_t)dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } + + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); + + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); + + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - + atomic_long_read(&dc->writeback_rate.rate); + atomic_long_set(&dc->writeback_rate.rate, new_rate); + dc->writeback_rate_target = target; +} + +static bool idle_counter_exceeded(struct cache_set *c) +{ + int counter, dev_nr; + + /* + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. + */ + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } + + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; + + if (!idle_counter_exceeded(c)) + return false; + + if (atomic_read(&c->at_max_writeback_rate) != 1) + atomic_set(&c->at_max_writeback_rate, 1); + + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); + + /* keep writeback_rate_target as existing value */ + dc->writeback_rate_proportional = 0; + dc->writeback_rate_integral_scaled = 0; + dc->writeback_rate_change = 0; + + /* + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. + */ + if (!idle_counter_exceeded(c) || + !atomic_read(&c->at_max_writeback_rate)) + return false; + + return true; +} + +static void update_writeback_rate(struct work_struct *work) +{ + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); + struct cache_set *c = dc->disk.c; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). + */ + set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); + return; + } + + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent && + !set_at_max_writeback_rate(c, dc)) { + do { + if (!down_read_trylock((&dc->writeback_lock))) { + dc->rate_update_retry++; + if (dc->rate_update_retry <= + BCH_WBRATE_UPDATE_MAX_SKIPS) + break; + down_read(&dc->writeback_lock); + dc->rate_update_retry = 0; + } + __update_writeback_rate(dc); + update_gc_after_writeback(c); + up_read(&dc->writeback_lock); + } while (0); + } + + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). + */ + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); +} + +static unsigned int writeback_delay(struct cached_dev *dc, + unsigned int sectors) +{ + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + !dc->writeback_percent) + return 0; + + return bch_next_delay(&dc->writeback_rate, sectors); +} + +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + uint16_t sequence; + struct bio bio; +}; + +static void dirty_init(struct keybuf_key *w) +{ + struct dirty_io *io = w->private; + struct bio *bio = &io->bio; + + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); + if (!io->dc->writeback_percent) + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_private = w; + bch_bio_map(bio, NULL); +} + +static void dirty_io_destructor(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + + kfree(io); +} + +static void write_dirty_finish(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + + bio_free_pages(&io->bio); + + /* This is kind of a dumb way of signalling errors. */ + if (KEY_DIRTY(&w->key)) { + int ret; + unsigned int i; + struct keylist keys; + + bch_keylist_init(&keys); + + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); + + for (i = 0; i < KEY_PTRS(&w->key); i++) + atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); + + if (ret) + trace_bcache_writeback_collision(&w->key); + + atomic_long_inc(ret + ? &dc->disk.c->writeback_keys_failed + : &dc->disk.c->writeback_keys_done); + } + + bch_keybuf_del(&dc->writeback_keys, w); + up(&dc->in_flight); + + closure_return_with_destructor(cl, dirty_io_destructor); +} + +static void dirty_endio(struct bio *bio) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + if (bio->bi_status) { + SET_KEY_DIRTY(&w->key, false); + bch_count_backing_io_errors(io->dc, bio); + } + + closure_put(&io->cl); +} + +static void write_dirty(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + + uint16_t next_sequence; + + if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { + /* Not our turn to write; wait for a write to complete */ + closure_wait(&dc->writeback_ordering_wait, cl); + + if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { + /* + * Edge case-- it happened in indeterminate order + * relative to when we were added to wait list.. + */ + closure_wake_up(&dc->writeback_ordering_wait); + } + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + return; + } + + next_sequence = io->sequence + 1; + + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. + */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + + /* I/O request sent to backing device */ + closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } + + atomic_set(&dc->writeback_sequence_next, next_sequence); + closure_wake_up(&dc->writeback_ordering_wait); + + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); +} + +static void read_dirty_endio(struct bio *bio) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + /* is_read = 1 */ + bch_count_io_errors(io->dc->disk.c->cache, + bio->bi_status, 1, + "reading dirty data from cache"); + + dirty_endio(bio); +} + +static void read_dirty_submit(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + + closure_bio_submit(io->dc->disk.c, &io->bio, cl); + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); +} + +static void read_dirty(struct cached_dev *dc) +{ + unsigned int delay = 0; + struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; + size_t size; + int nk, i; + struct dirty_io *io; + struct closure cl; + uint16_t sequence = 0; + + BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); + atomic_set(&dc->writeback_sequence_next, sequence); + closure_init_stack(&cl); + + /* + * XXX: if we error, background writeback just spins. Should use some + * mempools. + */ + + next = bch_keybuf_next(&dc->writeback_keys); + + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + next) { + size = 0; + nk = 0; + + do { + BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); + + /* + * Don't combine too many operations, even if they + * are all small. + */ + if (nk >= MAX_WRITEBACKS_IN_PASS) + break; + + /* + * If the current operation is very large, don't + * further combine operations. + */ + if (size >= MAX_WRITESIZE_IN_PASS) + break; + + /* + * Operations are only eligible to be combined + * if they are contiguous. + * + * TODO: add a heuristic willing to fire a + * certain amount of non-contiguous IO per pass, + * so that we can benefit from backing device + * command queueing. + */ + if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, + &START_KEY(&next->key))) + break; + + size += KEY_SIZE(&next->key); + keys[nk++] = next; + } while ((next = bch_keybuf_next(&dc->writeback_keys))); + + /* Now we have gathered a set of 1..5 keys to write back. */ + for (i = 0; i < nk; i++) { + w = keys[i]; + + io = kzalloc(struct_size(io, bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + io->sequence = sequence++; + + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); + io->bio.bi_end_io = read_dirty_endio; + + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + trace_bcache_writeback(&w->key); + + down(&dc->in_flight); + + /* + * We've acquired a semaphore for the maximum + * simultaneous number of writebacks; from here + * everything happens asynchronously. + */ + closure_call(&io->cl, read_dirty_submit, NULL, &cl); + } + + delay = writeback_delay(dc, size); + + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + delay) { + schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } + } + + if (0) { +err_free: + kfree(w->private); +err: + bch_keybuf_del(&dc->writeback_keys, w); + } + + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + closure_sync(&cl); +} + +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned int stripe_offset, sectors_dirty; + int stripe; + + if (!d) + return; + + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + + if (UUID_FLASH_ONLY(&c->uuids[inode])) + atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); + + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned int, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } + + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + struct cached_dev *dc = container_of(buf, + struct cached_dev, + writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + + return KEY_DIRTY(k); +} + +static void refill_full_stripes(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + unsigned int start_stripe, next_stripe; + int stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); + if (stripe < 0) + stripe = 0; + + start_stripe = stripe; + + while (1) { + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + if (stripe == dc->disk.nr_stripes) + goto next; + + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } + } +} + +/* + * Returns true if we scanned the entire disk + */ +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } + + start_pos = buf->last_scanned; + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + struct cache_set *c = dc->disk.c; + bool searched_full_index; + + bch_ratelimit_reset(&dc->writeback_rate); + + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + /* + * If the bache device is detaching, skip here and continue + * to perform writeback. Otherwise, if no dirty data on cache, + * or there is dirty data on cache but writeback is disabled, + * the writeback thread should sleep here and wait for others + * to wake up it. + */ + if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + set_current_state(TASK_RUNNING); + break; + } + + schedule(); + continue; + } + set_current_state(TASK_RUNNING); + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + /* + * If bcache device is detaching via sysfs interface, + * writeback thread should stop after there is no dirty + * data on cache. BCACHE_DEV_DETACHING flag is set in + * bch_cached_dev_detach(). + */ + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + + up_write(&dc->writeback_lock); + break; + } + + /* + * When dirty data rate is high (e.g. 50%+), there might + * be heavy buckets fragmentation after writeback + * finished, which hurts following write performance. + * If users really care about write performance they + * may set BCH_ENABLE_AUTO_GC via sysfs, then when + * BCH_DO_AUTO_GC is set, garbage collection thread + * will be wake up here. After moving gc, the shrunk + * btree and discarded free buckets SSD space may be + * helpful for following write requests. + */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } + } + + up_write(&dc->writeback_lock); + + read_dirty(dc); + + if (searched_full_index) { + unsigned int delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); + } + } + + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); + + cached_dev_put(dc); + wait_for_kthread_stop(); + + return 0; +} + +/* Init */ +#define INIT_KEYS_EACH_TIME 500000 + +struct sectors_dirty_init { + struct btree_op op; + unsigned int inode; + size_t count; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + struct bkey *k) +{ + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); + if (KEY_INODE(k) > op->inode) + return MAP_DONE; + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + op->count++; + if (!(op->count % INIT_KEYS_EACH_TIME)) + cond_resched(); + + return MAP_CONTINUE; +} + +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) +{ + struct sectors_dirty_init op; + int ret; + + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, + 0); + if (ret < 0) + pr_warn("sectors dirty init failed, ret=%d!\n", ret); + + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + prev_idx = 0; + + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + } + +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct btree *b = NULL; + struct bkey *k = NULL; + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state state; + +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } + + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; + sectors_dirty_init_fn(&op.op, c->root, k); + } + + rw_unlock(0, b); + return; + } + + memset(&state, 0, sizeof(struct bch_dirty_init_state)); + state.c = c; + state.d = d; + state.total_threads = bch_btre_dirty_init_thread_nr(); + state.key_idx = 0; + spin_lock_init(&state.idx_lock); + atomic_set(&state.started, 0); + atomic_set(&state.enough, 0); + init_waitqueue_head(&state.wait); + + for (i = 0; i < state.total_threads; i++) { + /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state.enough)) + break; + + atomic_inc(&state.started); + state.infos[i].state = &state; + state.infos[i].thread = + kthread_run(bch_dirty_init_thread, &state.infos[i], + "bch_dirtcnt[%d]", i); + if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); + for (--i; i >= 0; i--) + kthread_stop(state.infos[i].thread); + goto out; + } + } + +out: + /* Must wait for all threads to stop. */ + wait_event(state.wait, atomic_read(&state.started) == 0); + rw_unlock(0, b); +} + +void bch_cached_dev_writeback_init(struct cached_dev *dc) +{ + sema_init(&dc->in_flight, 64); + init_rwsem(&dc->writeback_lock); + bch_keybuf_init(&dc->writeback_keys); + + dc->writeback_metadata = true; + dc->writeback_running = false; + dc->writeback_consider_fragment = true; + dc->writeback_percent = 10; + dc->writeback_delay = 30; + atomic_long_set(&dc->writeback_rate.rate, 1024); + dc->writeback_rate_minimum = 8; + + dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; + dc->writeback_rate_i_term_inverse = 10000; + + /* For dc->writeback_lock contention in update_writeback_rate() */ + dc->rate_update_retry = 0; + + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); +} + +int bch_cached_dev_writeback_start(struct cached_dev *dc) +{ + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + + cached_dev_get(dc); + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) { + cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); + return PTR_ERR(dc->writeback_thread); + } + dc->writeback_running = true; + + WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + bch_writeback_queue(dc); + + return 0; +} diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 000000000..31df71695 --- /dev/null +++ b/drivers/md/bcache/writeback.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHE_WRITEBACK_H +#define _BCACHE_WRITEBACK_H + +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + +#define CUTOFF_WRITEBACK_MAX 70 +#define CUTOFF_WRITEBACK_SYNC_MAX 90 + +#define MAX_WRITEBACKS_IN_PASS 5 +#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ + +#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 +#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 + +#define BCH_AUTO_GC_DIRTY_THRESHOLD 50 + +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 + +#define BCH_DIRTY_INIT_THRD_MAX 12 +/* + * 14 (16384ths) is chosen here as something that each backing device + * should be a reasonable fraction of the share, and not to blow up + * until individual backing devices are a petabyte. + */ +#define WRITEBACK_SHARE_SHIFT 14 + +struct bch_dirty_init_state; +struct dirty_init_thrd_info { + struct bch_dirty_init_state *state; + struct task_struct *thread; +}; + +struct bch_dirty_init_state { + struct cache_set *c; + struct bcache_device *d; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; +}; + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +static inline int offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + + /* d->nr_stripes is in range [1, INT_MAX] */ + if (unlikely(offset >= d->nr_stripes)) { + pr_err("Invalid stripe %llu (>= nr_stripes %d).\n", + offset, d->nr_stripes); + return -EINVAL; + } + + /* + * Here offset is definitly smaller than INT_MAX, + * return it as int will never overflow. + */ + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, + uint64_t offset, + unsigned int nr_sectors) +{ + int stripe = offset_to_stripe(&dc->disk, offset); + + if (stripe < 0) + return false; + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned int cache_mode, bool would_skip) +{ + unsigned int in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + in_use > bch_cutoff_writeback_sync) + return false; + + if (bio_op(bio) == REQ_OP_DISCARD) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= bch_cutoff_writeback); +} + +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, + uint64_t offset, int nr_sectors); + +void bch_sectors_dirty_init(struct bcache_device *d); +void bch_cached_dev_writeback_init(struct cached_dev *dc); +int bch_cached_dev_writeback_start(struct cached_dev *dc); + +#endif diff --git a/drivers/md/dm-audit.c b/drivers/md/dm-audit.c new file mode 100644 index 000000000..3049dfe67 --- /dev/null +++ b/drivers/md/dm-audit.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Creating audit records for mapped devices. + * + * Copyright (C) 2021 Fraunhofer AISEC. All rights reserved. + * + * Authors: Michael Weiß + */ + +#include +#include +#include +#include +#include + +#include "dm-audit.h" +#include "dm-core.h" + +static struct audit_buffer *dm_audit_log_start(int audit_type, + const char *dm_msg_prefix, + const char *op) +{ + struct audit_buffer *ab; + + if (audit_enabled == AUDIT_OFF) + return NULL; + + ab = audit_log_start(audit_context(), GFP_KERNEL, audit_type); + if (unlikely(!ab)) + return NULL; + + audit_log_format(ab, "module=%s op=%s", dm_msg_prefix, op); + return ab; +} + +void dm_audit_log_ti(int audit_type, const char *dm_msg_prefix, const char *op, + struct dm_target *ti, int result) +{ + struct audit_buffer *ab = NULL; + struct mapped_device *md = dm_table_get_md(ti->table); + int dev_major = dm_disk(md)->major; + int dev_minor = dm_disk(md)->first_minor; + + switch (audit_type) { + case AUDIT_DM_CTRL: + ab = dm_audit_log_start(audit_type, dm_msg_prefix, op); + if (unlikely(!ab)) + return; + audit_log_task_info(ab); + audit_log_format(ab, " dev=%d:%d error_msg='%s'", dev_major, + dev_minor, !result ? ti->error : "success"); + break; + case AUDIT_DM_EVENT: + ab = dm_audit_log_start(audit_type, dm_msg_prefix, op); + if (unlikely(!ab)) + return; + audit_log_format(ab, " dev=%d:%d sector=?", dev_major, + dev_minor); + break; + default: /* unintended use */ + return; + } + + audit_log_format(ab, " res=%d", result); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(dm_audit_log_ti); + +void dm_audit_log_bio(const char *dm_msg_prefix, const char *op, + struct bio *bio, sector_t sector, int result) +{ + struct audit_buffer *ab; + int dev_major = MAJOR(bio->bi_bdev->bd_dev); + int dev_minor = MINOR(bio->bi_bdev->bd_dev); + + ab = dm_audit_log_start(AUDIT_DM_EVENT, dm_msg_prefix, op); + if (unlikely(!ab)) + return; + + audit_log_format(ab, " dev=%d:%d sector=%llu res=%d", + dev_major, dev_minor, sector, result); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(dm_audit_log_bio); diff --git a/drivers/md/dm-audit.h b/drivers/md/dm-audit.h new file mode 100644 index 000000000..2385f2b65 --- /dev/null +++ b/drivers/md/dm-audit.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Creating audit records for mapped devices. + * + * Copyright (C) 2021 Fraunhofer AISEC. All rights reserved. + * + * Authors: Michael Weiß + */ + +#ifndef DM_AUDIT_H +#define DM_AUDIT_H + +#include +#include + +#ifdef CONFIG_DM_AUDIT +void dm_audit_log_bio(const char *dm_msg_prefix, const char *op, + struct bio *bio, sector_t sector, int result); + +/* + * dm_audit_log_ti() is not intended to be used directly in dm modules, + * the wrapper functions below should be called by dm modules instead. + */ +void dm_audit_log_ti(int audit_type, const char *dm_msg_prefix, const char *op, + struct dm_target *ti, int result); + +static inline void dm_audit_log_ctr(const char *dm_msg_prefix, + struct dm_target *ti, int result) +{ + dm_audit_log_ti(AUDIT_DM_CTRL, dm_msg_prefix, "ctr", ti, result); +} + +static inline void dm_audit_log_dtr(const char *dm_msg_prefix, + struct dm_target *ti, int result) +{ + dm_audit_log_ti(AUDIT_DM_CTRL, dm_msg_prefix, "dtr", ti, result); +} + +static inline void dm_audit_log_target(const char *dm_msg_prefix, const char *op, + struct dm_target *ti, int result) +{ + dm_audit_log_ti(AUDIT_DM_EVENT, dm_msg_prefix, op, ti, result); +} +#else +static inline void dm_audit_log_bio(const char *dm_msg_prefix, const char *op, + struct bio *bio, sector_t sector, + int result) +{ +} +static inline void dm_audit_log_target(const char *dm_msg_prefix, + const char *op, struct dm_target *ti, + int result) +{ +} +static inline void dm_audit_log_ctr(const char *dm_msg_prefix, + struct dm_target *ti, int result) +{ +} + +static inline void dm_audit_log_dtr(const char *dm_msg_prefix, + struct dm_target *ti, int result) +{ +} +#endif + +#endif diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c new file mode 100644 index 000000000..138067abe --- /dev/null +++ b/drivers/md/dm-bio-prison-v1.c @@ -0,0 +1,458 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" + +#include +#include +#include +#include + +/*----------------------------------------------------------------*/ + +#define MIN_CELLS 1024 + +struct dm_bio_prison { + spinlock_t lock; + struct rb_root cells; + mempool_t cell_pool; +}; + +static struct kmem_cache *_cell_cache; + +/*----------------------------------------------------------------*/ + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison *dm_bio_prison_create(void) +{ + struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); + int ret; + + if (!prison) + return NULL; + + spin_lock_init(&prison->lock); + + ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); + if (ret) { + kfree(prison); + return NULL; + } + + prison->cells = RB_ROOT; + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create); + +void dm_bio_prison_destroy(struct dm_bio_prison *prison) +{ + mempool_exit(&prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy); + +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp) +{ + return mempool_alloc(&prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell); + +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) +{ + mempool_free(cell, &prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); + +static void __setup_new_cell(struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) +{ + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); +} + +static int cmp_keys(struct dm_cell_key *lhs, + struct dm_cell_key *rhs) +{ + if (lhs->virtual < rhs->virtual) + return -1; + + if (lhs->virtual > rhs->virtual) + return 1; + + if (lhs->dev < rhs->dev) + return -1; + + if (lhs->dev > rhs->dev) + return 1; + + if (lhs->block_end <= rhs->block_begin) + return -1; + + if (lhs->block_begin >= rhs->block_end) + return 1; + + return 0; +} + +static int __bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell *cell = + rb_entry(*new, struct dm_bio_prison_cell, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + else if (r > 0) + new = &((*new)->rb_right); + else { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } + } + + __setup_new_cell(key, inmate, cell_prealloc); + *cell_result = cell_prealloc; + + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + + return 0; +} + +static int bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + int r; + + spin_lock_irq(&prison->lock); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); + spin_unlock_irq(&prison->lock); + + return r; +} + +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, inmate, cell_prealloc, cell_result); +} +EXPORT_SYMBOL_GPL(dm_bio_detain); + +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, NULL, cell_prealloc, cell_result); +} +EXPORT_SYMBOL_GPL(dm_get_cell); + +/* + * @inmates must have been initialised prior to this call + */ +static void __cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + rb_erase(&cell->node, &prison->cells); + + if (inmates) { + if (cell->holder) + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); + } +} + +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + spin_lock_irq(&prison->lock); + __cell_release(prison, cell, bios); + spin_unlock_irq(&prison->lock); +} +EXPORT_SYMBOL_GPL(dm_cell_release); + +/* + * Sometimes we don't want the holder, just the additional bios. + */ +static void __cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + rb_erase(&cell->node, &prison->cells); + bio_list_merge(inmates, &cell->bios); +} + +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(prison, cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); + +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, blk_status_t error) +{ + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + dm_cell_release(prison, cell, &bios); + + while ((bio = bio_list_pop(&bios))) { + bio->bi_status = error; + bio_endio(bio); + } +} +EXPORT_SYMBOL_GPL(dm_cell_error); + +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + spin_lock_irq(&prison->lock); + visit_fn(context, cell); + rb_erase(&cell->node, &prison->cells); + spin_unlock_irq(&prison->lock); +} +EXPORT_SYMBOL_GPL(dm_cell_visit_release); + +static int __promote_or_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) +{ + if (bio_list_empty(&cell->bios)) { + rb_erase(&cell->node, &prison->cells); + return 1; + } + + cell->holder = bio_list_pop(&cell->bios); + return 0; +} + +int dm_cell_promote_or_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) +{ + int r; + + spin_lock_irq(&prison->lock); + r = __promote_or_release(prison, cell); + spin_unlock_irq(&prison->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_promote_or_release); + +/*----------------------------------------------------------------*/ + +#define DEFERRED_SET_SIZE 64 + +struct dm_deferred_entry { + struct dm_deferred_set *ds; + unsigned int count; + struct list_head work_items; +}; + +struct dm_deferred_set { + spinlock_t lock; + unsigned int current_entry; + unsigned int sweeper; + struct dm_deferred_entry entries[DEFERRED_SET_SIZE]; +}; + +struct dm_deferred_set *dm_deferred_set_create(void) +{ + int i; + struct dm_deferred_set *ds; + + ds = kmalloc(sizeof(*ds), GFP_KERNEL); + if (!ds) + return NULL; + + spin_lock_init(&ds->lock); + ds->current_entry = 0; + ds->sweeper = 0; + for (i = 0; i < DEFERRED_SET_SIZE; i++) { + ds->entries[i].ds = ds; + ds->entries[i].count = 0; + INIT_LIST_HEAD(&ds->entries[i].work_items); + } + + return ds; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_create); + +void dm_deferred_set_destroy(struct dm_deferred_set *ds) +{ + kfree(ds); +} +EXPORT_SYMBOL_GPL(dm_deferred_set_destroy); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds) +{ + unsigned long flags; + struct dm_deferred_entry *entry; + + spin_lock_irqsave(&ds->lock, flags); + entry = ds->entries + ds->current_entry; + entry->count++; + spin_unlock_irqrestore(&ds->lock, flags); + + return entry; +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_inc); + +static unsigned int ds_next(unsigned int index) +{ + return (index + 1) % DEFERRED_SET_SIZE; +} + +static void __sweep(struct dm_deferred_set *ds, struct list_head *head) +{ + while ((ds->sweeper != ds->current_entry) && + !ds->entries[ds->sweeper].count) { + list_splice_init(&ds->entries[ds->sweeper].work_items, head); + ds->sweeper = ds_next(ds->sweeper); + } + + if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) + list_splice_init(&ds->entries[ds->sweeper].work_items, head); +} + +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&entry->ds->lock, flags); + BUG_ON(!entry->count); + --entry->count; + __sweep(entry->ds, head); + spin_unlock_irqrestore(&entry->ds->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); + +/* + * Returns 1 if deferred or 0 if no pending items to delay job. + */ +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) +{ + int r = 1; + unsigned int next_entry; + + spin_lock_irq(&ds->lock); + if ((ds->sweeper == ds->current_entry) && + !ds->entries[ds->current_entry].count) + r = 0; + else { + list_add(work, &ds->entries[ds->current_entry].work_items); + next_entry = ds_next(ds->current_entry); + if (!ds->entries[next_entry].count) + ds->current_entry = next_entry; + } + spin_unlock_irq(&ds->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); + +/*----------------------------------------------------------------*/ + +static int __init dm_bio_prison_init_v1(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +static void dm_bio_prison_exit_v1(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} + +static int (*_inits[])(void) __initdata = { + dm_bio_prison_init_v1, + dm_bio_prison_init_v2, +}; + +static void (*_exits[])(void) = { + dm_bio_prison_exit_v1, + dm_bio_prison_exit_v2, +}; + +static int __init dm_bio_prison_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i](); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _exits[i](); + + return r; +} + +static void __exit dm_bio_prison_exit(void) +{ + int i = ARRAY_SIZE(_exits); + + while (i--) + _exits[i](); +} + +/* + * module hooks + */ +module_init(dm_bio_prison_init); +module_exit(dm_bio_prison_exit); + +MODULE_DESCRIPTION(DM_NAME " bio prison"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h new file mode 100644 index 000000000..cec52ac5e --- /dev/null +++ b/drivers/md/dm-bio-prison-v1.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2011-2017 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_PRISON_H +#define DM_BIO_PRISON_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include +#include + +/*----------------------------------------------------------------*/ + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison; + +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ +struct dm_cell_key { + int virtual; + dm_thin_id dev; + dm_block_t block_begin, block_end; +}; + +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell { + struct list_head user_list; /* for client use */ + struct rb_node node; + + struct dm_cell_key key; + struct bio *holder; + struct bio_list bios; +}; + +struct dm_bio_prison *dm_bio_prison_create(void); +void dm_bio_prison_destroy(struct dm_bio_prison *prison); + +/* + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. + * + * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called + * in interrupt context or passed GFP_NOWAIT. + */ +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, + gfp_t gfp); +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell); + +/* + * Creates, or retrieves a cell that overlaps the given key. + * + * Returns 1 if pre-existing cell returned, zero if new cell created using + * @cell_prealloc. + */ +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +/* + * An atomic op that combines retrieving or creating a cell, and adding a + * bio to it. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. + */ +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios); +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates); +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, blk_status_t error); + +/* + * Visits the cell and then releases. Guarantees no new inmates are + * inserted between the visit and release. + */ +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, struct dm_bio_prison_cell *cell); + +/* + * Rather than always releasing the prisoners in a cell, the client may + * want to promote one of them to be the new holder. There is a race here + * though between releasing an empty cell, and other threads adding new + * inmates. So this function makes the decision with its lock held. + * + * This function can have two outcomes: + * i) An inmate is promoted to be the holder of the cell (return value of 0). + * ii) The cell has no inmate for promotion and is released (return value of 1). + */ +int dm_cell_promote_or_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell); + +/*----------------------------------------------------------------*/ + +/* + * We use the deferred set to keep track of pending reads to shared blocks. + * We do this to ensure the new mapping caused by a write isn't performed + * until these prior reads have completed. Otherwise the insertion of the + * new mapping could free the old block that the read bios are mapped to. + */ + +struct dm_deferred_set; +struct dm_deferred_entry; + +struct dm_deferred_set *dm_deferred_set_create(void); +void dm_deferred_set_destroy(struct dm_deferred_set *ds); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds); +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head); +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c new file mode 100644 index 000000000..0cc0d13c4 --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.c @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2012-2017 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison-v2.h" + +#include +#include +#include +#include +#include + +/*----------------------------------------------------------------*/ + +#define MIN_CELLS 1024 + +struct dm_bio_prison_v2 { + struct workqueue_struct *wq; + + spinlock_t lock; + struct rb_root cells; + mempool_t cell_pool; +}; + +static struct kmem_cache *_cell_cache; + +/*----------------------------------------------------------------*/ + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) +{ + struct dm_bio_prison_v2 *prison = kzalloc(sizeof(*prison), GFP_KERNEL); + int ret; + + if (!prison) + return NULL; + + prison->wq = wq; + spin_lock_init(&prison->lock); + + ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); + if (ret) { + kfree(prison); + return NULL; + } + + prison->cells = RB_ROOT; + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2); + +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison) +{ + mempool_exit(&prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2); + +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp) +{ + return mempool_alloc(&prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2); + +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + mempool_free(cell, &prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2); + +static void __setup_new_cell(struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell) +{ + memset(cell, 0, sizeof(*cell)); + memcpy(&cell->key, key, sizeof(cell->key)); + bio_list_init(&cell->bios); +} + +static int cmp_keys(struct dm_cell_key_v2 *lhs, + struct dm_cell_key_v2 *rhs) +{ + if (lhs->virtual < rhs->virtual) + return -1; + + if (lhs->virtual > rhs->virtual) + return 1; + + if (lhs->dev < rhs->dev) + return -1; + + if (lhs->dev > rhs->dev) + return 1; + + if (lhs->block_end <= rhs->block_begin) + return -1; + + if (lhs->block_begin >= rhs->block_end) + return 1; + + return 0; +} + +/* + * Returns true if node found, otherwise it inserts a new one. + */ +static bool __find_or_insert(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **result) +{ + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell_v2 *cell = + rb_entry(*new, struct dm_bio_prison_cell_v2, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + + else if (r > 0) + new = &((*new)->rb_right); + + else { + *result = cell; + return true; + } + } + + __setup_new_cell(key, cell_prealloc); + *result = cell_prealloc; + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + + return false; +} + +static bool __get(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell) +{ + if (__find_or_insert(prison, key, cell_prealloc, cell)) { + if ((*cell)->exclusive_lock) { + if (lock_level <= (*cell)->exclusive_level) { + bio_list_add(&(*cell)->bios, inmate); + return false; + } + } + + (*cell)->shared_count++; + + } else + (*cell)->shared_count = 1; + + return true; +} + +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + + spin_lock_irq(&prison->lock); + r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); + spin_unlock_irq(&prison->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_get_v2); + +static bool __put(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + BUG_ON(!cell->shared_count); + cell->shared_count--; + + // FIXME: shared locks granted above the lock level could starve this + if (!cell->shared_count) { + if (cell->exclusive_lock){ + if (cell->quiesce_continuation) { + queue_work(prison->wq, cell->quiesce_continuation); + cell->quiesce_continuation = NULL; + } + } else { + rb_erase(&cell->node, &prison->cells); + return true; + } + } + + return false; +} + +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __put(prison, cell); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_put_v2); + +static int __lock(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + struct dm_bio_prison_cell_v2 *cell; + + if (__find_or_insert(prison, key, cell_prealloc, &cell)) { + if (cell->exclusive_lock) + return -EBUSY; + + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + + // FIXME: we don't yet know what level these shared locks + // were taken at, so have to quiesce them all. + return cell->shared_count > 0; + + } else { + cell = cell_prealloc; + cell->shared_count = 0; + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + } + + return 0; +} + +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + + spin_lock_irq(&prison->lock); + r = __lock(prison, key, lock_level, cell_prealloc, cell_result); + spin_unlock_irq(&prison->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_v2); + +static void __quiesce(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + if (!cell->shared_count) + queue_work(prison->wq, continuation); + else + cell->quiesce_continuation = continuation; +} + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + spin_lock_irq(&prison->lock); + __quiesce(prison, cell, continuation); + spin_unlock_irq(&prison->lock); +} +EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); + +static int __promote(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned int new_lock_level) +{ + if (!cell->exclusive_lock) + return -EINVAL; + + cell->exclusive_level = new_lock_level; + return cell->shared_count > 0; +} + +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned int new_lock_level) +{ + int r; + + spin_lock_irq(&prison->lock); + r = __promote(prison, cell, new_lock_level); + spin_unlock_irq(&prison->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2); + +static bool __unlock(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + BUG_ON(!cell->exclusive_lock); + + bio_list_merge(bios, &cell->bios); + bio_list_init(&cell->bios); + + if (cell->shared_count) { + cell->exclusive_lock = false; + return false; + } + + rb_erase(&cell->node, &prison->cells); + return true; +} + +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + bool r; + + spin_lock_irq(&prison->lock); + r = __unlock(prison, cell, bios); + spin_unlock_irq(&prison->lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_unlock_v2); + +/*----------------------------------------------------------------*/ + +int __init dm_bio_prison_init_v2(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +void dm_bio_prison_exit_v2(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h new file mode 100644 index 000000000..5a7d996bb --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2011-2017 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_PRISON_V2_H +#define DM_BIO_PRISON_V2_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include +#include +#include + +/*----------------------------------------------------------------*/ + +int dm_bio_prison_init_v2(void); +void dm_bio_prison_exit_v2(void); + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison_v2; + +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ +struct dm_cell_key_v2 { + int virtual; + dm_thin_id dev; + dm_block_t block_begin, block_end; +}; + +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell_v2 { + // FIXME: pack these + bool exclusive_lock; + unsigned int exclusive_level; + unsigned int shared_count; + struct work_struct *quiesce_continuation; + + struct rb_node node; + struct dm_cell_key_v2 key; + struct bio_list bios; +}; + +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq); +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison); + +/* + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. + * + * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called + * in interrupt context or passed GFP_NOWAIT. + */ +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, + gfp_t gfp); +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Shared locks have a bio associated with them. + * + * If the lock is granted the caller can continue to use the bio, and must + * call dm_cell_put_v2() to drop the reference count when finished using it. + * + * If the lock cannot be granted then the bio will be tracked within the + * cell, and later given to the holder of the exclusive lock. + * + * See dm_cell_lock_v2() for discussion of the lock_level parameter. + * + * Compare *cell_result with cell_prealloc to see if the prealloc was used. + * If cell_prealloc was used then inmate wasn't added to it. + * + * Returns true if the lock is granted. + */ +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +/* + * Decrement the shared reference count for the lock. Returns true if + * returning ownership of the cell (ie. you should free it). + */ +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Locks a cell. No associated bio. Exclusive locks get priority. These + * locks constrain whether the io locks are granted according to level. + * + * Shared locks will still be granted if the lock_level is > (not = to) the + * exclusive lock level. + * + * If an _exclusive_ lock is already held then -EBUSY is returned. + * + * Return values: + * < 0 - error + * 0 - locked; no quiescing needed + * 1 - locked; quiescing needed + */ +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned int lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation); + +/* + * Promotes an _exclusive_ lock to a higher lock level. + * + * Return values: + * < 0 - error + * 0 - promoted; no quiescing needed + * 1 - promoted; quiescing needed + */ +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned int new_lock_level); + +/* + * Adds any held bios to the bio list. + * + * There may be shared locks still held at this point even if you quiesced + * (ie. different lock levels). + * + * Returns true if returning ownership of the cell (ie. you should free + * it). + */ +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h new file mode 100644 index 000000000..745e3ab4a --- /dev/null +++ b/drivers/md/dm-bio-record.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_RECORD_H +#define DM_BIO_RECORD_H + +#include +#include + +/* + * There are lots of mutable fields in the bio struct that get + * changed by the lower levels of the block layer. Some targets, + * such as multipath, may wish to resubmit a bio on error. The + * functions in this file help the target record and restore the + * original bio state. + */ + +struct dm_bio_details { + struct block_device *bi_bdev; + int __bi_remaining; + unsigned long bi_flags; + struct bvec_iter bi_iter; + bio_end_io_t *bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; +#endif +}; + +static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) +{ + bd->bi_bdev = bio->bi_bdev; + bd->bi_flags = bio->bi_flags; + bd->bi_iter = bio->bi_iter; + bd->__bi_remaining = atomic_read(&bio->__bi_remaining); + bd->bi_end_io = bio->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bd->bi_integrity = bio_integrity(bio); +#endif +} + +static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) +{ + bio->bi_bdev = bd->bi_bdev; + bio->bi_flags = bd->bi_flags; + bio->bi_iter = bd->bi_iter; + atomic_set(&bio->__bi_remaining, bd->__bi_remaining); + bio->bi_end_io = bd->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bio->bi_integrity = bd->bi_integrity; +#endif +} + +#endif diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c new file mode 100644 index 000000000..100a6a236 --- /dev/null +++ b/drivers/md/dm-bufio.c @@ -0,0 +1,2181 @@ +/* + * Copyright (C) 2009-2011 Red Hat, Inc. + * + * Author: Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "bufio" + +/* + * Memory management policy: + * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory + * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower). + * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers. + * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT + * dirty buffers. + */ +#define DM_BUFIO_MIN_BUFFERS 8 + +#define DM_BUFIO_MEMORY_PERCENT 2 +#define DM_BUFIO_VMALLOC_PERCENT 25 +#define DM_BUFIO_WRITEBACK_RATIO 3 +#define DM_BUFIO_LOW_WATERMARK_RATIO 16 + +/* + * Check buffer ages in this interval (seconds) + */ +#define DM_BUFIO_WORK_TIMER_SECS 30 + +/* + * Free buffers when they are older than this (seconds) + */ +#define DM_BUFIO_DEFAULT_AGE_SECS 300 + +/* + * The nr of bytes of cached data to keep around. + */ +#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) + +/* + * Align buffer writes to this boundary. + * Tests show that SSDs have the highest IOPS when using 4k writes. + */ +#define DM_BUFIO_WRITE_ALIGN 4096 + +/* + * dm_buffer->list_mode + */ +#define LIST_CLEAN 0 +#define LIST_DIRTY 1 +#define LIST_SIZE 2 + +/* + * Linking of buffers: + * All buffers are linked to buffer_tree with their node field. + * + * Clean buffers that are not being written (B_WRITING not set) + * are linked to lru[LIST_CLEAN] with their lru_list field. + * + * Dirty and clean buffers that are being written are linked to + * lru[LIST_DIRTY] with their lru_list field. When the write + * finishes, the buffer cannot be relinked immediately (because we + * are in an interrupt context and relinking requires process + * context), so some clean-not-writing buffers can be held on + * dirty_lru too. They are later added to lru in the process + * context. + */ +struct dm_bufio_client { + struct mutex lock; + spinlock_t spinlock; + bool no_sleep; + + struct list_head lru[LIST_SIZE]; + unsigned long n_buffers[LIST_SIZE]; + + struct block_device *bdev; + unsigned int block_size; + s8 sectors_per_block_bits; + void (*alloc_callback)(struct dm_buffer *); + void (*write_callback)(struct dm_buffer *); + struct kmem_cache *slab_buffer; + struct kmem_cache *slab_cache; + struct dm_io_client *dm_io; + + struct list_head reserved_buffers; + unsigned int need_reserved_buffers; + + unsigned int minimum_buffers; + + struct rb_root buffer_tree; + wait_queue_head_t free_buffer_wait; + + sector_t start; + + int async_write_error; + + struct list_head client_list; + + struct shrinker shrinker; + struct work_struct shrink_work; + atomic_long_t need_shrink; +}; + +/* + * Buffer state bits. + */ +#define B_READING 0 +#define B_WRITING 1 +#define B_DIRTY 2 + +/* + * Describes how the block was allocated: + * kmem_cache_alloc(), __get_free_pages() or vmalloc(). + * See the comment at alloc_buffer_data. + */ +enum data_mode { + DATA_MODE_SLAB = 0, + DATA_MODE_GET_FREE_PAGES = 1, + DATA_MODE_VMALLOC = 2, + DATA_MODE_LIMIT = 3 +}; + +struct dm_buffer { + struct rb_node node; + struct list_head lru_list; + struct list_head global_list; + sector_t block; + void *data; + unsigned char data_mode; /* DATA_MODE_* */ + unsigned char list_mode; /* LIST_* */ + blk_status_t read_error; + blk_status_t write_error; + unsigned int accessed; + unsigned int hold_count; + unsigned long state; + unsigned long last_accessed; + unsigned int dirty_start; + unsigned int dirty_end; + unsigned int write_start; + unsigned int write_end; + struct dm_bufio_client *c; + struct list_head write_list; + void (*end_io)(struct dm_buffer *, blk_status_t); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING +#define MAX_STACK 10 + unsigned int stack_len; + unsigned long stack_entries[MAX_STACK]; +#endif +}; + +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + +/*----------------------------------------------------------------*/ + +#define dm_bufio_in_request() (!!current->bio_list) + +static void dm_bufio_lock(struct dm_bufio_client *c) +{ + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_lock_bh(&c->spinlock); + else + mutex_lock_nested(&c->lock, dm_bufio_in_request()); +} + +static int dm_bufio_trylock(struct dm_bufio_client *c) +{ + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return spin_trylock_bh(&c->spinlock); + else + return mutex_trylock(&c->lock); +} + +static void dm_bufio_unlock(struct dm_bufio_client *c) +{ + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_unlock_bh(&c->spinlock); + else + mutex_unlock(&c->lock); +} + +/*----------------------------------------------------------------*/ + +/* + * Default cache size: available memory divided by the ratio. + */ +static unsigned long dm_bufio_default_cache_size; + +/* + * Total cache size set by the user. + */ +static unsigned long dm_bufio_cache_size; + +/* + * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change + * at any time. If it disagrees, the user has changed cache size. + */ +static unsigned long dm_bufio_cache_size_latch; + +static DEFINE_SPINLOCK(global_spinlock); + +static LIST_HEAD(global_queue); + +static unsigned long global_num = 0; + +/* + * Buffers are freed after this timeout + */ +static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; + +static unsigned long dm_bufio_peak_allocated; +static unsigned long dm_bufio_allocated_kmem_cache; +static unsigned long dm_bufio_allocated_get_free_pages; +static unsigned long dm_bufio_allocated_vmalloc; +static unsigned long dm_bufio_current_allocated; + +/*----------------------------------------------------------------*/ + +/* + * The current number of clients. + */ +static int dm_bufio_client_count; + +/* + * The list of all clients. + */ +static LIST_HEAD(dm_bufio_all_clients); + +/* + * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count + */ +static DEFINE_MUTEX(dm_bufio_clients_lock); + +static struct workqueue_struct *dm_bufio_wq; +static struct delayed_work dm_bufio_cleanup_old_work; +static struct work_struct dm_bufio_replacement_work; + + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING +static void buffer_record_stack(struct dm_buffer *b) +{ + b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2); +} +#endif + +/*---------------------------------------------------------------- + * A red/black tree acts as an index for all the buffers. + *--------------------------------------------------------------*/ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + n = block < b->block ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + +static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) +{ + struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; + struct dm_buffer *found; + + while (*new) { + found = container_of(*new, struct dm_buffer, node); + + if (found->block == b->block) { + BUG_ON(found != b); + return; + } + + parent = *new; + new = b->block < found->block ? + &found->node.rb_left : &found->node.rb_right; + } + + rb_link_node(&b->node, parent, new); + rb_insert_color(&b->node, &c->buffer_tree); +} + +static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) +{ + rb_erase(&b->node, &c->buffer_tree); +} + +/*----------------------------------------------------------------*/ + +static void adjust_total_allocated(struct dm_buffer *b, bool unlink) +{ + unsigned char data_mode; + long diff; + + static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { + &dm_bufio_allocated_kmem_cache, + &dm_bufio_allocated_get_free_pages, + &dm_bufio_allocated_vmalloc, + }; + + data_mode = b->data_mode; + diff = (long)b->c->block_size; + if (unlink) + diff = -diff; + + spin_lock(&global_spinlock); + + *class_ptr[data_mode] += diff; + + dm_bufio_current_allocated += diff; + + if (dm_bufio_current_allocated > dm_bufio_peak_allocated) + dm_bufio_peak_allocated = dm_bufio_current_allocated; + + b->accessed = 1; + + if (!unlink) { + list_add(&b->global_list, &global_queue); + global_num++; + if (dm_bufio_current_allocated > dm_bufio_cache_size) + queue_work(dm_bufio_wq, &dm_bufio_replacement_work); + } else { + list_del(&b->global_list); + global_num--; + } + + spin_unlock(&global_spinlock); +} + +/* + * Change the number of clients and recalculate per-client limit. + */ +static void __cache_size_refresh(void) +{ + BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); + BUG_ON(dm_bufio_client_count < 0); + + dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); + + /* + * Use default if set to 0 and report the actual cache size used. + */ + if (!dm_bufio_cache_size_latch) { + (void)cmpxchg(&dm_bufio_cache_size, 0, + dm_bufio_default_cache_size); + dm_bufio_cache_size_latch = dm_bufio_default_cache_size; + } +} + +/* + * Allocating buffer data. + * + * Small buffers are allocated with kmem_cache, to use space optimally. + * + * For large buffers, we choose between get_free_pages and vmalloc. + * Each has advantages and disadvantages. + * + * __get_free_pages can randomly fail if the memory is fragmented. + * __vmalloc won't randomly fail, but vmalloc space is limited (it may be + * as low as 128M) so using it for caching is not appropriate. + * + * If the allocation may fail we use __get_free_pages. Memory fragmentation + * won't have a fatal effect here, but it just causes flushes of some other + * buffers and more I/O will be performed. Don't use __get_free_pages if it + * always fails (i.e. order >= MAX_ORDER). + * + * If the allocation shouldn't fail we use __vmalloc. This is only for the + * initial reserve allocation, so there's no risk of wasting all vmalloc + * space. + */ +static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, + unsigned char *data_mode) +{ + if (unlikely(c->slab_cache != NULL)) { + *data_mode = DATA_MODE_SLAB; + return kmem_cache_alloc(c->slab_cache, gfp_mask); + } + + if (c->block_size <= KMALLOC_MAX_SIZE && + gfp_mask & __GFP_NORETRY) { + *data_mode = DATA_MODE_GET_FREE_PAGES; + return (void *)__get_free_pages(gfp_mask, + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); + } + + *data_mode = DATA_MODE_VMALLOC; + + /* + * __vmalloc allocates the data pages and auxiliary structures with + * gfp_flags that were specified, but pagetables are always allocated + * with GFP_KERNEL, no matter what was specified as gfp_mask. + * + * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that + * all allocations done by this process (including pagetables) are done + * as if GFP_NOIO was specified. + */ + if (gfp_mask & __GFP_NORETRY) { + unsigned int noio_flag = memalloc_noio_save(); + void *ptr = __vmalloc(c->block_size, gfp_mask); + + memalloc_noio_restore(noio_flag); + return ptr; + } + + return __vmalloc(c->block_size, gfp_mask); +} + +/* + * Free buffer's data. + */ +static void free_buffer_data(struct dm_bufio_client *c, + void *data, unsigned char data_mode) +{ + switch (data_mode) { + case DATA_MODE_SLAB: + kmem_cache_free(c->slab_cache, data); + break; + + case DATA_MODE_GET_FREE_PAGES: + free_pages((unsigned long)data, + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); + break; + + case DATA_MODE_VMALLOC: + vfree(data); + break; + + default: + DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d", + data_mode); + BUG(); + } +} + +/* + * Allocate buffer and its data. + */ +static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) +{ + struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask); + + if (!b) + return NULL; + + b->c = c; + + b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); + if (!b->data) { + kmem_cache_free(c->slab_buffer, b); + return NULL; + } + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + b->stack_len = 0; +#endif + return b; +} + +/* + * Free buffer and its data. + */ +static void free_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + free_buffer_data(c, b->data, b->data_mode); + kmem_cache_free(c->slab_buffer, b); +} + +/* + * Link buffer to the buffer tree and clean or dirty queue. + */ +static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) +{ + struct dm_bufio_client *c = b->c; + + c->n_buffers[dirty]++; + b->block = block; + b->list_mode = dirty; + list_add(&b->lru_list, &c->lru[dirty]); + __insert(b->c, b); + b->last_accessed = jiffies; + + adjust_total_allocated(b, false); +} + +/* + * Unlink buffer from the buffer tree and dirty or clean queue. + */ +static void __unlink_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + __remove(b->c, b); + list_del(&b->lru_list); + + adjust_total_allocated(b, true); +} + +/* + * Place the buffer to the head of dirty or clean LRU queue. + */ +static void __relink_lru(struct dm_buffer *b, int dirty) +{ + struct dm_bufio_client *c = b->c; + + b->accessed = 1; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + c->n_buffers[dirty]++; + b->list_mode = dirty; + list_move(&b->lru_list, &c->lru[dirty]); + b->last_accessed = jiffies; +} + +/*---------------------------------------------------------------- + * Submit I/O on the buffer. + * + * Bio interface is faster but it has some problems: + * the vector list is limited (increasing this limit increases + * memory-consumption per buffer, so it is not viable); + * + * the memory must be direct-mapped, not vmalloced; + * + * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and + * it is not vmalloced, try using the bio interface. + * + * If the buffer is big, if it is vmalloced or if the underlying device + * rejects the bio because it is too large, use dm-io layer to do the I/O. + * The dm-io layer splits the I/O into multiple requests, avoiding the above + * shortcomings. + *--------------------------------------------------------------*/ + +/* + * dm-io completion routine. It just calls b->bio.bi_end_io, pretending + * that the request was handled directly with bio interface. + */ +static void dmio_complete(unsigned long error, void *context) +{ + struct dm_buffer *b = context; + + b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0); +} + +static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector, + unsigned int n_sectors, unsigned int offset) +{ + int r; + struct dm_io_request io_req = { + .bi_opf = op, + .notify.fn = dmio_complete, + .notify.context = b, + .client = b->c->dm_io, + }; + struct dm_io_region region = { + .bdev = b->c->bdev, + .sector = sector, + .count = n_sectors, + }; + + if (b->data_mode != DATA_MODE_VMALLOC) { + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = (char *)b->data + offset; + } else { + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.vma = (char *)b->data + offset; + } + + r = dm_io(&io_req, 1, ®ion, NULL); + if (unlikely(r)) + b->end_io(b, errno_to_blk_status(r)); +} + +static void bio_complete(struct bio *bio) +{ + struct dm_buffer *b = bio->bi_private; + blk_status_t status = bio->bi_status; + bio_uninit(bio); + kfree(bio); + b->end_io(b, status); +} + +static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, + unsigned int n_sectors, unsigned int offset) +{ + struct bio *bio; + char *ptr; + unsigned int vec_size, len; + + vec_size = b->c->block_size >> PAGE_SHIFT; + if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT)) + vec_size += 2; + + bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + if (!bio) { +dmio: + use_dmio(b, op, sector, n_sectors, offset); + return; + } + bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op); + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = bio_complete; + bio->bi_private = b; + + ptr = (char *)b->data + offset; + len = n_sectors << SECTOR_SHIFT; + + do { + unsigned int this_step = min((unsigned int)(PAGE_SIZE - offset_in_page(ptr)), len); + if (!bio_add_page(bio, virt_to_page(ptr), this_step, + offset_in_page(ptr))) { + bio_put(bio); + goto dmio; + } + + len -= this_step; + ptr += this_step; + } while (len > 0); + + submit_bio(bio); +} + +static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) +{ + sector_t sector; + + if (likely(c->sectors_per_block_bits >= 0)) + sector = block << c->sectors_per_block_bits; + else + sector = block * (c->block_size >> SECTOR_SHIFT); + sector += c->start; + + return sector; +} + +static void submit_io(struct dm_buffer *b, enum req_op op, + void (*end_io)(struct dm_buffer *, blk_status_t)) +{ + unsigned int n_sectors; + sector_t sector; + unsigned int offset, end; + + b->end_io = end_io; + + sector = block_to_sector(b->c, b->block); + + if (op != REQ_OP_WRITE) { + n_sectors = b->c->block_size >> SECTOR_SHIFT; + offset = 0; + } else { + if (b->c->write_callback) + b->c->write_callback(b); + offset = b->write_start; + end = b->write_end; + offset &= -DM_BUFIO_WRITE_ALIGN; + end += DM_BUFIO_WRITE_ALIGN - 1; + end &= -DM_BUFIO_WRITE_ALIGN; + if (unlikely(end > b->c->block_size)) + end = b->c->block_size; + + sector += offset >> SECTOR_SHIFT; + n_sectors = (end - offset) >> SECTOR_SHIFT; + } + + if (b->data_mode != DATA_MODE_VMALLOC) + use_bio(b, op, sector, n_sectors, offset); + else + use_dmio(b, op, sector, n_sectors, offset); +} + +/*---------------------------------------------------------------- + * Writing dirty buffers + *--------------------------------------------------------------*/ + +/* + * The endio routine for write. + * + * Set the error, clear B_WRITING bit and wake anyone who was waiting on + * it. + */ +static void write_endio(struct dm_buffer *b, blk_status_t status) +{ + b->write_error = status; + if (unlikely(status)) { + struct dm_bufio_client *c = b->c; + + (void)cmpxchg(&c->async_write_error, 0, + blk_status_to_errno(status)); + } + + BUG_ON(!test_bit(B_WRITING, &b->state)); + + smp_mb__before_atomic(); + clear_bit(B_WRITING, &b->state); + smp_mb__after_atomic(); + + wake_up_bit(&b->state, B_WRITING); +} + +/* + * Initiate a write on a dirty buffer, but don't wait for it. + * + * - If the buffer is not dirty, exit. + * - If there some previous write going on, wait for it to finish (we can't + * have two writes on the same buffer simultaneously). + * - Submit our write and don't wait on it. We set B_WRITING indicating + * that there is a write in progress. + */ +static void __write_dirty_buffer(struct dm_buffer *b, + struct list_head *write_list) +{ + if (!test_bit(B_DIRTY, &b->state)) + return; + + clear_bit(B_DIRTY, &b->state); + wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + + b->write_start = b->dirty_start; + b->write_end = b->dirty_end; + + if (!write_list) + submit_io(b, REQ_OP_WRITE, write_endio); + else + list_add_tail(&b->write_list, write_list); +} + +static void __flush_write_list(struct list_head *write_list) +{ + struct blk_plug plug; + blk_start_plug(&plug); + while (!list_empty(write_list)) { + struct dm_buffer *b = + list_entry(write_list->next, struct dm_buffer, write_list); + list_del(&b->write_list); + submit_io(b, REQ_OP_WRITE, write_endio); + cond_resched(); + } + blk_finish_plug(&plug); +} + +/* + * Wait until any activity on the buffer finishes. Possibly write the + * buffer if it is dirty. When this function finishes, there is no I/O + * running on the buffer and the buffer is not dirty. + */ +static void __make_buffer_clean(struct dm_buffer *b) +{ + BUG_ON(b->hold_count); + + /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */ + if (!smp_load_acquire(&b->state)) /* fast case */ + return; + + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + __write_dirty_buffer(b, NULL); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); +} + +/* + * Find some buffer that is not held by anybody, clean it, unlink it and + * return it. + */ +static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + + list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { + BUG_ON(test_bit(B_WRITING, &b->state)); + BUG_ON(test_bit(B_DIRTY, &b->state)); + + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && + unlikely(test_bit_acquire(B_READING, &b->state))) + continue; + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + cond_resched(); + } + + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return NULL; + + list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + cond_resched(); + } + + return NULL; +} + +/* + * Wait until some other threads free some buffer or release hold count on + * some buffer. + * + * This function is entered with c->lock held, drops it and regains it + * before exiting. + */ +static void __wait_for_free_buffer(struct dm_bufio_client *c) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&c->free_buffer_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + dm_bufio_unlock(c); + + io_schedule(); + + remove_wait_queue(&c->free_buffer_wait, &wait); + + dm_bufio_lock(c); +} + +enum new_flag { + NF_FRESH = 0, + NF_READ = 1, + NF_GET = 2, + NF_PREFETCH = 3 +}; + +/* + * Allocate a new buffer. If the allocation is not possible, wait until + * some other thread frees a buffer. + * + * May drop the lock and regain it. + */ +static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b; + bool tried_noio_alloc = false; + + /* + * dm-bufio is resistant to allocation failures (it just keeps + * one buffer reserved in cases all the allocations fail). + * So set flags to not try too hard: + * GFP_NOWAIT: don't wait; if we need to sleep we'll release our + * mutex and wait ourselves. + * __GFP_NORETRY: don't retry and rather return failure + * __GFP_NOMEMALLOC: don't use emergency reserves + * __GFP_NOWARN: don't print a warning in case of failure + * + * For debugging, if we set the cache size to 1, no new buffers will + * be allocated. + */ + while (1) { + if (dm_bufio_cache_size_latch != 1) { + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (b) + return b; + } + + if (nf == NF_PREFETCH) + return NULL; + + if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) { + dm_bufio_unlock(c); + b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + dm_bufio_lock(c); + if (b) + return b; + tried_noio_alloc = true; + } + + if (!list_empty(&c->reserved_buffers)) { + b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + c->need_reserved_buffers++; + + return b; + } + + b = __get_unclaimed_buffer(c); + if (b) + return b; + + __wait_for_free_buffer(c); + } +} + +static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); + + if (!b) + return NULL; + + if (c->alloc_callback) + c->alloc_callback(b); + + return b; +} + +/* + * Free a buffer and wake other threads waiting for free buffers. + */ +static void __free_buffer_wake(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + if (!c->need_reserved_buffers) + free_buffer(b); + else { + list_add(&b->lru_list, &c->reserved_buffers); + c->need_reserved_buffers--; + } + + wake_up(&c->free_buffer_wait); +} + +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, + struct list_head *write_list) +{ + struct dm_buffer *b, *tmp; + + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) { + __relink_lru(b, LIST_CLEAN); + continue; + } + + if (no_wait && test_bit(B_WRITING, &b->state)) + return; + + __write_dirty_buffer(b, write_list); + cond_resched(); + } +} + +/* + * Check if we're over watermark. + * If we are over threshold_buffers, start freeing buffers. + * If we're over "limit_buffers", block until we get under the limit. + */ +static void __check_watermark(struct dm_bufio_client *c, + struct list_head *write_list) +{ + if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO) + __write_dirty_buffers_async(c, 1, write_list); +} + +/*---------------------------------------------------------------- + * Getting a buffer + *--------------------------------------------------------------*/ + +static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, int *need_submit, + struct list_head *write_list) +{ + struct dm_buffer *b, *new_b = NULL; + + *need_submit = 0; + + b = __find(c, block); + if (b) + goto found_buffer; + + if (nf == NF_GET) + return NULL; + + new_b = __alloc_buffer_wait(c, nf); + if (!new_b) + return NULL; + + /* + * We've had a period where the mutex was unlocked, so need to + * recheck the buffer tree. + */ + b = __find(c, block); + if (b) { + __free_buffer_wake(new_b); + goto found_buffer; + } + + __check_watermark(c, write_list); + + b = new_b; + b->hold_count = 1; + b->read_error = 0; + b->write_error = 0; + __link_buffer(b, block, LIST_CLEAN); + + if (nf == NF_FRESH) { + b->state = 0; + return b; + } + + b->state = 1 << B_READING; + *need_submit = 1; + + return b; + +found_buffer: + if (nf == NF_PREFETCH) + return NULL; + /* + * Note: it is essential that we don't wait for the buffer to be + * read if dm_bufio_get function is used. Both dm_bufio_get and + * dm_bufio_prefetch can be used in the driver request routine. + * If the user called both dm_bufio_prefetch and dm_bufio_get on + * the same buffer, it would deadlock if we waited. + */ + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) + return NULL; + + b->hold_count++; + __relink_lru(b, test_bit(B_DIRTY, &b->state) || + test_bit(B_WRITING, &b->state)); + return b; +} + +/* + * The endio routine for reading: set the error, clear the bit and wake up + * anyone waiting on the buffer. + */ +static void read_endio(struct dm_buffer *b, blk_status_t status) +{ + b->read_error = status; + + BUG_ON(!test_bit(B_READING, &b->state)); + + smp_mb__before_atomic(); + clear_bit(B_READING, &b->state); + smp_mb__after_atomic(); + + wake_up_bit(&b->state, B_READING); +} + +/* + * A common routine for dm_bufio_new and dm_bufio_read. Operation of these + * functions is similar except that dm_bufio_new doesn't read the + * buffer from the disk (assuming that the caller overwrites all the data + * and uses dm_bufio_mark_buffer_dirty to write new data back). + */ +static void *new_read(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, struct dm_buffer **bp) +{ + int need_submit; + struct dm_buffer *b; + + LIST_HEAD(write_list); + + dm_bufio_lock(c); + b = __bufio_new(c, block, nf, &need_submit, &write_list); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + if (b && b->hold_count == 1) + buffer_record_stack(b); +#endif + dm_bufio_unlock(c); + + __flush_write_list(&write_list); + + if (!b) + return NULL; + + if (need_submit) + submit_io(b, REQ_OP_READ, read_endio); + + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + + if (b->read_error) { + int error = blk_status_to_errno(b->read_error); + + dm_bufio_release(b); + + return ERR_PTR(error); + } + + *bp = b; + + return b->data; +} + +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + return new_read(c, block, NF_GET, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_get); + +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_READ, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_read); + +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_FRESH, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_new); + +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned int n_blocks) +{ + struct blk_plug plug; + + LIST_HEAD(write_list); + + BUG_ON(dm_bufio_in_request()); + + blk_start_plug(&plug); + dm_bufio_lock(c); + + for (; n_blocks--; block++) { + int need_submit; + struct dm_buffer *b; + b = __bufio_new(c, block, NF_PREFETCH, &need_submit, + &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + blk_finish_plug(&plug); + __flush_write_list(&write_list); + blk_start_plug(&plug); + dm_bufio_lock(c); + } + if (unlikely(b != NULL)) { + dm_bufio_unlock(c); + + if (need_submit) + submit_io(b, REQ_OP_READ, read_endio); + dm_bufio_release(b); + + cond_resched(); + + if (!n_blocks) + goto flush_plug; + dm_bufio_lock(c); + } + } + + dm_bufio_unlock(c); + +flush_plug: + blk_finish_plug(&plug); +} +EXPORT_SYMBOL_GPL(dm_bufio_prefetch); + +void dm_bufio_release(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + dm_bufio_lock(c); + + BUG_ON(!b->hold_count); + + b->hold_count--; + if (!b->hold_count) { + wake_up(&c->free_buffer_wait); + + /* + * If there were errors on the buffer, and the buffer is not + * to be written, free the buffer. There is no point in caching + * invalid buffer. + */ + if ((b->read_error || b->write_error) && + !test_bit_acquire(B_READING, &b->state) && + !test_bit(B_WRITING, &b->state) && + !test_bit(B_DIRTY, &b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_release); + +void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, + unsigned int start, unsigned int end) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(start >= end); + BUG_ON(end > b->c->block_size); + + dm_bufio_lock(c); + + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_and_set_bit(B_DIRTY, &b->state)) { + b->dirty_start = start; + b->dirty_end = end; + __relink_lru(b, LIST_DIRTY); + } else { + if (start < b->dirty_start) + b->dirty_start = start; + if (end > b->dirty_end) + b->dirty_end = end; + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty); + +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) +{ + dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size); +} +EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); + +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) +{ + LIST_HEAD(write_list); + + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); + +/* + * For performance, it is essential that the buffers are written asynchronously + * and simultaneously (so that the block layer can merge the writes) and then + * waited upon. + * + * Finally, we flush hardware disk cache. + */ +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) +{ + int a, f; + unsigned long buffers_processed = 0; + struct dm_buffer *b, *tmp; + + LIST_HEAD(write_list); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + +again: + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + int dropped_lock = 0; + + if (buffers_processed < c->n_buffers[LIST_DIRTY]) + buffers_processed++; + + BUG_ON(test_bit(B_READING, &b->state)); + + if (test_bit(B_WRITING, &b->state)) { + if (buffers_processed < c->n_buffers[LIST_DIRTY]) { + dropped_lock = 1; + b->hold_count++; + dm_bufio_unlock(c); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); + dm_bufio_lock(c); + b->hold_count--; + } else + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); + } + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) + __relink_lru(b, LIST_CLEAN); + + cond_resched(); + + /* + * If we dropped the lock, the list is no longer consistent, + * so we must restart the search. + * + * In the most common case, the buffer just processed is + * relinked to the clean list, so we won't loop scanning the + * same buffer again and again. + * + * This may livelock if there is another thread simultaneously + * dirtying buffers, so we count the number of buffers walked + * and if it exceeds the total number of buffers, it means that + * someone is doing some writes simultaneously with us. In + * this case, stop, dropping the lock. + */ + if (dropped_lock) + goto again; + } + wake_up(&c->free_buffer_wait); + dm_bufio_unlock(c); + + a = xchg(&c->async_write_error, 0); + f = dm_bufio_issue_flush(c); + if (a) + return a; + + return f; +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); + +/* + * Use dm-io to send an empty barrier to flush the device. + */ +int dm_bufio_issue_flush(struct dm_bufio_client *c) +{ + struct dm_io_request io_req = { + .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = 0, + .count = 0, + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); + +/* + * Use dm-io to send a discard request to flush the device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + struct dm_io_request io_req = { + .bi_opf = REQ_OP_DISCARD | REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = block_to_sector(c, block), + .count = block_to_sector(c, count), + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); + +/* + * We first delete any other buffer that may be at that new location. + * + * Then, we write the buffer to the original location if it was dirty. + * + * Then, if we are the only one who is holding the buffer, relink the buffer + * in the buffer tree for the new location. + * + * If there was someone else holding the buffer, we write it to the new + * location but not relink it, because that other user needs to have the buffer + * at the same place. + */ +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block) +{ + struct dm_bufio_client *c = b->c; + struct dm_buffer *new; + + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + +retry: + new = __find(c, new_block); + if (new) { + if (new->hold_count) { + __wait_for_free_buffer(c); + goto retry; + } + + /* + * FIXME: Is there any point waiting for a write that's going + * to be overwritten in a bit? + */ + __make_buffer_clean(new); + __unlink_buffer(new); + __free_buffer_wake(new); + } + + BUG_ON(!b->hold_count); + BUG_ON(test_bit(B_READING, &b->state)); + + __write_dirty_buffer(b, NULL); + if (b->hold_count == 1) { + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); + set_bit(B_DIRTY, &b->state); + b->dirty_start = 0; + b->dirty_end = c->block_size; + __unlink_buffer(b); + __link_buffer(b, new_block, LIST_DIRTY); + } else { + sector_t old_block; + wait_on_bit_lock_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); + /* + * Relink buffer to "new_block" so that write_callback + * sees "new_block" as a block number. + * After the write, link the buffer back to old_block. + * All this must be done in bufio lock, so that block number + * change isn't visible to other threads. + */ + old_block = b->block; + __unlink_buffer(b); + __link_buffer(b, new_block, b->list_mode); + submit_io(b, REQ_OP_WRITE, write_endio); + wait_on_bit_io(&b->state, B_WRITING, + TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __link_buffer(b, old_block, b->list_mode); + } + + dm_bufio_unlock(c); + dm_bufio_release(b); +} +EXPORT_SYMBOL_GPL(dm_bufio_release_move); + +static void forget_buffer_locked(struct dm_buffer *b) +{ + if (likely(!b->hold_count) && likely(!smp_load_acquire(&b->state))) { + __unlink_buffer(b); + __free_buffer_wake(b); + } +} + +/* + * Free the given buffer. + * + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + + dm_bufio_lock(c); + + b = __find(c, block); + if (b) + forget_buffer_locked(b); + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_forget); + +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + struct dm_buffer *b; + sector_t end_block = block + n_blocks; + + while (block < end_block) { + dm_bufio_lock(c); + + b = __find_next(c, block); + if (b) { + block = b->block + 1; + forget_buffer_locked(b); + } + + dm_bufio_unlock(c); + + if (!b) + break; + } + +} +EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); + +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n) +{ + c->minimum_buffers = n; +} +EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers); + +unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c) +{ + return c->block_size; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); + +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) +{ + sector_t s = bdev_nr_sectors(c->bdev); + if (s >= c->start) + s -= c->start; + else + s = 0; + if (likely(c->sectors_per_block_bits >= 0)) + s >>= c->sectors_per_block_bits; + else + sector_div(s, c->block_size >> SECTOR_SHIFT); + return s; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); + +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c) +{ + return c->dm_io; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client); + +sector_t dm_bufio_get_block_number(struct dm_buffer *b) +{ + return b->block; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); + +void *dm_bufio_get_block_data(struct dm_buffer *b) +{ + return b->data; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); + +void *dm_bufio_get_aux_data(struct dm_buffer *b) +{ + return b + 1; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); + +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) +{ + return b->c; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_client); + +static void drop_buffers(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + int i; + bool warned = false; + + BUG_ON(dm_bufio_in_request()); + + /* + * An optimization so that the buffers are not written one-by-one. + */ + dm_bufio_write_dirty_buffers_async(c); + + dm_bufio_lock(c); + + while ((b = __get_unclaimed_buffer(c))) + __free_buffer_wake(b); + + for (i = 0; i < LIST_SIZE; i++) + list_for_each_entry(b, &c->lru[i], lru_list) { + WARN_ON(!warned); + warned = true; + DMERR("leaked buffer %llx, hold count %u, list %d", + (unsigned long long)b->block, b->hold_count, i); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + stack_trace_print(b->stack_entries, b->stack_len, 1); + /* mark unclaimed to avoid BUG_ON below */ + b->hold_count = 0; +#endif + } + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + while ((b = __get_unclaimed_buffer(c))) + __free_buffer_wake(b); +#endif + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(!list_empty(&c->lru[i])); + + dm_bufio_unlock(c); +} + +/* + * We may not be able to evict this buffer if IO pending or the client + * is still using it. Caller is expected to know buffer is too old. + * + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. + */ +static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) +{ + if (!(gfp & __GFP_FS) || + (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { + if (test_bit_acquire(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return false; + } + + if (b->hold_count) + return false; + + __make_buffer_clean(b); + __unlink_buffer(b); + __free_buffer_wake(b); + + return true; +} + +static unsigned long get_retain_buffers(struct dm_bufio_client *c) +{ + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); + if (likely(c->sectors_per_block_bits >= 0)) + retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT; + else + retain_bytes /= c->block_size; + return retain_bytes; +} + +static void __scan(struct dm_bufio_client *c) +{ + int l; + struct dm_buffer *b, *tmp; + unsigned long freed = 0; + unsigned long count = c->n_buffers[LIST_CLEAN] + + c->n_buffers[LIST_DIRTY]; + unsigned long retain_target = get_retain_buffers(c); + + for (l = 0; l < LIST_SIZE; l++) { + list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { + if (count - freed <= retain_target) + atomic_long_set(&c->need_shrink, 0); + if (!atomic_long_read(&c->need_shrink)) + return; + if (__try_evict_buffer(b, GFP_KERNEL)) { + atomic_long_dec(&c->need_shrink); + freed++; + } + cond_resched(); + } + } +} + +static void shrink_work(struct work_struct *w) +{ + struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work); + + dm_bufio_lock(c); + __scan(c); + dm_bufio_unlock(c); +} + +static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + struct dm_bufio_client *c; + + c = container_of(shrink, struct dm_bufio_client, shrinker); + atomic_long_add(sc->nr_to_scan, &c->need_shrink); + queue_work(dm_bufio_wq, &c->shrink_work); + + return sc->nr_to_scan; +} + +static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); + unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + + READ_ONCE(c->n_buffers[LIST_DIRTY]); + unsigned long retain_target = get_retain_buffers(c); + unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); + + if (unlikely(count < retain_target)) + count = 0; + else + count -= retain_target; + + if (unlikely(count < queued_for_cleanup)) + count = 0; + else + count -= queued_for_cleanup; + + return count; +} + +/* + * Create the buffering interface + */ +struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size, + unsigned int reserved_buffers, unsigned int aux_size, + void (*alloc_callback)(struct dm_buffer *), + void (*write_callback)(struct dm_buffer *), + unsigned int flags) +{ + int r; + struct dm_bufio_client *c; + unsigned int i; + char slab_name[27]; + + if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { + DMERR("%s: block size not specified or is not multiple of 512b", __func__); + r = -EINVAL; + goto bad_client; + } + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) { + r = -ENOMEM; + goto bad_client; + } + c->buffer_tree = RB_ROOT; + + c->bdev = bdev; + c->block_size = block_size; + if (is_power_of_2(block_size)) + c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; + else + c->sectors_per_block_bits = -1; + + c->alloc_callback = alloc_callback; + c->write_callback = write_callback; + + if (flags & DM_BUFIO_CLIENT_NO_SLEEP) { + c->no_sleep = true; + static_branch_inc(&no_sleep_enabled); + } + + for (i = 0; i < LIST_SIZE; i++) { + INIT_LIST_HEAD(&c->lru[i]); + c->n_buffers[i] = 0; + } + + mutex_init(&c->lock); + spin_lock_init(&c->spinlock); + INIT_LIST_HEAD(&c->reserved_buffers); + c->need_reserved_buffers = reserved_buffers; + + dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS); + + init_waitqueue_head(&c->free_buffer_wait); + c->async_write_error = 0; + + c->dm_io = dm_io_client_create(); + if (IS_ERR(c->dm_io)) { + r = PTR_ERR(c->dm_io); + goto bad_dm_io; + } + + if (block_size <= KMALLOC_MAX_SIZE && + (block_size < PAGE_SIZE || !is_power_of_2(block_size))) { + unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE); + snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size); + c->slab_cache = kmem_cache_create(slab_name, block_size, align, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!c->slab_cache) { + r = -ENOMEM; + goto bad; + } + } + if (aux_size) + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size); + else + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer"); + c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size, + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (!c->slab_buffer) { + r = -ENOMEM; + goto bad; + } + + while (c->need_reserved_buffers) { + struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); + + if (!b) { + r = -ENOMEM; + goto bad; + } + __free_buffer_wake(b); + } + + INIT_WORK(&c->shrink_work, shrink_work); + atomic_long_set(&c->need_shrink, 0); + + c->shrinker.count_objects = dm_bufio_shrink_count; + c->shrinker.scan_objects = dm_bufio_shrink_scan; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (r) + goto bad; + + mutex_lock(&dm_bufio_clients_lock); + dm_bufio_client_count++; + list_add(&c->client_list, &dm_bufio_all_clients); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + return c; + +bad: + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); + dm_io_client_destroy(c->dm_io); +bad_dm_io: + mutex_destroy(&c->lock); + if (c->no_sleep) + static_branch_dec(&no_sleep_enabled); + kfree(c); +bad_client: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_create); + +/* + * Free the buffering interface. + * It is required that there are no references on any buffers. + */ +void dm_bufio_client_destroy(struct dm_bufio_client *c) +{ + unsigned int i; + + drop_buffers(c); + + unregister_shrinker(&c->shrinker); + flush_work(&c->shrink_work); + + mutex_lock(&dm_bufio_clients_lock); + + list_del(&c->client_list); + dm_bufio_client_count--; + __cache_size_refresh(); + + mutex_unlock(&dm_bufio_clients_lock); + + BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); + BUG_ON(c->need_reserved_buffers); + + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + + for (i = 0; i < LIST_SIZE; i++) + if (c->n_buffers[i]) + DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(c->n_buffers[i]); + + kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); + dm_io_client_destroy(c->dm_io); + mutex_destroy(&c->lock); + if (c->no_sleep) + static_branch_dec(&no_sleep_enabled); + kfree(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); + +void dm_bufio_client_reset(struct dm_bufio_client *c) +{ + drop_buffers(c); + flush_work(&c->shrink_work); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_reset); + +void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) +{ + c->start = start; +} +EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); + +static unsigned int get_max_age_hz(void) +{ + unsigned int max_age = READ_ONCE(dm_bufio_max_age); + + if (max_age > UINT_MAX / HZ) + max_age = UINT_MAX / HZ; + + return max_age * HZ; +} + +static bool older_than(struct dm_buffer *b, unsigned long age_hz) +{ + return time_after_eq(jiffies, b->last_accessed + age_hz); +} + +static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +{ + struct dm_buffer *b, *tmp; + unsigned long retain_target = get_retain_buffers(c); + unsigned long count; + LIST_HEAD(write_list); + + dm_bufio_lock(c); + + __check_watermark(c, &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + } + + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { + if (count <= retain_target) + break; + + if (!older_than(b, age_hz)) + break; + + if (__try_evict_buffer(b, 0)) + count--; + + cond_resched(); + } + + dm_bufio_unlock(c); +} + +static void do_global_cleanup(struct work_struct *w) +{ + struct dm_bufio_client *locked_client = NULL; + struct dm_bufio_client *current_client; + struct dm_buffer *b; + unsigned int spinlock_hold_count; + unsigned long threshold = dm_bufio_cache_size - + dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; + unsigned long loops = global_num * 2; + + mutex_lock(&dm_bufio_clients_lock); + + while (1) { + cond_resched(); + + spin_lock(&global_spinlock); + if (unlikely(dm_bufio_current_allocated <= threshold)) + break; + + spinlock_hold_count = 0; +get_next: + if (!loops--) + break; + if (unlikely(list_empty(&global_queue))) + break; + b = list_entry(global_queue.prev, struct dm_buffer, global_list); + + if (b->accessed) { + b->accessed = 0; + list_move(&b->global_list, &global_queue); + if (likely(++spinlock_hold_count < 16)) + goto get_next; + spin_unlock(&global_spinlock); + continue; + } + + current_client = b->c; + if (unlikely(current_client != locked_client)) { + if (locked_client) + dm_bufio_unlock(locked_client); + + if (!dm_bufio_trylock(current_client)) { + spin_unlock(&global_spinlock); + dm_bufio_lock(current_client); + locked_client = current_client; + continue; + } + + locked_client = current_client; + } + + spin_unlock(&global_spinlock); + + if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) { + spin_lock(&global_spinlock); + list_move(&b->global_list, &global_queue); + spin_unlock(&global_spinlock); + } + } + + spin_unlock(&global_spinlock); + + if (locked_client) + dm_bufio_unlock(locked_client); + + mutex_unlock(&dm_bufio_clients_lock); +} + +static void cleanup_old_buffers(void) +{ + unsigned long max_age_hz = get_max_age_hz(); + struct dm_bufio_client *c; + + mutex_lock(&dm_bufio_clients_lock); + + __cache_size_refresh(); + + list_for_each_entry(c, &dm_bufio_all_clients, client_list) + __evict_old_buffers(c, max_age_hz); + + mutex_unlock(&dm_bufio_clients_lock); +} + +static void work_fn(struct work_struct *w) +{ + cleanup_old_buffers(); + + queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); +} + +/*---------------------------------------------------------------- + * Module setup + *--------------------------------------------------------------*/ + +/* + * This is called only once for the whole dm_bufio module. + * It initializes memory limit. + */ +static int __init dm_bufio_init(void) +{ + __u64 mem; + + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + + mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), + DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; + + if (mem > ULONG_MAX) + mem = ULONG_MAX; + +#ifdef CONFIG_MMU + if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100)) + mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100); +#endif + + dm_bufio_default_cache_size = mem; + + mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0); + if (!dm_bufio_wq) + return -ENOMEM; + + INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); + INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); + queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); + + return 0; +} + +/* + * This is called once when unloading the dm_bufio module. + */ +static void __exit dm_bufio_exit(void) +{ + int bug = 0; + + cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); + destroy_workqueue(dm_bufio_wq); + + if (dm_bufio_client_count) { + DMCRIT("%s: dm_bufio_client_count leaked: %d", + __func__, dm_bufio_client_count); + bug = 1; + } + + if (dm_bufio_current_allocated) { + DMCRIT("%s: dm_bufio_current_allocated leaked: %lu", + __func__, dm_bufio_current_allocated); + bug = 1; + } + + if (dm_bufio_allocated_get_free_pages) { + DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu", + __func__, dm_bufio_allocated_get_free_pages); + bug = 1; + } + + if (dm_bufio_allocated_vmalloc) { + DMCRIT("%s: dm_bufio_vmalloc leaked: %lu", + __func__, dm_bufio_allocated_vmalloc); + bug = 1; + } + + BUG_ON(bug); +} + +module_init(dm_bufio_init) +module_exit(dm_bufio_exit) + +module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); + +module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); + +module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); + +module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); + +module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc"); + +module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages"); + +module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc"); + +module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO); +MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache"); + +MODULE_AUTHOR("Mikulas Patocka "); +MODULE_DESCRIPTION(DM_NAME " buffered I/O library"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 000000000..8eb52e425 --- /dev/null +++ b/drivers/md/dm-builtin.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "dm-core.h" + +/* + * The kobject release method must not be placed in the module itself, + * otherwise we are subject to module unload races. + * + * The release method is called when the last reference to the kobject is + * dropped. It may be called by any other kernel code that drops the last + * reference. + * + * The release method suffers from module unload race. We may prevent the + * module from being unloaded at the start of the release method (using + * increased module reference count or synchronizing against the release + * method), however there is no way to prevent the module from being + * unloaded at the end of the release method. + * + * If this code were placed in the dm module, the following race may + * happen: + * 1. Some other process takes a reference to dm kobject + * 2. The user issues ioctl function to unload the dm device + * 3. dm_sysfs_exit calls kobject_put, however the object is not released + * because of the other reference taken at step 1 + * 4. dm_sysfs_exit waits on the completion + * 5. The other process that took the reference in step 1 drops it, + * dm_kobject_release is called from this process + * 6. dm_kobject_release calls complete() + * 7. a reschedule happens before dm_kobject_release returns + * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference + * count is decremented + * 9. The user unloads the dm module + * 10. The other process that was rescheduled in step 7 continues to run, + * it is now executing code in unloaded module, so it crashes + * + * Note that if the process that takes the foreign reference to dm kobject + * has a low priority and the system is sufficiently loaded with + * higher-priority processes that prevent the low-priority process from + * being scheduled long enough, this bug may really happen. + * + * In order to fix this module unload race, we place the release method + * into a helper code that is compiled directly into the kernel. + */ + +void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + +EXPORT_SYMBOL(dm_kobject_release); diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000..c606e6bfc --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-background-tracker.h" + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "dm-background-tracker" + +struct bt_work { + struct list_head list; + struct rb_node node; + struct policy_work work; +}; + +struct background_tracker { + unsigned int max_work; + atomic_t pending_promotes; + atomic_t pending_writebacks; + atomic_t pending_demotes; + + struct list_head issued; + struct list_head queued; + struct rb_root pending; + + struct kmem_cache *work_cache; +}; + +struct background_tracker *btracker_create(unsigned int max_work) +{ + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + + if (!b) { + DMERR("couldn't create background_tracker"); + return NULL; + } + + b->max_work = max_work; + atomic_set(&b->pending_promotes, 0); + atomic_set(&b->pending_writebacks, 0); + atomic_set(&b->pending_demotes, 0); + + INIT_LIST_HEAD(&b->issued); + INIT_LIST_HEAD(&b->queued); + + b->pending = RB_ROOT; + b->work_cache = KMEM_CACHE(bt_work, 0); + if (!b->work_cache) { + DMERR("couldn't create mempool for background work items"); + kfree(b); + b = NULL; + } + + return b; +} +EXPORT_SYMBOL_GPL(btracker_create); + +void btracker_destroy(struct background_tracker *b) +{ + struct bt_work *w, *tmp; + + BUG_ON(!list_empty(&b->issued)); + list_for_each_entry_safe (w, tmp, &b->queued, list) { + list_del(&w->list); + kmem_cache_free(b->work_cache, w); + } + + kmem_cache_destroy(b->work_cache); + kfree(b); +} +EXPORT_SYMBOL_GPL(btracker_destroy); + +static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) +{ + if (from_oblock(lhs) < from_oblock(rhs)) + return -1; + + if (from_oblock(rhs) < from_oblock(lhs)) + return 1; + + return 0; +} + +static bool __insert_pending(struct background_tracker *b, + struct bt_work *nw) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node, *parent = NULL; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + parent = *new; + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + /* already present */ + return false; + } + + rb_link_node(&nw->node, parent, new); + rb_insert_color(&nw->node, &b->pending); + + return true; +} + +static struct bt_work *__find_pending(struct background_tracker *b, + dm_oblock_t oblock) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + cmp = cmp_oblock(w->work.oblock, oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + break; + } + + return *new ? w : NULL; +} + + +static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) +{ + switch (w->op) { + case POLICY_PROMOTE: + atomic_add(delta, &b->pending_promotes); + break; + + case POLICY_DEMOTE: + atomic_add(delta, &b->pending_demotes); + break; + + case POLICY_WRITEBACK: + atomic_add(delta, &b->pending_writebacks); + break; + } +} + +unsigned int btracker_nr_writebacks_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_writebacks); +} +EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); + +unsigned int btracker_nr_demotions_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_demotes); +} +EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); + +static bool max_work_reached(struct background_tracker *b) +{ + return atomic_read(&b->pending_promotes) + + atomic_read(&b->pending_writebacks) + + atomic_read(&b->pending_demotes) >= b->max_work; +} + +static struct bt_work *alloc_work(struct background_tracker *b) +{ + if (max_work_reached(b)) + return NULL; + + return kmem_cache_alloc(b->work_cache, GFP_NOWAIT); +} + +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork) +{ + struct bt_work *w; + + if (pwork) + *pwork = NULL; + + w = alloc_work(b); + if (!w) + return -ENOMEM; + + memcpy(&w->work, work, sizeof(*work)); + + if (!__insert_pending(b, w)) { + /* + * There was a race, we'll just ignore this second + * bit of work for the same oblock. + */ + kmem_cache_free(b->work_cache, w); + return -EINVAL; + } + + if (pwork) { + *pwork = &w->work; + list_add(&w->list, &b->issued); + } else + list_add(&w->list, &b->queued); + update_stats(b, &w->work, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_queue); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work) +{ + struct bt_work *w; + + if (list_empty(&b->queued)) + return -ENODATA; + + w = list_first_entry(&b->queued, struct bt_work, list); + list_move(&w->list, &b->issued); + *work = &w->work; + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_issue); + +void btracker_complete(struct background_tracker *b, + struct policy_work *op) +{ + struct bt_work *w = container_of(op, struct bt_work, work); + + update_stats(b, &w->work, -1); + rb_erase(&w->node, &b->pending); + list_del(&w->list); + kmem_cache_free(b->work_cache, w); +} +EXPORT_SYMBOL_GPL(btracker_complete); + +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock) +{ + return __find_pending(b, oblock) != NULL; +} +EXPORT_SYMBOL_GPL(btracker_promotion_already_present); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000..14d3d53dc --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BACKGROUND_WORK_H +#define DM_CACHE_BACKGROUND_WORK_H + +#include +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +/* + * The cache policy decides what background work should be performed, + * such as promotions, demotions and writebacks. The core cache target + * is in charge of performing the work, and does so when it sees fit. + * + * The background_tracker acts as a go between. Keeping track of future + * work that the policy has decided upon, and handing (issuing) it to + * the core target when requested. + * + * There is no locking in this, so calls will probably need to be + * protected with a spinlock. + */ + +struct background_work; +struct background_tracker; + +/* + * Create a new tracker, it will not be able to queue more than + * 'max_work' entries. + */ +struct background_tracker *btracker_create(unsigned int max_work); + +/* + * Destroy the tracker. No issued, but not complete, work should + * exist when this is called. It is fine to have queued but unissued + * work. + */ +void btracker_destroy(struct background_tracker *b); + +unsigned int btracker_nr_writebacks_queued(struct background_tracker *b); +unsigned int btracker_nr_demotions_queued(struct background_tracker *b); + +/* + * Queue some work within the tracker. 'work' should point to the work + * to queue, this will be copied (ownership doesn't pass). If pwork + * is not NULL then it will be set to point to the tracker's internal + * copy of the work. + * + * returns -EINVAL iff the work is already queued. -ENOMEM if the work + * couldn't be queued for another reason. + */ +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork); + +/* + * Hands out the next piece of work to be performed. + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work); + +/* + * Informs the tracker that the work has been completed and it may forget + * about it. + */ +void btracker_complete(struct background_tracker *b, struct policy_work *op); + +/* + * Predicate to see if an origin block is already scheduled for promotion. + */ +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h new file mode 100644 index 000000000..389c9e8ac --- /dev/null +++ b/drivers/md/dm-cache-block-types.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BLOCK_TYPES_H +#define DM_CACHE_BLOCK_TYPES_H + +#include "persistent-data/dm-block-manager.h" + +/*----------------------------------------------------------------*/ + +/* + * It's helpful to get sparse to differentiate between indexes into the + * origin device, indexes into the cache device, and indexes into the + * discard bitset. + */ + +typedef dm_block_t __bitwise dm_oblock_t; +typedef uint32_t __bitwise dm_cblock_t; +typedef dm_block_t __bitwise dm_dblock_t; + +static inline dm_oblock_t to_oblock(dm_block_t b) +{ + return (__force dm_oblock_t) b; +} + +static inline dm_block_t from_oblock(dm_oblock_t b) +{ + return (__force dm_block_t) b; +} + +static inline dm_cblock_t to_cblock(uint32_t b) +{ + return (__force dm_cblock_t) b; +} + +static inline uint32_t from_cblock(dm_cblock_t b) +{ + return (__force uint32_t) b; +} + +static inline dm_dblock_t to_dblock(dm_block_t b) +{ + return (__force dm_dblock_t) b; +} + +static inline dm_block_t from_dblock(dm_dblock_t b) +{ + return (__force dm_block_t) b; +} + +#endif /* DM_CACHE_BLOCK_TYPES_H */ diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c new file mode 100644 index 000000000..f5b4c996d --- /dev/null +++ b/drivers/md/dm-cache-metadata.c @@ -0,0 +1,1860 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-cache-metadata.h" + +#include "persistent-data/dm-array.h" +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include +#include + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache metadata" + +#define CACHE_SUPERBLOCK_MAGIC 06142003 +#define CACHE_SUPERBLOCK_LOCATION 0 + +/* + * defines a range of metadata versions that this module can handle. + */ +#define MIN_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 2 + +/* + * 3 for btree insert + + * 2 for btree lookup used within space map + */ +#define CACHE_MAX_CONCURRENT_LOCKS 5 +#define SPACE_MAP_ROOT_SIZE 128 + +enum superblock_flag_bits { + /* for spotting crashes that would invalidate the dirty bitset */ + CLEAN_SHUTDOWN, + /* metadata must be checked using the tools */ + NEEDS_CHECK, +}; + +/* + * Each mapping from cache block -> origin block carries a set of flags. + */ +enum mapping_bits { + /* + * A valid mapping. Because we're using an array we clear this + * flag for an non existant mapping. + */ + M_VALID = 1, + + /* + * The data on the cache is different from that on the origin. + * This flag is only used by metadata format 1. + */ + M_DIRTY = 2 +}; + +struct cache_disk_superblock { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[16]; + __le64 magic; + __le32 version; + + __u8 policy_name[CACHE_POLICY_NAME_SIZE]; + __le32 policy_hint_size; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + __le64 mapping_root; + __le64 hint_root; + + __le64 discard_root; + __le64 discard_block_size; + __le64 discard_nr_blocks; + + __le32 data_block_size; + __le32 metadata_block_size; + __le32 cache_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; + + __le32 read_hits; + __le32 read_misses; + __le32 write_hits; + __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; + + /* + * Metadata format 2 fields. + */ + __le64 dirty_root; +} __packed; + +struct dm_cache_metadata { + refcount_t ref_count; + struct list_head list; + + unsigned int version; + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_transaction_manager *tm; + + struct dm_array_info info; + struct dm_array_info hint_info; + struct dm_disk_bitset discard_info; + + struct rw_semaphore root_lock; + unsigned long flags; + dm_block_t root; + dm_block_t hint_root; + dm_block_t discard_root; + + sector_t discard_block_size; + dm_dblock_t discard_nr_blocks; + + sector_t data_block_size; + dm_cblock_t cache_blocks; + bool changed:1; + bool clean_when_opened:1; + + char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned int policy_version[CACHE_POLICY_VERSION_SIZE]; + size_t policy_hint_size; + struct dm_cache_statistics stats; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. + */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + /* + * Set if a transaction has to be aborted but the attempt to roll + * back to the previous (good) transaction failed. The only + * metadata operation permissible in this state is the closing of + * the device. + */ + bool fail_io:1; + + /* + * Metadata format 2 fields. + */ + dm_block_t dirty_root; + struct dm_disk_bitset dirty_info; + + /* + * These structures are used when loading metadata. They're too + * big to put on the stack. + */ + struct dm_array_cursor mapping_cursor; + struct dm_array_cursor hint_cursor; + struct dm_bitset_cursor dirty_cursor; +}; + +/*------------------------------------------------------------------- + * superblock validator + *-----------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 9031977 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int check_metadata_version(struct cache_disk_superblock *disk_super) +{ + uint32_t metadata_version = le32_to_cpu(disk_super->version); + + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { + DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); + return -EINVAL; + } + + return 0; +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk_super->magic), + (unsigned long long)CACHE_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return check_metadata_version(disk_super); +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*----------------------------------------------------------------*/ + +static int superblock_read_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock_zero(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +/*----------------------------------------------------------------*/ + +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) +{ + int r; + unsigned int i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned int sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. + */ + r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = true; + for (i = 0; i < sb_block_size; i++) { + if (data_le[i] != zero) { + *result = false; + break; + } + } + + dm_bm_unlock(b); + + return 0; +} + +static void __setup_mapping_info(struct dm_cache_metadata *cmd) +{ + struct dm_btree_value_type vt; + + vt.context = NULL; + vt.size = sizeof(__le64); + vt.inc = NULL; + vt.dec = NULL; + vt.equal = NULL; + dm_array_info_init(&cmd->info, cmd->tm, &vt); + + if (cmd->policy_hint_size) { + vt.size = sizeof(__le32); + dm_array_info_init(&cmd->hint_info, cmd->tm, &vt); + } +} + +static int __save_sm_root(struct dm_cache_metadata *cmd) +{ + int r; + size_t metadata_len; + + r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root, + metadata_len); +} + +static void __copy_sm_root(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + memcpy(&disk_super->metadata_space_map_root, + &cmd->metadata_space_map_root, + sizeof(cmd->metadata_space_map_root)); +} + +static bool separate_dirty_bits(struct dm_cache_metadata *cmd) +{ + return cmd->version >= 2; +} + +static int __write_initial_superblock(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + sector_t bdev_size = bdev_nr_sectors(cmd->bdev); + + /* FIXME: see if we can lose the max sectors limit */ + if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) + bdev_size = DM_CACHE_METADATA_MAX_SECTORS; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + /* + * dm_sm_copy_root() can fail. So we need to do it before we start + * updating the superblock. + */ + r = __save_sm_root(cmd); + if (r) + return r; + + r = superblock_lock_zero(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->flags = 0; + memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); + disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(cmd->version); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); + disk_super->policy_hint_size = cpu_to_le32(0); + + __copy_sm_root(cmd, disk_super); + + disk_super->mapping_root = cpu_to_le64(cmd->root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); + disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); + disk_super->cache_blocks = cpu_to_le32(0); + + disk_super->read_hits = cpu_to_le32(0); + disk_super->read_misses = cpu_to_le32(0); + disk_super->write_hits = cpu_to_le32(0); + disk_super->write_misses = cpu_to_le32(0); + + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); + + return dm_tm_commit(cmd->tm, sblock); +} + +static int __format_metadata(struct dm_cache_metadata *cmd) +{ + int r; + + r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_create_with_sm failed"); + return r; + } + + __setup_mapping_info(cmd); + + r = dm_array_empty(&cmd->info, &cmd->root); + if (r < 0) + goto bad; + + if (separate_dirty_bits(cmd)) { + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); + r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root); + if (r < 0) + goto bad; + } + + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); + if (r < 0) + goto bad; + + cmd->discard_block_size = 0; + cmd->discard_nr_blocks = 0; + + r = __write_initial_superblock(cmd); + if (r) + goto bad; + + cmd->clean_when_opened = true; + return 0; + +bad: + dm_tm_destroy(cmd->tm); + dm_sm_destroy(cmd->metadata_sm); + + return r; +} + +static int __check_incompat_features(struct cache_disk_superblock *disk_super, + struct dm_cache_metadata *cmd) +{ + uint32_t incompat_flags, features; + + incompat_flags = le32_to_cpu(disk_super->incompat_flags); + features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. + */ + if (bdev_read_only(cmd->bdev)) + return 0; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + return 0; +} + +static int __open_metadata(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + unsigned long sb_flags; + + r = superblock_read_lock(cmd, &sblock); + if (r < 0) { + DMERR("couldn't read lock superblock"); + return r; + } + + disk_super = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)cmd->data_block_size); + r = -EINVAL; + goto bad; + } + + r = __check_incompat_features(disk_super, cmd); + if (r < 0) + goto bad; + + r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + disk_super->metadata_space_map_root, + sizeof(disk_super->metadata_space_map_root), + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_open_with_sm failed"); + goto bad; + } + + __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + sb_flags = le32_to_cpu(disk_super->flags); + cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); + dm_bm_unlock(sblock); + + return 0; + +bad: + dm_bm_unlock(sblock); + return r; +} + +static int __open_or_format_metadata(struct dm_cache_metadata *cmd, + bool format_device) +{ + int r; + bool unformatted = false; + + r = __superblock_all_zeroes(cmd->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return format_device ? __format_metadata(cmd) : -EPERM; + + return __open_metadata(cmd); +} + +static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, + bool may_format_device) +{ + int r; + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + CACHE_MAX_CONCURRENT_LOCKS); + if (IS_ERR(cmd->bm)) { + DMERR("could not create block manager"); + r = PTR_ERR(cmd->bm); + cmd->bm = NULL; + return r; + } + + r = __open_or_format_metadata(cmd, may_format_device); + if (r) { + dm_block_manager_destroy(cmd->bm); + cmd->bm = NULL; + } + + return r; +} + +static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd, + bool destroy_bm) +{ + dm_sm_destroy(cmd->metadata_sm); + dm_tm_destroy(cmd->tm); + if (destroy_bm) + dm_block_manager_destroy(cmd->bm); +} + +typedef unsigned long (*flags_mutator)(unsigned long); + +static void update_flags(struct cache_disk_superblock *disk_super, + flags_mutator mutator) +{ + uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags)); + disk_super->flags = cpu_to_le32(sb_flags); +} + +static unsigned long set_clean_shutdown(unsigned long flags) +{ + set_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static unsigned long clear_clean_shutdown(unsigned long flags) +{ + clear_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static void read_superblock_fields(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + cmd->version = le32_to_cpu(disk_super->version); + cmd->flags = le32_to_cpu(disk_super->flags); + cmd->root = le64_to_cpu(disk_super->mapping_root); + cmd->hint_root = le64_to_cpu(disk_super->hint_root); + cmd->discard_root = le64_to_cpu(disk_super->discard_root); + cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); + cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); + cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); + strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); + cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); + + cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); + cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses); + cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); + cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + + if (separate_dirty_bits(cmd)) + cmd->dirty_root = le64_to_cpu(disk_super->dirty_root); + + cmd->changed = false; +} + +/* + * The mutator updates the superblock flags. + */ +static int __begin_transaction_flags(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + update_flags(disk_super, mutator); + read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); + + return dm_bm_flush(cmd->bm); +} + +static int __begin_transaction(struct dm_cache_metadata *cmd) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. + */ + r = superblock_read_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); + + return 0; +} + +static int __commit_transaction(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the cache_disk_superblock exceeds a 512-byte sector. + */ + BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root, + &cmd->dirty_root); + if (r) + return r; + } + + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, + &cmd->discard_root); + if (r) + return r; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + r = __save_sm_root(cmd); + if (r) + return r; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + + disk_super->flags = cpu_to_le32(cmd->flags); + if (mutator) + update_flags(disk_super, mutator); + + disk_super->mapping_root = cpu_to_le64(cmd->root); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); + disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); + strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size); + + disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); + disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); + disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); + disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); + __copy_sm_root(cmd, disk_super); + + return dm_tm_commit(cmd->tm, sblock); +} + +/*----------------------------------------------------------------*/ + +/* + * The mappings are held in a dm-array that has 64-bit values stored in + * little-endian format. The index is the cblock, the high 48bits of the + * value are the oblock and the low 16 bit the flags. + */ +#define FLAGS_MASK ((1 << 16) - 1) + +static __le64 pack_value(dm_oblock_t block, unsigned int flags) +{ + uint64_t value = from_oblock(block); + value <<= 16; + value = value | (flags & FLAGS_MASK); + return cpu_to_le64(value); +} + +static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned int *flags) +{ + uint64_t value = le64_to_cpu(value_le); + uint64_t b = value >> 16; + *block = to_oblock(b); + *flags = value & FLAGS_MASK; +} + +/*----------------------------------------------------------------*/ + +static struct dm_cache_metadata *metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size, + unsigned int metadata_version) +{ + int r; + struct dm_cache_metadata *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + DMERR("could not allocate metadata struct"); + return ERR_PTR(-ENOMEM); + } + + cmd->version = metadata_version; + refcount_set(&cmd->ref_count, 1); + init_rwsem(&cmd->root_lock); + cmd->bdev = bdev; + cmd->data_block_size = data_block_size; + cmd->cache_blocks = 0; + cmd->policy_hint_size = policy_hint_size; + cmd->changed = true; + cmd->fail_io = false; + + r = __create_persistent_data_objects(cmd, may_format_device); + if (r) { + kfree(cmd); + return ERR_PTR(r); + } + + r = __begin_transaction_flags(cmd, clear_clean_shutdown); + if (r < 0) { + dm_cache_metadata_close(cmd); + return ERR_PTR(r); + } + + return cmd; +} + +/* + * We keep a little list of ref counted metadata objects to prevent two + * different target instances creating separate bufio instances. This is + * an issue if a table is reloaded before the suspend. + */ +static DEFINE_MUTEX(table_lock); +static LIST_HEAD(table); + +static struct dm_cache_metadata *lookup(struct block_device *bdev) +{ + struct dm_cache_metadata *cmd; + + list_for_each_entry(cmd, &table, list) + if (cmd->bdev == bdev) { + refcount_inc(&cmd->ref_count); + return cmd; + } + + return NULL; +} + +static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size, + unsigned int metadata_version) +{ + struct dm_cache_metadata *cmd, *cmd2; + + mutex_lock(&table_lock); + cmd = lookup(bdev); + mutex_unlock(&table_lock); + + if (cmd) + return cmd; + + cmd = metadata_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); + if (!IS_ERR(cmd)) { + mutex_lock(&table_lock); + cmd2 = lookup(bdev); + if (cmd2) { + mutex_unlock(&table_lock); + __destroy_persistent_data_objects(cmd, true); + kfree(cmd); + return cmd2; + } + list_add(&cmd->list, &table); + mutex_unlock(&table_lock); + } + + return cmd; +} + +static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) +{ + if (cmd->data_block_size != data_block_size) { + DMERR("data_block_size (%llu) different from that in metadata (%llu)", + (unsigned long long) data_block_size, + (unsigned long long) cmd->data_block_size); + return false; + } + + return true; +} + +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size, + unsigned int metadata_version) +{ + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); + + if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { + dm_cache_metadata_close(cmd); + return ERR_PTR(-EINVAL); + } + + return cmd; +} + +void dm_cache_metadata_close(struct dm_cache_metadata *cmd) +{ + if (refcount_dec_and_test(&cmd->ref_count)) { + mutex_lock(&table_lock); + list_del(&cmd->list); + mutex_unlock(&table_lock); + + if (!cmd->fail_io) + __destroy_persistent_data_objects(cmd, true); + kfree(cmd); + } +} + +/* + * Checks that the given cache block is either unmapped or clean. + */ +static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) +{ + int r; + __le64 value; + dm_oblock_t ob; + unsigned int flags; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); + if (r) + return r; + + unpack_value(value, &ob, &flags); + *result = !((flags & M_VALID) && (flags & M_DIRTY)); + + return 0; +} + +static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + *result = true; + + while (begin != end) { + r = block_clean_combined_dirty(cmd, begin, result); + if (r) { + DMERR("block_clean_combined_dirty failed"); + return r; + } + + if (!*result) { + DMERR("cache block %llu is dirty", + (unsigned long long) from_cblock(begin)); + return 0; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + return 0; +} + +static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + bool dirty_flag; + *result = true; + + if (from_cblock(cmd->cache_blocks) == 0) + /* Nothing to do */ + return 0; + + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), &cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__); + return r; + } + + r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin)); + if (r) { + DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + while (begin != end) { + /* + * We assume that unmapped blocks have their dirty bit + * cleared. + */ + dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor); + if (dirty_flag) { + DMERR("%s: cache block %llu is dirty", __func__, + (unsigned long long) from_cblock(begin)); + dm_bitset_cursor_end(&cmd->dirty_cursor); + *result = false; + return 0; + } + + begin = to_cblock(from_cblock(begin) + 1); + if (begin == end) + break; + + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + } + + dm_bitset_cursor_end(&cmd->dirty_cursor); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + if (separate_dirty_bits(cmd)) + return blocks_are_clean_separate_dirty(cmd, begin, end, result); + else + return blocks_are_clean_combined_dirty(cmd, begin, end, result); +} + +static bool cmd_write_lock(struct dm_cache_metadata *cmd) +{ + down_write(&cmd->root_lock); + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { + up_write(&cmd->root_lock); + return false; + } + return true; +} + +#define WRITE_LOCK(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return -EINVAL; \ + } while(0) + +#define WRITE_LOCK_VOID(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return; \ + } while(0) + +#define WRITE_UNLOCK(cmd) \ + up_write(&(cmd)->root_lock) + +static bool cmd_read_lock(struct dm_cache_metadata *cmd) +{ + down_read(&cmd->root_lock); + if (cmd->fail_io) { + up_read(&cmd->root_lock); + return false; + } + return true; +} + +#define READ_LOCK(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return -EINVAL; \ + } while(0) + +#define READ_LOCK_VOID(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return; \ + } while(0) + +#define READ_UNLOCK(cmd) \ + up_read(&(cmd)->root_lock) + +int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) +{ + int r; + bool clean; + __le64 null_mapping = pack_value(0, 0); + + WRITE_LOCK(cmd); + __dm_bless_for_disk(&null_mapping); + + if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) { + r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); + if (r) { + __dm_unbless_for_disk(&null_mapping); + goto out; + } + + if (!clean) { + DMERR("unable to shrink cache due to dirty blocks"); + r = -EINVAL; + __dm_unbless_for_disk(&null_mapping); + goto out; + } + } + + r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), + from_cblock(new_cache_size), + &null_mapping, &cmd->root); + if (r) + goto out; + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), + false, &cmd->dirty_root); + if (r) + goto out; + } + + cmd->cache_blocks = new_cache_size; + cmd->changed = true; + +out: + WRITE_UNLOCK(cmd); + + return r; +} + +int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, + sector_t discard_block_size, + dm_dblock_t new_nr_entries) +{ + int r; + + WRITE_LOCK(cmd); + r = dm_bitset_resize(&cmd->discard_info, + cmd->discard_root, + from_dblock(cmd->discard_nr_blocks), + from_dblock(new_nr_entries), + false, &cmd->discard_root); + if (!r) { + cmd->discard_block_size = discard_block_size; + cmd->discard_nr_blocks = new_nr_entries; + } + + cmd->changed = true; + WRITE_UNLOCK(cmd); + + return r; +} + +static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) +{ + return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, + from_dblock(b), &cmd->discard_root); +} + +static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) +{ + return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, + from_dblock(b), &cmd->discard_root); +} + +static int __discard(struct dm_cache_metadata *cmd, + dm_dblock_t dblock, bool discard) +{ + int r; + + r = (discard ? __set_discard : __clear_discard)(cmd, dblock); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_set_discard(struct dm_cache_metadata *cmd, + dm_dblock_t dblock, bool discard) +{ + int r; + + WRITE_LOCK(cmd); + r = __discard(cmd, dblock, discard); + WRITE_UNLOCK(cmd); + + return r; +} + +static int __load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context) +{ + int r = 0; + uint32_t b; + struct dm_bitset_cursor c; + + if (from_dblock(cmd->discard_nr_blocks) == 0) + /* nothing to do */ + return 0; + + if (cmd->clean_when_opened) { + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); + if (r) + return r; + + r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root, + from_dblock(cmd->discard_nr_blocks), &c); + if (r) + return r; + + for (b = 0; ; b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), + dm_bitset_cursor_get_value(&c)); + if (r) + break; + + if (b >= (from_dblock(cmd->discard_nr_blocks) - 1)) + break; + + r = dm_bitset_cursor_next(&c); + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + } else { + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), false); + if (r) + return r; + } + } + + return r; +} + +int dm_cache_load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context) +{ + int r; + + READ_LOCK(cmd); + r = __load_discards(cmd, fn, context); + READ_UNLOCK(cmd); + + return r; +} + +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result) +{ + READ_LOCK(cmd); + *result = cmd->cache_blocks; + READ_UNLOCK(cmd); + + return 0; +} + +static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) +{ + int r; + __le64 value = pack_value(0, 0); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock) +{ + int r; + + WRITE_LOCK(cmd); + r = __remove(cmd, cblock); + WRITE_UNLOCK(cmd); + + return r; +} + +static int __insert(struct dm_cache_metadata *cmd, + dm_cblock_t cblock, dm_oblock_t oblock) +{ + int r; + __le64 value = pack_value(oblock, M_VALID); + __dm_bless_for_disk(&value); + + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, + dm_cblock_t cblock, dm_oblock_t oblock) +{ + int r; + + WRITE_LOCK(cmd); + r = __insert(cmd, cblock, oblock); + WRITE_UNLOCK(cmd); + + return r; +} + +struct thunk { + load_mapping_fn fn; + void *context; + + struct dm_cache_metadata *cmd; + bool respect_dirty_flags; + bool hints_valid; +}; + +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned int *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + +static bool hints_array_initialized(struct dm_cache_metadata *cmd) +{ + return cmd->hint_root && cmd->policy_hint_size; +} + +static bool hints_array_available(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && + hints_array_initialized(cmd); +} + +static int __load_mapping_v1(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + load_mapping_fn fn, void *context) +{ + int r = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + + dm_oblock_t oblock; + unsigned int flags; + bool dirty = true; + + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); + + if (flags & M_VALID) { + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); + } + if (cmd->clean_when_opened) + dirty = flags & M_DIRTY; + + r = fn(context, oblock, to_cblock(cb), dirty, + le32_to_cpu(hint), hints_valid); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } + } + + return r; +} + +static int __load_mapping_v2(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + struct dm_bitset_cursor *dirty_cursor, + load_mapping_fn fn, void *context) +{ + int r = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + + dm_oblock_t oblock; + unsigned int flags; + bool dirty = true; + + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); + + if (flags & M_VALID) { + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); + } + if (cmd->clean_when_opened) + dirty = dm_bitset_cursor_get_value(dirty_cursor); + + r = fn(context, oblock, to_cblock(cb), dirty, + le32_to_cpu(hint), hints_valid); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } + } + + return r; +} + +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, void *context) +{ + int r; + uint64_t cb; + + bool hints_valid = hints_array_available(cmd, policy); + + if (from_cblock(cmd->cache_blocks) == 0) + /* Nothing to do */ + return 0; + + r = dm_array_cursor_begin(&cmd->info, cmd->root, &cmd->mapping_cursor); + if (r) + return r; + + if (hints_valid) { + r = dm_array_cursor_begin(&cmd->hint_info, cmd->hint_root, &cmd->hint_cursor); + if (r) { + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), + &cmd->dirty_cursor); + if (r) { + dm_array_cursor_end(&cmd->hint_cursor); + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + + for (cb = 0; ; cb++) { + if (separate_dirty_bits(cmd)) + r = __load_mapping_v2(cmd, cb, hints_valid, + &cmd->mapping_cursor, + &cmd->hint_cursor, + &cmd->dirty_cursor, + fn, context); + else + r = __load_mapping_v1(cmd, cb, hints_valid, + &cmd->mapping_cursor, &cmd->hint_cursor, + fn, context); + if (r) + goto out; + + /* + * We need to break out before we move the cursors. + */ + if (cb >= (from_cblock(cmd->cache_blocks) - 1)) + break; + + r = dm_array_cursor_next(&cmd->mapping_cursor); + if (r) { + DMERR("dm_array_cursor_next for mapping failed"); + goto out; + } + + if (hints_valid) { + r = dm_array_cursor_next(&cmd->hint_cursor); + if (r) { + dm_array_cursor_end(&cmd->hint_cursor); + hints_valid = false; + } + } + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("dm_bitset_cursor_next for dirty failed"); + goto out; + } + } + } +out: + dm_array_cursor_end(&cmd->mapping_cursor); + if (hints_valid) + dm_array_cursor_end(&cmd->hint_cursor); + + if (separate_dirty_bits(cmd)) + dm_bitset_cursor_end(&cmd->dirty_cursor); + + return r; +} + +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, void *context) +{ + int r; + + READ_LOCK(cmd); + r = __load_mappings(cmd, policy, fn, context); + READ_UNLOCK(cmd); + + return r; +} + +static int __dump_mapping(void *context, uint64_t cblock, void *leaf) +{ + __le64 value; + dm_oblock_t oblock; + unsigned int flags; + + memcpy(&value, leaf, sizeof(value)); + unpack_value(value, &oblock, &flags); + + return 0; +} + +static int __dump_mappings(struct dm_cache_metadata *cmd) +{ + return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL); +} + +void dm_cache_dump(struct dm_cache_metadata *cmd) +{ + READ_LOCK_VOID(cmd); + __dump_mappings(cmd); + READ_UNLOCK(cmd); +} + +int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) +{ + int r; + + READ_LOCK(cmd); + r = cmd->changed; + READ_UNLOCK(cmd); + + return r; +} + +static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty) +{ + int r; + unsigned int flags; + dm_oblock_t oblock; + __le64 value; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value); + if (r) + return r; + + unpack_value(value, &oblock, &flags); + + if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty)) + /* nothing to be done */ + return 0; + + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0)); + __dm_bless_for_disk(&value); + + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; + +} + +static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned int nr_bits, unsigned long *bits) +{ + int r; + unsigned int i; + for (i = 0; i < nr_bits; i++) { + r = __dirty(cmd, to_cblock(i), test_bit(i, bits)); + if (r) + return r; + } + + return 0; +} + +static int is_dirty_callback(uint32_t index, bool *value, void *context) +{ + unsigned long *bits = context; + *value = test_bit(index, bits); + return 0; +} + +static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned int nr_bits, unsigned long *bits) +{ + int r = 0; + + /* nr_bits is really just a sanity check */ + if (nr_bits != from_cblock(cmd->cache_blocks)) { + DMERR("dirty bitset is wrong size"); + return -EINVAL; + } + + r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root); + if (r) + return r; + + cmd->changed = true; + return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits); +} + +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned int nr_bits, + unsigned long *bits) +{ + int r; + + WRITE_LOCK(cmd); + if (separate_dirty_bits(cmd)) + r = __set_dirty_bits_v2(cmd, nr_bits, bits); + else + r = __set_dirty_bits_v1(cmd, nr_bits, bits); + WRITE_UNLOCK(cmd); + + return r; +} + +void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats) +{ + READ_LOCK_VOID(cmd); + *stats = cmd->stats; + READ_UNLOCK(cmd); +} + +void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats) +{ + WRITE_LOCK_VOID(cmd); + cmd->stats = *stats; + WRITE_UNLOCK(cmd); +} + +int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown) +{ + int r = -EINVAL; + flags_mutator mutator = (clean_shutdown ? set_clean_shutdown : + clear_clean_shutdown); + + WRITE_LOCK(cmd); + if (cmd->fail_io) + goto out; + + r = __commit_transaction(cmd, mutator); + if (r) + goto out; + + r = __begin_transaction(cmd); +out: + WRITE_UNLOCK(cmd); + return r; +} + +int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + READ_LOCK(cmd); + if (!cmd->fail_io) + r = dm_sm_get_nr_free(cmd->metadata_sm, result); + READ_UNLOCK(cmd); + + return r; +} + +int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + READ_LOCK(cmd); + if (!cmd->fail_io) + r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); + READ_UNLOCK(cmd); + + return r; +} + +/*----------------------------------------------------------------*/ + +static int get_hint(uint32_t index, void *value_le, void *context) +{ + uint32_t value; + struct dm_cache_policy *policy = context; + + value = policy_get_hint(policy, to_cblock(index)); + *((__le32 *) value_le) = cpu_to_le32(value); + + return 0; +} + +/* + * It's quicker to always delete the hint array, and recreate with + * dm_array_new(). + */ +static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) +{ + int r; + size_t hint_size; + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned int *policy_version = dm_cache_policy_get_version(policy); + + if (!policy_name[0] || + (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) + return -EINVAL; + + strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); + + hint_size = dm_cache_policy_get_hint_size(policy); + if (!hint_size) + return 0; /* short-circuit hints initialization */ + cmd->policy_hint_size = hint_size; + + if (cmd->hint_root) { + r = dm_array_del(&cmd->hint_info, cmd->hint_root); + if (r) + return r; + } + + return dm_array_new(&cmd->hint_info, &cmd->hint_root, + from_cblock(cmd->cache_blocks), + get_hint, policy); +} + +int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) +{ + int r; + + WRITE_LOCK(cmd); + r = write_hints(cmd, policy); + WRITE_UNLOCK(cmd); + + return r; +} + +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) +{ + int r; + + READ_LOCK(cmd); + r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + READ_UNLOCK(cmd); + + return r; +} + +void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) +{ + WRITE_LOCK_VOID(cmd); + dm_bm_set_read_only(cmd->bm); + WRITE_UNLOCK(cmd); +} + +void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd) +{ + WRITE_LOCK_VOID(cmd); + dm_bm_set_read_write(cmd->bm); + WRITE_UNLOCK(cmd); +} + +int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + + WRITE_LOCK(cmd); + set_bit(NEEDS_CHECK, &cmd->flags); + + r = superblock_lock(cmd, &sblock); + if (r) { + DMERR("couldn't read superblock"); + goto out; + } + + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(cmd->flags); + + dm_bm_unlock(sblock); + +out: + WRITE_UNLOCK(cmd); + return r; +} + +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) +{ + READ_LOCK(cmd); + *result = !!test_bit(NEEDS_CHECK, &cmd->flags); + READ_UNLOCK(cmd); + + return 0; +} + +int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) +{ + int r = -EINVAL; + struct dm_block_manager *old_bm = NULL, *new_bm = NULL; + + /* fail_io is double-checked with cmd->root_lock held below */ + if (unlikely(cmd->fail_io)) + return r; + + /* + * Replacement block manager (new_bm) is created and old_bm destroyed outside of + * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of + * shrinker associated with the block manager's bufio client vs cmd root_lock). + * - must take shrinker_rwsem without holding cmd->root_lock + */ + new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + CACHE_MAX_CONCURRENT_LOCKS); + + WRITE_LOCK(cmd); + if (cmd->fail_io) { + WRITE_UNLOCK(cmd); + goto out; + } + + __destroy_persistent_data_objects(cmd, false); + old_bm = cmd->bm; + if (IS_ERR(new_bm)) { + DMERR("could not create block manager during abort"); + cmd->bm = NULL; + r = PTR_ERR(new_bm); + goto out_unlock; + } + + cmd->bm = new_bm; + r = __open_or_format_metadata(cmd, false); + if (r) { + cmd->bm = NULL; + goto out_unlock; + } + new_bm = NULL; +out_unlock: + if (r) + cmd->fail_io = true; + WRITE_UNLOCK(cmd); + dm_block_manager_destroy(old_bm); +out: + if (new_bm && !IS_ERR(new_bm)) + dm_block_manager_destroy(new_bm); + + return r; +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h new file mode 100644 index 000000000..b40322bc4 --- /dev/null +++ b/drivers/md/dm-cache-metadata.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_METADATA_H +#define DM_CACHE_METADATA_H + +#include "dm-cache-block-types.h" +#include "dm-cache-policy-internal.h" +#include "persistent-data/dm-space-map-metadata.h" + +/*----------------------------------------------------------------*/ + +#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE + +/* FIXME: remove this restriction */ +/* + * The metadata device is currently limited in size. + */ +#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +/*----------------------------------------------------------------*/ + +/* + * Ext[234]-style compat feature flags. + * + * A new feature which old metadata will still be compatible with should + * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful). + * + * A new feature that is not compatible with old code should define a + * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with + * that flag. + * + * A new feature that is not compatible with old code accessing the + * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and + * guard the relevant code with that flag. + * + * As these various flags are defined they should be added to the + * following masks. + */ + +#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL +#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL +#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL + +struct dm_cache_metadata; + +/* + * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on + * failure. If reopening then features must match. + */ +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size, + unsigned int metadata_version); + +void dm_cache_metadata_close(struct dm_cache_metadata *cmd); + +/* + * The metadata needs to know how many cache blocks there are. We don't + * care about the origin, assuming the core target is giving us valid + * origin blocks to map to. + */ +int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result); + +int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, + sector_t discard_block_size, + dm_dblock_t new_nr_entries); + +typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, + dm_dblock_t dblock, bool discarded); +int dm_cache_load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context); + +int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); + +int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); +int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); +int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd); + +typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, + void *context); + +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned int nr_bits, unsigned long *bits); + +struct dm_cache_statistics { + uint32_t read_hits; + uint32_t read_misses; + uint32_t write_hits; + uint32_t write_misses; +}; + +void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats); + +/* + * 'void' because it's no big deal if it fails. + */ +void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats); + +int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown); + +int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, + dm_block_t *result); + +int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, + dm_block_t *result); + +void dm_cache_dump(struct dm_cache_metadata *cmd); + +/* + * The policy is invited to save a 32bit hint value for every cblock (eg, + * for a hit count). These are stored against the policy name. If + * policies are changed, then hints will be lost. If the machine crashes, + * hints will be lost. + * + * The hints are indexed by the cblock, but many policies will not + * necessarily have a fast way of accessing efficiently via cblock. So + * rather than querying the policy for each cblock, we let it walk its data + * structures and fill in the hints in whatever order it wishes. + */ +int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); + +/* + * Query method. Are all the blocks in the cache clean? + */ +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); + +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); +int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); +void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); +void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); +int dm_cache_metadata_abort(struct dm_cache_metadata *cmd); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h new file mode 100644 index 000000000..8e49baa78 --- /dev/null +++ b/drivers/md/dm-cache-policy-internal.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_POLICY_INTERNAL_H +#define DM_CACHE_POLICY_INTERNAL_H + +#include +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued) +{ + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); +} + +static inline int policy_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) +{ + if (!p->lookup_with_work) { + *work = NULL; + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); + } + + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); +} + +static inline int policy_get_background_work(struct dm_cache_policy *p, + bool idle, struct policy_work **result) +{ + return p->get_background_work(p, idle, result); +} + +static inline void policy_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) +{ + return p->complete_background_work(p, work, success); +} + +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->set_dirty(p, cblock); +} + +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->clear_dirty(p, cblock); +} + +static inline int policy_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); +} + +static inline int policy_invalidate_mapping(struct dm_cache_policy *p, + dm_cblock_t cblock) +{ + return p->invalidate_mapping(p, cblock); +} + +static inline uint32_t policy_get_hint(struct dm_cache_policy *p, + dm_cblock_t cblock) +{ + return p->get_hint ? p->get_hint(p, cblock) : 0; +} + +static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) +{ + return p->residency(p); +} + +static inline void policy_tick(struct dm_cache_policy *p, bool can_block) +{ + if (p->tick) + return p->tick(p, can_block); +} + +static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned int maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + if (p->emit_config_values) + return p->emit_config_values(p, result, maxlen, sz_ptr); + + DMEMIT("0 "); + *sz_ptr = sz; + return 0; +} + +static inline int policy_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; +} + +static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + return p->allow_migrations(p, allow); +} + +/*----------------------------------------------------------------*/ + +/* + * Some utility functions commonly used by policies and the core target. + */ +static inline size_t bitset_size_in_bytes(unsigned int nr_entries) +{ + return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); +} + +static inline unsigned long *alloc_bitset(unsigned int nr_entries) +{ + size_t s = bitset_size_in_bytes(nr_entries); + return vzalloc(s); +} + +static inline void clear_bitset(void *bitset, unsigned int nr_entries) +{ + size_t s = bitset_size_in_bytes(nr_entries); + memset(bitset, 0, s); +} + +static inline void free_bitset(unsigned long *bits) +{ + vfree(bits); +} + +/*----------------------------------------------------------------*/ + +/* + * Creates a new cache policy given a policy name, a cache size, an origin size and the block size. + */ +struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size, + sector_t origin_size, sector_t block_size); + +/* + * Destroys the policy. This drops references to the policy module as well + * as calling it's destroy method. So always use this rather than calling + * the policy->destroy method directly. + */ +void dm_cache_policy_destroy(struct dm_cache_policy *p); + +/* + * In case we've forgotten. + */ +const char *dm_cache_policy_get_name(struct dm_cache_policy *p); + +const unsigned int *dm_cache_policy_get_version(struct dm_cache_policy *p); + +size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_POLICY_INTERNAL_H */ diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c new file mode 100644 index 000000000..c983cf240 --- /dev/null +++ b/drivers/md/dm-cache-policy-smq.c @@ -0,0 +1,1953 @@ +/* + * Copyright (C) 2015 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-background-tracker.h" +#include "dm-cache-policy-internal.h" +#include "dm-cache-policy.h" +#include "dm.h" + +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "cache-policy-smq" + +/*----------------------------------------------------------------*/ + +/* + * Safe division functions that return zero on divide by zero. + */ +static unsigned int safe_div(unsigned int n, unsigned int d) +{ + return d ? n / d : 0u; +} + +static unsigned int safe_mod(unsigned int n, unsigned int d) +{ + return d ? n % d : 0u; +} + +/*----------------------------------------------------------------*/ + +struct entry { + unsigned int hash_next:28; + unsigned int prev:28; + unsigned int next:28; + unsigned int level:6; + bool dirty:1; + bool allocated:1; + bool sentinel:1; + bool pending_work:1; + + dm_oblock_t oblock; +}; + +/*----------------------------------------------------------------*/ + +#define INDEXER_NULL ((1u << 28u) - 1u) + +/* + * An entry_space manages a set of entries that we use for the queues. + * The clean and dirty queues share entries, so this object is separate + * from the queue itself. + */ +struct entry_space { + struct entry *begin; + struct entry *end; +}; + +static int space_init(struct entry_space *es, unsigned int nr_entries) +{ + if (!nr_entries) { + es->begin = es->end = NULL; + return 0; + } + + es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry))); + if (!es->begin) + return -ENOMEM; + + es->end = es->begin + nr_entries; + return 0; +} + +static void space_exit(struct entry_space *es) +{ + vfree(es->begin); +} + +static struct entry *__get_entry(struct entry_space *es, unsigned int block) +{ + struct entry *e; + + e = es->begin + block; + BUG_ON(e >= es->end); + + return e; +} + +static unsigned int to_index(struct entry_space *es, struct entry *e) +{ + BUG_ON(e < es->begin || e >= es->end); + return e - es->begin; +} + +static struct entry *to_entry(struct entry_space *es, unsigned int block) +{ + if (block == INDEXER_NULL) + return NULL; + + return __get_entry(es, block); +} + +/*----------------------------------------------------------------*/ + +struct ilist { + unsigned int nr_elts; /* excluding sentinel entries */ + unsigned int head, tail; +}; + +static void l_init(struct ilist *l) +{ + l->nr_elts = 0; + l->head = l->tail = INDEXER_NULL; +} + +static struct entry *l_head(struct entry_space *es, struct ilist *l) +{ + return to_entry(es, l->head); +} + +static struct entry *l_tail(struct entry_space *es, struct ilist *l) +{ + return to_entry(es, l->tail); +} + +static struct entry *l_next(struct entry_space *es, struct entry *e) +{ + return to_entry(es, e->next); +} + +static struct entry *l_prev(struct entry_space *es, struct entry *e) +{ + return to_entry(es, e->prev); +} + +static bool l_empty(struct ilist *l) +{ + return l->head == INDEXER_NULL; +} + +static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e) +{ + struct entry *head = l_head(es, l); + + e->next = l->head; + e->prev = INDEXER_NULL; + + if (head) + head->prev = l->head = to_index(es, e); + else + l->head = l->tail = to_index(es, e); + + if (!e->sentinel) + l->nr_elts++; +} + +static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e) +{ + struct entry *tail = l_tail(es, l); + + e->next = INDEXER_NULL; + e->prev = l->tail; + + if (tail) + tail->next = l->tail = to_index(es, e); + else + l->head = l->tail = to_index(es, e); + + if (!e->sentinel) + l->nr_elts++; +} + +static void l_add_before(struct entry_space *es, struct ilist *l, + struct entry *old, struct entry *e) +{ + struct entry *prev = l_prev(es, old); + + if (!prev) + l_add_head(es, l, e); + + else { + e->prev = old->prev; + e->next = to_index(es, old); + prev->next = old->prev = to_index(es, e); + + if (!e->sentinel) + l->nr_elts++; + } +} + +static void l_del(struct entry_space *es, struct ilist *l, struct entry *e) +{ + struct entry *prev = l_prev(es, e); + struct entry *next = l_next(es, e); + + if (prev) + prev->next = e->next; + else + l->head = e->next; + + if (next) + next->prev = e->prev; + else + l->tail = e->prev; + + if (!e->sentinel) + l->nr_elts--; +} + +static struct entry *l_pop_head(struct entry_space *es, struct ilist *l) +{ + struct entry *e; + + for (e = l_head(es, l); e; e = l_next(es, e)) + if (!e->sentinel) { + l_del(es, l, e); + return e; + } + + return NULL; +} + +static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l) +{ + struct entry *e; + + for (e = l_tail(es, l); e; e = l_prev(es, e)) + if (!e->sentinel) { + l_del(es, l, e); + return e; + } + + return NULL; +} + +/*----------------------------------------------------------------*/ + +/* + * The stochastic-multi-queue is a set of lru lists stacked into levels. + * Entries are moved up levels when they are used, which loosely orders the + * most accessed entries in the top levels and least in the bottom. This + * structure is *much* better than a single lru list. + */ +#define MAX_LEVELS 64u + +struct queue { + struct entry_space *es; + + unsigned int nr_elts; + unsigned int nr_levels; + struct ilist qs[MAX_LEVELS]; + + /* + * We maintain a count of the number of entries we would like in each + * level. + */ + unsigned int last_target_nr_elts; + unsigned int nr_top_levels; + unsigned int nr_in_top_levels; + unsigned int target_count[MAX_LEVELS]; +}; + +static void q_init(struct queue *q, struct entry_space *es, unsigned int nr_levels) +{ + unsigned int i; + + q->es = es; + q->nr_elts = 0; + q->nr_levels = nr_levels; + + for (i = 0; i < q->nr_levels; i++) { + l_init(q->qs + i); + q->target_count[i] = 0u; + } + + q->last_target_nr_elts = 0u; + q->nr_top_levels = 0u; + q->nr_in_top_levels = 0u; +} + +static unsigned int q_size(struct queue *q) +{ + return q->nr_elts; +} + +/* + * Insert an entry to the back of the given level. + */ +static void q_push(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_tail(q->es, q->qs + e->level, e); +} + +static void q_push_front(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_head(q->es, q->qs + e->level, e); +} + +static void q_push_before(struct queue *q, struct entry *old, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_before(q->es, q->qs + e->level, old, e); +} + +static void q_del(struct queue *q, struct entry *e) +{ + l_del(q->es, q->qs + e->level, e); + if (!e->sentinel) + q->nr_elts--; +} + +/* + * Return the oldest entry of the lowest populated level. + */ +static struct entry *q_peek(struct queue *q, unsigned int max_level, bool can_cross_sentinel) +{ + unsigned int level; + struct entry *e; + + max_level = min(max_level, q->nr_levels); + + for (level = 0; level < max_level; level++) + for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) { + if (e->sentinel) { + if (can_cross_sentinel) + continue; + else + break; + } + + return e; + } + + return NULL; +} + +static struct entry *q_pop(struct queue *q) +{ + struct entry *e = q_peek(q, q->nr_levels, true); + + if (e) + q_del(q, e); + + return e; +} + +/* + * This function assumes there is a non-sentinel entry to pop. It's only + * used by redistribute, so we know this is true. It also doesn't adjust + * the q->nr_elts count. + */ +static struct entry *__redist_pop_from(struct queue *q, unsigned int level) +{ + struct entry *e; + + for (; level < q->nr_levels; level++) + for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) + if (!e->sentinel) { + l_del(q->es, q->qs + e->level, e); + return e; + } + + return NULL; +} + +static void q_set_targets_subrange_(struct queue *q, unsigned int nr_elts, + unsigned int lbegin, unsigned int lend) +{ + unsigned int level, nr_levels, entries_per_level, remainder; + + BUG_ON(lbegin > lend); + BUG_ON(lend > q->nr_levels); + nr_levels = lend - lbegin; + entries_per_level = safe_div(nr_elts, nr_levels); + remainder = safe_mod(nr_elts, nr_levels); + + for (level = lbegin; level < lend; level++) + q->target_count[level] = + (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level; +} + +/* + * Typically we have fewer elements in the top few levels which allows us + * to adjust the promote threshold nicely. + */ +static void q_set_targets(struct queue *q) +{ + if (q->last_target_nr_elts == q->nr_elts) + return; + + q->last_target_nr_elts = q->nr_elts; + + if (q->nr_top_levels > q->nr_levels) + q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels); + + else { + q_set_targets_subrange_(q, q->nr_in_top_levels, + q->nr_levels - q->nr_top_levels, q->nr_levels); + + if (q->nr_in_top_levels < q->nr_elts) + q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels, + 0, q->nr_levels - q->nr_top_levels); + else + q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels); + } +} + +static void q_redistribute(struct queue *q) +{ + unsigned int target, level; + struct ilist *l, *l_above; + struct entry *e; + + q_set_targets(q); + + for (level = 0u; level < q->nr_levels - 1u; level++) { + l = q->qs + level; + target = q->target_count[level]; + + /* + * Pull down some entries from the level above. + */ + while (l->nr_elts < target) { + e = __redist_pop_from(q, level + 1u); + if (!e) { + /* bug in nr_elts */ + break; + } + + e->level = level; + l_add_tail(q->es, l, e); + } + + /* + * Push some entries up. + */ + l_above = q->qs + level + 1u; + while (l->nr_elts > target) { + e = l_pop_tail(q->es, l); + + if (!e) + /* bug in nr_elts */ + break; + + e->level = level + 1u; + l_add_tail(q->es, l_above, e); + } + } +} + +static void q_requeue(struct queue *q, struct entry *e, unsigned int extra_levels, + struct entry *s1, struct entry *s2) +{ + struct entry *de; + unsigned int sentinels_passed = 0; + unsigned int new_level = min(q->nr_levels - 1u, e->level + extra_levels); + + /* try and find an entry to swap with */ + if (extra_levels && (e->level < q->nr_levels - 1u)) { + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) + sentinels_passed++; + + if (de) { + q_del(q, de); + de->level = e->level; + if (s1) { + switch (sentinels_passed) { + case 0: + q_push_before(q, s1, de); + break; + + case 1: + q_push_before(q, s2, de); + break; + + default: + q_push(q, de); + } + } else + q_push(q, de); + } + } + + q_del(q, e); + e->level = new_level; + q_push(q, e); +} + +/*----------------------------------------------------------------*/ + +#define FP_SHIFT 8 +#define SIXTEENTH (1u << (FP_SHIFT - 4u)) +#define EIGHTH (1u << (FP_SHIFT - 3u)) + +struct stats { + unsigned int hit_threshold; + unsigned int hits; + unsigned int misses; +}; + +enum performance { + Q_POOR, + Q_FAIR, + Q_WELL +}; + +static void stats_init(struct stats *s, unsigned int nr_levels) +{ + s->hit_threshold = (nr_levels * 3u) / 4u; + s->hits = 0u; + s->misses = 0u; +} + +static void stats_reset(struct stats *s) +{ + s->hits = s->misses = 0u; +} + +static void stats_level_accessed(struct stats *s, unsigned int level) +{ + if (level >= s->hit_threshold) + s->hits++; + else + s->misses++; +} + +static void stats_miss(struct stats *s) +{ + s->misses++; +} + +/* + * There are times when we don't have any confidence in the hotspot queue. + * Such as when a fresh cache is created and the blocks have been spread + * out across the levels, or if an io load changes. We detect this by + * seeing how often a lookup is in the top levels of the hotspot queue. + */ +static enum performance stats_assess(struct stats *s) +{ + unsigned int confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses); + + if (confidence < SIXTEENTH) + return Q_POOR; + + else if (confidence < EIGHTH) + return Q_FAIR; + + else + return Q_WELL; +} + +/*----------------------------------------------------------------*/ + +struct smq_hash_table { + struct entry_space *es; + unsigned long long hash_bits; + unsigned int *buckets; +}; + +/* + * All cache entries are stored in a chained hash table. To save space we + * use indexing again, and only store indexes to the next entry. + */ +static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned int nr_entries) +{ + unsigned int i, nr_buckets; + + ht->es = es; + nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); + ht->hash_bits = __ffs(nr_buckets); + + ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets))); + if (!ht->buckets) + return -ENOMEM; + + for (i = 0; i < nr_buckets; i++) + ht->buckets[i] = INDEXER_NULL; + + return 0; +} + +static void h_exit(struct smq_hash_table *ht) +{ + vfree(ht->buckets); +} + +static struct entry *h_head(struct smq_hash_table *ht, unsigned int bucket) +{ + return to_entry(ht->es, ht->buckets[bucket]); +} + +static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) +{ + return to_entry(ht->es, e->hash_next); +} + +static void __h_insert(struct smq_hash_table *ht, unsigned int bucket, struct entry *e) +{ + e->hash_next = ht->buckets[bucket]; + ht->buckets[bucket] = to_index(ht->es, e); +} + +static void h_insert(struct smq_hash_table *ht, struct entry *e) +{ + unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits); + __h_insert(ht, h, e); +} + +static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned int h, dm_oblock_t oblock, + struct entry **prev) +{ + struct entry *e; + + *prev = NULL; + for (e = h_head(ht, h); e; e = h_next(ht, e)) { + if (e->oblock == oblock) + return e; + + *prev = e; + } + + return NULL; +} + +static void __h_unlink(struct smq_hash_table *ht, unsigned int h, + struct entry *e, struct entry *prev) +{ + if (prev) + prev->hash_next = e->hash_next; + else + ht->buckets[h] = e->hash_next; +} + +/* + * Also moves each entry to the front of the bucket. + */ +static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) +{ + struct entry *e, *prev; + unsigned int h = hash_64(from_oblock(oblock), ht->hash_bits); + + e = __h_lookup(ht, h, oblock, &prev); + if (e && prev) { + /* + * Move to the front because this entry is likely + * to be hit again. + */ + __h_unlink(ht, h, e, prev); + __h_insert(ht, h, e); + } + + return e; +} + +static void h_remove(struct smq_hash_table *ht, struct entry *e) +{ + unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits); + struct entry *prev; + + /* + * The down side of using a singly linked list is we have to + * iterate the bucket to remove an item. + */ + e = __h_lookup(ht, h, e->oblock, &prev); + if (e) + __h_unlink(ht, h, e, prev); +} + +/*----------------------------------------------------------------*/ + +struct entry_alloc { + struct entry_space *es; + unsigned int begin; + + unsigned int nr_allocated; + struct ilist free; +}; + +static void init_allocator(struct entry_alloc *ea, struct entry_space *es, + unsigned int begin, unsigned int end) +{ + unsigned int i; + + ea->es = es; + ea->nr_allocated = 0u; + ea->begin = begin; + + l_init(&ea->free); + for (i = begin; i != end; i++) + l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i)); +} + +static void init_entry(struct entry *e) +{ + /* + * We can't memset because that would clear the hotspot and + * sentinel bits which remain constant. + */ + e->hash_next = INDEXER_NULL; + e->next = INDEXER_NULL; + e->prev = INDEXER_NULL; + e->level = 0u; + e->dirty = true; /* FIXME: audit */ + e->allocated = true; + e->sentinel = false; + e->pending_work = false; +} + +static struct entry *alloc_entry(struct entry_alloc *ea) +{ + struct entry *e; + + if (l_empty(&ea->free)) + return NULL; + + e = l_pop_head(ea->es, &ea->free); + init_entry(e); + ea->nr_allocated++; + + return e; +} + +/* + * This assumes the cblock hasn't already been allocated. + */ +static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned int i) +{ + struct entry *e = __get_entry(ea->es, ea->begin + i); + + BUG_ON(e->allocated); + + l_del(ea->es, &ea->free, e); + init_entry(e); + ea->nr_allocated++; + + return e; +} + +static void free_entry(struct entry_alloc *ea, struct entry *e) +{ + BUG_ON(!ea->nr_allocated); + BUG_ON(!e->allocated); + + ea->nr_allocated--; + e->allocated = false; + l_add_tail(ea->es, &ea->free, e); +} + +static bool allocator_empty(struct entry_alloc *ea) +{ + return l_empty(&ea->free); +} + +static unsigned int get_index(struct entry_alloc *ea, struct entry *e) +{ + return to_index(ea->es, e) - ea->begin; +} + +static struct entry *get_entry(struct entry_alloc *ea, unsigned int index) +{ + return __get_entry(ea->es, ea->begin + index); +} + +/*----------------------------------------------------------------*/ + +#define NR_HOTSPOT_LEVELS 64u +#define NR_CACHE_LEVELS 64u + +#define WRITEBACK_PERIOD (10ul * HZ) +#define DEMOTE_PERIOD (60ul * HZ) + +#define HOTSPOT_UPDATE_PERIOD (HZ) +#define CACHE_UPDATE_PERIOD (60ul * HZ) + +struct smq_policy { + struct dm_cache_policy policy; + + /* protects everything */ + spinlock_t lock; + dm_cblock_t cache_size; + sector_t cache_block_size; + + sector_t hotspot_block_size; + unsigned int nr_hotspot_blocks; + unsigned int cache_blocks_per_hotspot_block; + unsigned int hotspot_level_jump; + + struct entry_space es; + struct entry_alloc writeback_sentinel_alloc; + struct entry_alloc demote_sentinel_alloc; + struct entry_alloc hotspot_alloc; + struct entry_alloc cache_alloc; + + unsigned long *hotspot_hit_bits; + unsigned long *cache_hit_bits; + + /* + * We maintain three queues of entries. The cache proper, + * consisting of a clean and dirty queue, containing the currently + * active mappings. The hotspot queue uses a larger block size to + * track blocks that are being hit frequently and potential + * candidates for promotion to the cache. + */ + struct queue hotspot; + struct queue clean; + struct queue dirty; + + struct stats hotspot_stats; + struct stats cache_stats; + + /* + * Keeps track of time, incremented by the core. We use this to + * avoid attributing multiple hits within the same tick. + */ + unsigned int tick; + + /* + * The hash tables allows us to quickly find an entry by origin + * block. + */ + struct smq_hash_table table; + struct smq_hash_table hotspot_table; + + bool current_writeback_sentinels; + unsigned long next_writeback_period; + + bool current_demote_sentinels; + unsigned long next_demote_period; + + unsigned int write_promote_level; + unsigned int read_promote_level; + + unsigned long next_hotspot_period; + unsigned long next_cache_period; + + struct background_tracker *bg_work; + + bool migrations_allowed:1; + + /* + * If this is set the policy will try and clean the whole cache + * even if the device is not idle. + */ + bool cleaner:1; +}; + +/*----------------------------------------------------------------*/ + +static struct entry *get_sentinel(struct entry_alloc *ea, unsigned int level, bool which) +{ + return get_entry(ea, which ? level : NR_CACHE_LEVELS + level); +} + +static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned int level) +{ + return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels); +} + +static struct entry *demote_sentinel(struct smq_policy *mq, unsigned int level) +{ + return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels); +} + +static void __update_writeback_sentinels(struct smq_policy *mq) +{ + unsigned int level; + struct queue *q = &mq->dirty; + struct entry *sentinel; + + for (level = 0; level < q->nr_levels; level++) { + sentinel = writeback_sentinel(mq, level); + q_del(q, sentinel); + q_push(q, sentinel); + } +} + +static void __update_demote_sentinels(struct smq_policy *mq) +{ + unsigned int level; + struct queue *q = &mq->clean; + struct entry *sentinel; + + for (level = 0; level < q->nr_levels; level++) { + sentinel = demote_sentinel(mq, level); + q_del(q, sentinel); + q_push(q, sentinel); + } +} + +static void update_sentinels(struct smq_policy *mq) +{ + if (time_after(jiffies, mq->next_writeback_period)) { + mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; + mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + __update_writeback_sentinels(mq); + } + + if (time_after(jiffies, mq->next_demote_period)) { + mq->next_demote_period = jiffies + DEMOTE_PERIOD; + mq->current_demote_sentinels = !mq->current_demote_sentinels; + __update_demote_sentinels(mq); + } +} + +static void __sentinels_init(struct smq_policy *mq) +{ + unsigned int level; + struct entry *sentinel; + + for (level = 0; level < NR_CACHE_LEVELS; level++) { + sentinel = writeback_sentinel(mq, level); + sentinel->level = level; + q_push(&mq->dirty, sentinel); + + sentinel = demote_sentinel(mq, level); + sentinel->level = level; + q_push(&mq->clean, sentinel); + } +} + +static void sentinels_init(struct smq_policy *mq) +{ + mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; + mq->next_demote_period = jiffies + DEMOTE_PERIOD; + + mq->current_writeback_sentinels = false; + mq->current_demote_sentinels = false; + __sentinels_init(mq); + + mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + mq->current_demote_sentinels = !mq->current_demote_sentinels; + __sentinels_init(mq); +} + +/*----------------------------------------------------------------*/ + +static void del_queue(struct smq_policy *mq, struct entry *e) +{ + q_del(e->dirty ? &mq->dirty : &mq->clean, e); +} + +static void push_queue(struct smq_policy *mq, struct entry *e) +{ + if (e->dirty) + q_push(&mq->dirty, e); + else + q_push(&mq->clean, e); +} + +// !h, !q, a -> h, q, a +static void push(struct smq_policy *mq, struct entry *e) +{ + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue(mq, e); +} + +static void push_queue_front(struct smq_policy *mq, struct entry *e) +{ + if (e->dirty) + q_push_front(&mq->dirty, e); + else + q_push_front(&mq->clean, e); +} + +static void push_front(struct smq_policy *mq, struct entry *e) +{ + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue_front(mq, e); +} + +static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) +{ + return to_cblock(get_index(&mq->cache_alloc, e)); +} + +static void requeue(struct smq_policy *mq, struct entry *e) +{ + /* + * Pending work has temporarily been taken out of the queues. + */ + if (e->pending_work) + return; + + if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { + if (!e->dirty) { + q_requeue(&mq->clean, e, 1u, NULL, NULL); + return; + } + + q_requeue(&mq->dirty, e, 1u, + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); + } +} + +static unsigned int default_promote_level(struct smq_policy *mq) +{ + /* + * The promote level depends on the current performance of the + * cache. + * + * If the cache is performing badly, then we can't afford + * to promote much without causing performance to drop below that + * of the origin device. + * + * If the cache is performing well, then we don't need to promote + * much. If it isn't broken, don't fix it. + * + * If the cache is middling then we promote more. + * + * This scheme reminds me of a graph of entropy vs probability of a + * binary variable. + */ + static const unsigned int table[] = { + 1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1 + }; + + unsigned int hits = mq->cache_stats.hits; + unsigned int misses = mq->cache_stats.misses; + unsigned int index = safe_div(hits << 4u, hits + misses); + return table[index]; +} + +static void update_promote_levels(struct smq_policy *mq) +{ + /* + * If there are unused cache entries then we want to be really + * eager to promote. + */ + unsigned int threshold_level = allocator_empty(&mq->cache_alloc) ? + default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); + + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); + + /* + * If the hotspot queue is performing badly then we have little + * confidence that we know which blocks to promote. So we cut down + * the amount of promotions. + */ + switch (stats_assess(&mq->hotspot_stats)) { + case Q_POOR: + threshold_level /= 4u; + break; + + case Q_FAIR: + threshold_level /= 2u; + break; + + case Q_WELL: + break; + } + + mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); +} + +/* + * If the hotspot queue is performing badly, then we try and move entries + * around more quickly. + */ +static void update_level_jump(struct smq_policy *mq) +{ + switch (stats_assess(&mq->hotspot_stats)) { + case Q_POOR: + mq->hotspot_level_jump = 4u; + break; + + case Q_FAIR: + mq->hotspot_level_jump = 2u; + break; + + case Q_WELL: + mq->hotspot_level_jump = 1u; + break; + } +} + +static void end_hotspot_period(struct smq_policy *mq) +{ + clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks); + update_promote_levels(mq); + + if (time_after(jiffies, mq->next_hotspot_period)) { + update_level_jump(mq); + q_redistribute(&mq->hotspot); + stats_reset(&mq->hotspot_stats); + mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD; + } +} + +static void end_cache_period(struct smq_policy *mq) +{ + if (time_after(jiffies, mq->next_cache_period)) { + clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size)); + + q_redistribute(&mq->dirty); + q_redistribute(&mq->clean); + stats_reset(&mq->cache_stats); + + mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD; + } +} + +/*----------------------------------------------------------------*/ + +/* + * Targets are given as a percentage. + */ +#define CLEAN_TARGET 25u +#define FREE_TARGET 25u + +static unsigned int percent_to_target(struct smq_policy *mq, unsigned int p) +{ + return from_cblock(mq->cache_size) * p / 100u; +} + +static bool clean_target_met(struct smq_policy *mq, bool idle) +{ + /* + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ + if (idle || mq->cleaner) { + /* + * We'd like to clean everything. + */ + return q_size(&mq->dirty) == 0u; + } + + /* + * If we're busy we don't worry about cleaning at all. + */ + return true; +} + +static bool free_target_met(struct smq_policy *mq) +{ + unsigned int nr_free; + + nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated; + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= + percent_to_target(mq, FREE_TARGET); +} + +/*----------------------------------------------------------------*/ + +static void mark_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(e->sentinel); + BUG_ON(!e->allocated); + BUG_ON(e->pending_work); + e->pending_work = true; +} + +static void clear_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(!e->pending_work); + e->pending_work = false; +} + +static void queue_writeback(struct smq_policy *mq, bool idle) +{ + int r; + struct policy_work work; + struct entry *e; + + e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle); + if (e) { + mark_pending(mq, e); + q_del(&mq->dirty, e); + + work.op = POLICY_WRITEBACK; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + + r = btracker_queue(mq->bg_work, &work, NULL); + if (r) { + clear_pending(mq, e); + q_push_front(&mq->dirty, e); + } + } +} + +static void queue_demotion(struct smq_policy *mq) +{ + int r; + struct policy_work work; + struct entry *e; + + if (WARN_ON_ONCE(!mq->migrations_allowed)) + return; + + e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); + if (!e) { + if (!clean_target_met(mq, true)) + queue_writeback(mq, false); + return; + } + + mark_pending(mq, e); + q_del(&mq->clean, e); + + work.op = POLICY_DEMOTE; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + r = btracker_queue(mq->bg_work, &work, NULL); + if (r) { + clear_pending(mq, e); + q_push_front(&mq->clean, e); + } +} + +static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, + struct policy_work **workp) +{ + int r; + struct entry *e; + struct policy_work work; + + if (!mq->migrations_allowed) + return; + + if (allocator_empty(&mq->cache_alloc)) { + /* + * We always claim to be 'idle' to ensure some demotions happen + * with continuous loads. + */ + if (!free_target_met(mq)) + queue_demotion(mq); + return; + } + + if (btracker_promotion_already_present(mq->bg_work, oblock)) + return; + + /* + * We allocate the entry now to reserve the cblock. If the + * background work is aborted we must remember to free it. + */ + e = alloc_entry(&mq->cache_alloc); + BUG_ON(!e); + e->pending_work = true; + work.op = POLICY_PROMOTE; + work.oblock = oblock; + work.cblock = infer_cblock(mq, e); + r = btracker_queue(mq->bg_work, &work, workp); + if (r) + free_entry(&mq->cache_alloc, e); +} + +/*----------------------------------------------------------------*/ + +enum promote_result { + PROMOTE_NOT, + PROMOTE_TEMPORARY, + PROMOTE_PERMANENT +}; + +/* + * Converts a boolean into a promote result. + */ +static enum promote_result maybe_promote(bool promote) +{ + return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; +} + +static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, + int data_dir, bool fast_promote) +{ + if (data_dir == WRITE) { + if (!allocator_empty(&mq->cache_alloc) && fast_promote) + return PROMOTE_TEMPORARY; + + return maybe_promote(hs_e->level >= mq->write_promote_level); + } else + return maybe_promote(hs_e->level >= mq->read_promote_level); +} + +static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) +{ + sector_t r = from_oblock(b); + (void) sector_div(r, mq->cache_blocks_per_hotspot_block); + return to_oblock(r); +} + +static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) +{ + unsigned int hi; + dm_oblock_t hb = to_hblock(mq, b); + struct entry *e = h_lookup(&mq->hotspot_table, hb); + + if (e) { + stats_level_accessed(&mq->hotspot_stats, e->level); + + hi = get_index(&mq->hotspot_alloc, e); + q_requeue(&mq->hotspot, e, + test_and_set_bit(hi, mq->hotspot_hit_bits) ? + 0u : mq->hotspot_level_jump, + NULL, NULL); + + } else { + stats_miss(&mq->hotspot_stats); + + e = alloc_entry(&mq->hotspot_alloc); + if (!e) { + e = q_pop(&mq->hotspot); + if (e) { + h_remove(&mq->hotspot_table, e); + hi = get_index(&mq->hotspot_alloc, e); + clear_bit(hi, mq->hotspot_hit_bits); + } + + } + + if (e) { + e->oblock = hb; + q_push(&mq->hotspot, e); + h_insert(&mq->hotspot_table, e); + } + } + + return e; +} + +/*----------------------------------------------------------------*/ + +/* + * Public interface, via the policy struct. See dm-cache-policy.h for a + * description of these. + */ + +static struct smq_policy *to_smq_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct smq_policy, policy); +} + +static void smq_destroy(struct dm_cache_policy *p) +{ + struct smq_policy *mq = to_smq_policy(p); + + btracker_destroy(mq->bg_work); + h_exit(&mq->hotspot_table); + h_exit(&mq->table); + free_bitset(mq->hotspot_hit_bits); + free_bitset(mq->cache_hit_bits); + space_exit(&mq->es); + kfree(mq); +} + +/*----------------------------------------------------------------*/ + +static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work, bool *background_work) +{ + struct entry *e, *hs_e; + enum promote_result pr; + + *background_work = false; + + e = h_lookup(&mq->table, oblock); + if (e) { + stats_level_accessed(&mq->cache_stats, e->level); + + requeue(mq, e); + *cblock = infer_cblock(mq, e); + return 0; + + } else { + stats_miss(&mq->cache_stats); + + /* + * The hotspot queue only gets updated with misses. + */ + hs_e = update_hotspot_queue(mq, oblock); + + pr = should_promote(mq, hs_e, data_dir, fast_copy); + if (pr != PROMOTE_NOT) { + queue_promotion(mq, oblock, work); + *background_work = true; + } + + return -ENOENT; + } +} + +static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + bool *background_work) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, + data_dir, fast_copy, + NULL, background_work); + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +static int smq_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) +{ + int r; + bool background_queued; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +static int smq_get_background_work(struct dm_cache_policy *p, bool idle, + struct policy_work **result) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = btracker_issue(mq->bg_work, result); + if (r == -ENODATA) { + if (!clean_target_met(mq, idle)) { + queue_writeback(mq, idle); + r = btracker_issue(mq->bg_work, result); + } + } + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +/* + * We need to clear any pending work flags that have been set, and in the + * case of promotion free the entry for the destination cblock. + */ +static void __complete_background_work(struct smq_policy *mq, + struct policy_work *work, + bool success) +{ + struct entry *e = get_entry(&mq->cache_alloc, + from_cblock(work->cblock)); + + switch (work->op) { + case POLICY_PROMOTE: + // !h, !q, a + clear_pending(mq, e); + if (success) { + e->oblock = work->oblock; + e->level = NR_CACHE_LEVELS - 1; + push(mq, e); + // h, q, a + } else { + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } + break; + + case POLICY_DEMOTE: + // h, !q, a + if (success) { + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } else { + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + } + break; + + case POLICY_WRITEBACK: + // h, !q, a + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + break; + } + + btracker_complete(mq->bg_work, work); +} + +static void smq_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) +{ + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + __complete_background_work(mq, work, success); + spin_unlock_irqrestore(&mq->lock, flags); +} + +// in_hash(oblock) -> in_hash(oblock) +static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) +{ + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (e->pending_work) + e->dirty = set; + else { + del_queue(mq, e); + e->dirty = set; + push_queue(mq, e); + } +} + +static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, true); + spin_unlock_irqrestore(&mq->lock, flags); +} + +static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; + + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, false); + spin_unlock_irqrestore(&mq->lock, flags); +} + +static unsigned int random_level(dm_cblock_t cblock) +{ + return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); +} + +static int smq_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e; + + e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); + e->oblock = oblock; + e->dirty = dirty; + e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); + e->pending_work = false; + + /* + * When we load mappings we push ahead of both sentinels in order to + * allow demotions and cleaning to occur immediately. + */ + push_front(mq, e); + + return 0; +} + +static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (!e->allocated) + return -ENODATA; + + // FIXME: what if this block has pending background work? + del_queue(mq, e); + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + return 0; +} + +static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (!e->allocated) + return 0; + + return e->level; +} + +static dm_cblock_t smq_residency(struct dm_cache_policy *p) +{ + dm_cblock_t r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = to_cblock(mq->cache_alloc.nr_allocated); + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +static void smq_tick(struct dm_cache_policy *p, bool can_block) +{ + struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; + + spin_lock_irqsave(&mq->lock, flags); + mq->tick++; + update_sentinels(mq); + end_hotspot_period(mq); + end_cache_period(mq); + spin_unlock_irqrestore(&mq->lock, flags); +} + +static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + struct smq_policy *mq = to_smq_policy(p); + mq->migrations_allowed = allow; +} + +/* + * smq has no config values, but the old mq policy did. To avoid breaking + * software we continue to accept these configurables for the mq policy, + * but they have no effect. + */ +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + unsigned long tmp; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + if (!strcasecmp(key, "random_threshold") || + !strcasecmp(key, "sequential_threshold") || + !strcasecmp(key, "discard_promote_adjustment") || + !strcasecmp(key, "read_promote_adjustment") || + !strcasecmp(key, "write_promote_adjustment")) { + DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key); + return 0; + } + + return -EINVAL; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned int maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + + DMEMIT("10 random_threshold 0 " + "sequential_threshold 0 " + "discard_promote_adjustment 0 " + "read_promote_adjustment 0 " + "write_promote_adjustment 0 "); + + *sz_ptr = sz; + return 0; +} + +/* Init the policy plugin interface function pointers. */ +static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) +{ + mq->policy.destroy = smq_destroy; + mq->policy.lookup = smq_lookup; + mq->policy.lookup_with_work = smq_lookup_with_work; + mq->policy.get_background_work = smq_get_background_work; + mq->policy.complete_background_work = smq_complete_background_work; + mq->policy.set_dirty = smq_set_dirty; + mq->policy.clear_dirty = smq_clear_dirty; + mq->policy.load_mapping = smq_load_mapping; + mq->policy.invalidate_mapping = smq_invalidate_mapping; + mq->policy.get_hint = smq_get_hint; + mq->policy.residency = smq_residency; + mq->policy.tick = smq_tick; + mq->policy.allow_migrations = smq_allow_migrations; + + if (mimic_mq) { + mq->policy.set_config_value = mq_set_config_value; + mq->policy.emit_config_values = mq_emit_config_values; + } +} + +static bool too_many_hotspot_blocks(sector_t origin_size, + sector_t hotspot_block_size, + unsigned int nr_hotspot_blocks) +{ + return (hotspot_block_size * nr_hotspot_blocks) > origin_size; +} + +static void calc_hotspot_params(sector_t origin_size, + sector_t cache_block_size, + unsigned int nr_cache_blocks, + sector_t *hotspot_block_size, + unsigned int *nr_hotspot_blocks) +{ + *hotspot_block_size = cache_block_size * 16u; + *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u); + + while ((*hotspot_block_size > cache_block_size) && + too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks)) + *hotspot_block_size /= 2u; +} + +static struct dm_cache_policy * +__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, + bool mimic_mq, bool migrations_allowed, bool cleaner) +{ + unsigned int i; + unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; + unsigned int total_sentinels = 2u * nr_sentinels_per_queue; + struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); + + if (!mq) + return NULL; + + init_policy_functions(mq, mimic_mq); + mq->cache_size = cache_size; + mq->cache_block_size = cache_block_size; + + calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size), + &mq->hotspot_block_size, &mq->nr_hotspot_blocks); + + mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size); + mq->hotspot_level_jump = 1u; + if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) { + DMERR("couldn't initialize entry space"); + goto bad_pool_init; + } + + init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); + for (i = 0; i < nr_sentinels_per_queue; i++) + get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; + + init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); + for (i = 0; i < nr_sentinels_per_queue; i++) + get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; + + init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, + total_sentinels + mq->nr_hotspot_blocks); + + init_allocator(&mq->cache_alloc, &mq->es, + total_sentinels + mq->nr_hotspot_blocks, + total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size)); + + mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks); + if (!mq->hotspot_hit_bits) { + DMERR("couldn't allocate hotspot hit bitset"); + goto bad_hotspot_hit_bits; + } + clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks); + + if (from_cblock(cache_size)) { + mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size)); + if (!mq->cache_hit_bits) { + DMERR("couldn't allocate cache hit bitset"); + goto bad_cache_hit_bits; + } + clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size)); + } else + mq->cache_hit_bits = NULL; + + mq->tick = 0; + spin_lock_init(&mq->lock); + + q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS); + mq->hotspot.nr_top_levels = 8; + mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS, + from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block); + + q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS); + q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS); + + stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS); + stats_init(&mq->cache_stats, NR_CACHE_LEVELS); + + if (h_init(&mq->table, &mq->es, from_cblock(cache_size))) + goto bad_alloc_table; + + if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks)) + goto bad_alloc_hotspot_table; + + sentinels_init(mq); + mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS; + + mq->next_hotspot_period = jiffies; + mq->next_cache_period = jiffies; + + mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */ + if (!mq->bg_work) + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; + mq->cleaner = cleaner; + + return &mq->policy; + +bad_btracker: + h_exit(&mq->hotspot_table); +bad_alloc_hotspot_table: + h_exit(&mq->table); +bad_alloc_table: + free_bitset(mq->cache_hit_bits); +bad_cache_hit_bits: + free_bitset(mq->hotspot_hit_bits); +bad_hotspot_hit_bits: + space_exit(&mq->es); +bad_pool_init: + kfree(mq); + + return NULL; +} + +static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, + false, true, false); +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, + true, true, false); +} + +static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, + false, false, true); +} + +/*----------------------------------------------------------------*/ + +static struct dm_cache_policy_type smq_policy_type = { + .name = "smq", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = smq_create +}; + +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create, +}; + +static struct dm_cache_policy_type cleaner_policy_type = { + .name = "cleaner", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = cleaner_create, +}; + +static struct dm_cache_policy_type default_policy_type = { + .name = "default", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = smq_create, + .real = &smq_policy_type +}; + +static int __init smq_init(void) +{ + int r; + + r = dm_cache_policy_register(&smq_policy_type); + if (r) { + DMERR("register failed %d", r); + return -ENOMEM; + } + + r = dm_cache_policy_register(&mq_policy_type); + if (r) { + DMERR("register failed (as mq) %d", r); + goto out_mq; + } + + r = dm_cache_policy_register(&cleaner_policy_type); + if (r) { + DMERR("register failed (as cleaner) %d", r); + goto out_cleaner; + } + + r = dm_cache_policy_register(&default_policy_type); + if (r) { + DMERR("register failed (as default) %d", r); + goto out_default; + } + + return 0; + +out_default: + dm_cache_policy_unregister(&cleaner_policy_type); +out_cleaner: + dm_cache_policy_unregister(&mq_policy_type); +out_mq: + dm_cache_policy_unregister(&smq_policy_type); + + return -ENOMEM; +} + +static void __exit smq_exit(void) +{ + dm_cache_policy_unregister(&cleaner_policy_type); + dm_cache_policy_unregister(&smq_policy_type); + dm_cache_policy_unregister(&mq_policy_type); + dm_cache_policy_unregister(&default_policy_type); +} + +module_init(smq_init); +module_exit(smq_exit); + +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("smq cache policy"); + +MODULE_ALIAS("dm-cache-default"); +MODULE_ALIAS("dm-cache-mq"); +MODULE_ALIAS("dm-cache-cleaner"); diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c new file mode 100644 index 000000000..2e58bbcf3 --- /dev/null +++ b/drivers/md/dm-cache-policy.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-policy-internal.h" +#include "dm.h" + +#include +#include + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache-policy" + +static DEFINE_SPINLOCK(register_lock); +static LIST_HEAD(register_list); + +static struct dm_cache_policy_type *__find_policy(const char *name) +{ + struct dm_cache_policy_type *t; + + list_for_each_entry(t, ®ister_list, list) + if (!strcmp(t->name, name)) + return t; + + return NULL; +} + +static struct dm_cache_policy_type *__get_policy_once(const char *name) +{ + struct dm_cache_policy_type *t = __find_policy(name); + + if (t && !try_module_get(t->owner)) { + DMWARN("couldn't get module %s", name); + t = ERR_PTR(-EINVAL); + } + + return t; +} + +static struct dm_cache_policy_type *get_policy_once(const char *name) +{ + struct dm_cache_policy_type *t; + + spin_lock(®ister_lock); + t = __get_policy_once(name); + spin_unlock(®ister_lock); + + return t; +} + +static struct dm_cache_policy_type *get_policy(const char *name) +{ + struct dm_cache_policy_type *t; + + t = get_policy_once(name); + if (IS_ERR(t)) + return NULL; + + if (t) + return t; + + request_module("dm-cache-%s", name); + + t = get_policy_once(name); + if (IS_ERR(t)) + return NULL; + + return t; +} + +static void put_policy(struct dm_cache_policy_type *t) +{ + module_put(t->owner); +} + +int dm_cache_policy_register(struct dm_cache_policy_type *type) +{ + int r; + + /* One size fits all for now */ + if (type->hint_size != 0 && type->hint_size != 4) { + DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size); + return -EINVAL; + } + + spin_lock(®ister_lock); + if (__find_policy(type->name)) { + DMWARN("attempt to register policy under duplicate name %s", type->name); + r = -EINVAL; + } else { + list_add(&type->list, ®ister_list); + r = 0; + } + spin_unlock(®ister_lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_register); + +void dm_cache_policy_unregister(struct dm_cache_policy_type *type) +{ + spin_lock(®ister_lock); + list_del_init(&type->list); + spin_unlock(®ister_lock); +} +EXPORT_SYMBOL_GPL(dm_cache_policy_unregister); + +struct dm_cache_policy *dm_cache_policy_create(const char *name, + dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + struct dm_cache_policy *p = NULL; + struct dm_cache_policy_type *type; + + type = get_policy(name); + if (!type) { + DMWARN("unknown policy type"); + return ERR_PTR(-EINVAL); + } + + p = type->create(cache_size, origin_size, cache_block_size); + if (!p) { + put_policy(type); + return ERR_PTR(-ENOMEM); + } + p->private = type; + + return p; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_create); + +void dm_cache_policy_destroy(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + p->destroy(p); + put_policy(t); +} +EXPORT_SYMBOL_GPL(dm_cache_policy_destroy); + +const char *dm_cache_policy_get_name(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + /* if t->real is set then an alias was used (e.g. "default") */ + if (t->real) + return t->real->name; + + return t->name; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); + +const unsigned int *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + +size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->hint_size; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h new file mode 100644 index 000000000..6ba3e9c91 --- /dev/null +++ b/drivers/md/dm-cache-policy.h @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_POLICY_H +#define DM_CACHE_POLICY_H + +#include "dm-cache-block-types.h" + +#include + +/*----------------------------------------------------------------*/ + +/* + * The cache policy makes the important decisions about which blocks get to + * live on the faster cache device. + */ +enum policy_operation { + POLICY_PROMOTE, + POLICY_DEMOTE, + POLICY_WRITEBACK +}; + +/* + * This is the instruction passed back to the core target. + */ +struct policy_work { + enum policy_operation op; + dm_oblock_t oblock; + dm_cblock_t cblock; +}; + +/* + * The cache policy object. It is envisaged that this structure will be + * embedded in a bigger, policy specific structure (ie. use container_of()). + */ +struct dm_cache_policy { + /* + * Destroys this object. + */ + void (*destroy)(struct dm_cache_policy *p); + + /* + * Find the location of a block. + * + * Must not block. + * + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for + * other errors (-EWOULDBLOCK would be typical). data_dir should be + * READ or WRITE. fast_copy should be set if migrating this block would + * be 'cheap' somehow (eg, discarded data). background_queued will be set + * if a migration has just been queued. + */ + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued); + + /* + * Sometimes the core target can optimise a migration, eg, the + * block may be discarded, or the bio may cover an entire block. + * In order to optimise it needs the migration immediately though + * so it knows to do something different with the bio. + * + * This method is optional (policy-internal will fallback to using + * lookup). + */ + int (*lookup_with_work)(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work); + + /* + * Retrieves background work. Returns -ENODATA when there's no + * background work. + */ + int (*get_background_work)(struct dm_cache_policy *p, bool idle, + struct policy_work **result); + + /* + * You must pass in the same work pointer that you were given, not + * a copy. + */ + void (*complete_background_work)(struct dm_cache_policy *p, + struct policy_work *work, + bool success); + + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + + /* + * Called when a cache target is first created. Used to load a + * mapping from the metadata device into the policy. + */ + int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); + + /* + * Drops the mapping, irrespective of whether it's clean or dirty. + * Returns -ENODATA if cblock is not mapped. + */ + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); + + /* + * Gets the hint for a given cblock. Called in a single threaded + * context. So no locking required. + */ + uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); + + /* + * How full is the cache? + */ + dm_cblock_t (*residency)(struct dm_cache_policy *p); + + /* + * Because of where we sit in the block layer, we can be asked to + * map a lot of little bios that are all in the same block (no + * queue merging has occurred). To stop the policy being fooled by + * these, the core target sends regular tick() calls to the policy. + * The policy should only count an entry as hit once per tick. + * + * This method is optional. + */ + void (*tick)(struct dm_cache_policy *p, bool can_block); + + /* + * Configuration. + */ + int (*emit_config_values)(struct dm_cache_policy *p, char *result, + unsigned int maxlen, ssize_t *sz_ptr); + int (*set_config_value)(struct dm_cache_policy *p, + const char *key, const char *value); + + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); + + /* + * Book keeping ptr for the policy register, not for general use. + */ + void *private; +}; + +/*----------------------------------------------------------------*/ + +/* + * We maintain a little register of the different policy types. + */ +#define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 + +struct dm_cache_policy_type { + /* For use by the register code only. */ + struct list_head list; + + /* + * Policy writers should fill in these fields. The name field is + * what gets passed on the target line to select your policy. + */ + char name[CACHE_POLICY_NAME_SIZE]; + unsigned int version[CACHE_POLICY_VERSION_SIZE]; + + /* + * For use by an alias dm_cache_policy_type to point to the + * real dm_cache_policy_type. + */ + struct dm_cache_policy_type *real; + + /* + * Policies may store a hint for each cache block. + * Currently the size of this hint must be 0 or 4 bytes but we + * expect to relax this in future. + */ + size_t hint_size; + + struct module *owner; + struct dm_cache_policy *(*create)(dm_cblock_t cache_size, + sector_t origin_size, + sector_t block_size); +}; + +int dm_cache_policy_register(struct dm_cache_policy_type *type); +void dm_cache_policy_unregister(struct dm_cache_policy_type *type); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_POLICY_H */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c new file mode 100644 index 000000000..8f7426b71 --- /dev/null +++ b/drivers/md/dm-cache-target.c @@ -0,0 +1,3454 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison-v2.h" +#include "dm-bio-record.h" +#include "dm-cache-metadata.h" +#include "dm-io-tracker.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "cache" + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, + "A percentage of time allocated for copying to and/or from cache"); + +/*----------------------------------------------------------------*/ + +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ + +/* + * Represents a chunk of future work. 'input' allows continuations to pass + * values between themselves, typically error values. + */ +struct continuation { + struct work_struct ws; + blk_status_t input; +}; + +static inline void init_continuation(struct continuation *k, + void (*fn)(struct work_struct *)) +{ + INIT_WORK(&k->ws, fn); + k->input = 0; +} + +static inline void queue_continuation(struct workqueue_struct *wq, + struct continuation *k) +{ + queue_work(wq, &k->ws); +} + +/*----------------------------------------------------------------*/ + +/* + * The batcher collects together pieces of work that need a particular + * operation to occur before they can proceed (typically a commit). + */ +struct batcher { + /* + * The operation that everyone is waiting for. + */ + blk_status_t (*commit_op)(void *context); + void *commit_context; + + /* + * This is how bios should be issued once the commit op is complete + * (accounted_request). + */ + void (*issue_op)(struct bio *bio, void *context); + void *issue_context; + + /* + * Queued work gets put on here after commit. + */ + struct workqueue_struct *wq; + + spinlock_t lock; + struct list_head work_items; + struct bio_list bios; + struct work_struct commit_work; + + bool commit_scheduled; +}; + +static void __commit(struct work_struct *_ws) +{ + struct batcher *b = container_of(_ws, struct batcher, commit_work); + blk_status_t r; + struct list_head work_items; + struct work_struct *ws, *tmp; + struct continuation *k; + struct bio *bio; + struct bio_list bios; + + INIT_LIST_HEAD(&work_items); + bio_list_init(&bios); + + /* + * We have to grab these before the commit_op to avoid a race + * condition. + */ + spin_lock_irq(&b->lock); + list_splice_init(&b->work_items, &work_items); + bio_list_merge(&bios, &b->bios); + bio_list_init(&b->bios); + b->commit_scheduled = false; + spin_unlock_irq(&b->lock); + + r = b->commit_op(b->commit_context); + + list_for_each_entry_safe(ws, tmp, &work_items, entry) { + k = container_of(ws, struct continuation, ws); + k->input = r; + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ + queue_work(b->wq, ws); + } + + while ((bio = bio_list_pop(&bios))) { + if (r) { + bio->bi_status = r; + bio_endio(bio); + } else + b->issue_op(bio, b->issue_context); + } +} + +static void batcher_init(struct batcher *b, + blk_status_t (*commit_op)(void *), + void *commit_context, + void (*issue_op)(struct bio *bio, void *), + void *issue_context, + struct workqueue_struct *wq) +{ + b->commit_op = commit_op; + b->commit_context = commit_context; + b->issue_op = issue_op; + b->issue_context = issue_context; + b->wq = wq; + + spin_lock_init(&b->lock); + INIT_LIST_HEAD(&b->work_items); + bio_list_init(&b->bios); + INIT_WORK(&b->commit_work, __commit); + b->commit_scheduled = false; +} + +static void async_commit(struct batcher *b) +{ + queue_work(b->wq, &b->commit_work); +} + +static void continue_after_commit(struct batcher *b, struct continuation *k) +{ + bool commit_scheduled; + + spin_lock_irq(&b->lock); + commit_scheduled = b->commit_scheduled; + list_add_tail(&k->ws.entry, &b->work_items); + spin_unlock_irq(&b->lock); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Bios are errored if commit failed. + */ +static void issue_after_commit(struct batcher *b, struct bio *bio) +{ + bool commit_scheduled; + + spin_lock_irq(&b->lock); + commit_scheduled = b->commit_scheduled; + bio_list_add(&b->bios, bio); + spin_unlock_irq(&b->lock); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Call this if some urgent work is waiting for the commit to complete. + */ +static void schedule_commit(struct batcher *b) +{ + bool immediate; + + spin_lock_irq(&b->lock); + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); + b->commit_scheduled = true; + spin_unlock_irq(&b->lock); + + if (immediate) + async_commit(b); +} + +/* + * There are a couple of places where we let a bio run, but want to do some + * work before calling its endio function. We do this by temporarily + * changing the endio fn. + */ +struct dm_hook_info { + bio_end_io_t *bi_end_io; +}; + +static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, + bio_end_io_t *bi_end_io, void *bi_private) +{ + h->bi_end_io = bio->bi_end_io; + + bio->bi_end_io = bi_end_io; + bio->bi_private = bi_private; +} + +static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) +{ + bio->bi_end_io = h->bi_end_io; +} + +/*----------------------------------------------------------------*/ + +#define MIGRATION_POOL_SIZE 128 +#define COMMIT_PERIOD HZ +#define MIGRATION_COUNT_WINDOW 10 + +/* + * The block size of the device holding cache data must be + * between 32KB and 1GB. + */ +#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) +#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) + +enum cache_metadata_mode { + CM_WRITE, /* metadata may be changed */ + CM_READ_ONLY, /* metadata may not be changed */ + CM_FAIL +}; + +enum cache_io_mode { + /* + * Data is written to cached blocks only. These blocks are marked + * dirty. If you lose the cache device you will lose data. + * Potential performance increase for both reads and writes. + */ + CM_IO_WRITEBACK, + + /* + * Data is written to both cache and origin. Blocks are never + * dirty. Potential performance benfit for reads only. + */ + CM_IO_WRITETHROUGH, + + /* + * A degraded mode useful for various cache coherency situations + * (eg, rolling back snapshots). Reads and writes always go to the + * origin. If a write goes to a cached oblock, then the cache + * block is invalidated. + */ + CM_IO_PASSTHROUGH +}; + +struct cache_features { + enum cache_metadata_mode mode; + enum cache_io_mode io_mode; + unsigned int metadata_version; + bool discard_passdown:1; +}; + +struct cache_stats { + atomic_t read_hit; + atomic_t read_miss; + atomic_t write_hit; + atomic_t write_miss; + atomic_t demotion; + atomic_t promotion; + atomic_t writeback; + atomic_t copies_avoided; + atomic_t cache_cell_clash; + atomic_t commit_count; + atomic_t discard_count; +}; + +struct cache { + struct dm_target *ti; + spinlock_t lock; + + /* + * Fields for converting from sectors to blocks. + */ + int sectors_per_block_shift; + sector_t sectors_per_block; + + struct dm_cache_metadata *cmd; + + /* + * Metadata is written to this device. + */ + struct dm_dev *metadata_dev; + + /* + * The slower of the two data devices. Typically a spindle. + */ + struct dm_dev *origin_dev; + + /* + * The faster of the two data devices. Typically an SSD. + */ + struct dm_dev *cache_dev; + + /* + * Size of the origin device in _complete_ blocks and native sectors. + */ + dm_oblock_t origin_blocks; + sector_t origin_sectors; + + /* + * Size of the cache device in blocks. + */ + dm_cblock_t cache_size; + + /* + * Invalidation fields. + */ + spinlock_t invalidation_lock; + struct list_head invalidation_requests; + + sector_t migration_threshold; + wait_queue_head_t migration_wait; + atomic_t nr_allocated_migrations; + + /* + * The number of in flight migrations that are performing + * background io. eg, promotion, writeback. + */ + atomic_t nr_io_migrations; + + struct bio_list deferred_bios; + + struct rw_semaphore quiesce_lock; + + /* + * origin_blocks entries, discarded if set. + */ + dm_dblock_t discard_nr_blocks; + unsigned long *discard_bitset; + uint32_t discard_block_size; /* a power of 2 times sectors per block */ + + /* + * Rather than reconstructing the table line for the status we just + * save it and regurgitate. + */ + unsigned int nr_ctr_args; + const char **ctr_args; + + struct dm_kcopyd_client *copier; + struct work_struct deferred_bio_worker; + struct work_struct migration_worker; + struct workqueue_struct *wq; + struct delayed_work waker; + struct dm_bio_prison_v2 *prison; + + /* + * cache_size entries, dirty if set + */ + unsigned long *dirty_bitset; + atomic_t nr_dirty; + + unsigned int policy_nr_args; + struct dm_cache_policy *policy; + + /* + * Cache features such as write-through. + */ + struct cache_features features; + + struct cache_stats stats; + + bool need_tick_bio:1; + bool sized:1; + bool invalidate:1; + bool commit_requested:1; + bool loaded_mappings:1; + bool loaded_discards:1; + + struct rw_semaphore background_work_lock; + + struct batcher committer; + struct work_struct commit_ws; + + struct dm_io_tracker tracker; + + mempool_t migration_pool; + + struct bio_set bs; +}; + +struct per_bio_data { + bool tick:1; + unsigned int req_nr:2; + struct dm_bio_prison_cell_v2 *cell; + struct dm_hook_info hook_info; + sector_t len; +}; + +struct dm_cache_migration { + struct continuation k; + struct cache *cache; + + struct policy_work *op; + struct bio *overwrite_bio; + struct dm_bio_prison_cell_v2 *cell; + + dm_cblock_t invalidate_cblock; + dm_oblock_t invalidate_oblock; +}; + +/*----------------------------------------------------------------*/ + +static bool writethrough_mode(struct cache *cache) +{ + return cache->features.io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache *cache) +{ + return cache->features.io_mode == CM_IO_WRITEBACK; +} + +static inline bool passthrough_mode(struct cache *cache) +{ + return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); +} + +/*----------------------------------------------------------------*/ + +static void wake_deferred_bio_worker(struct cache *cache) +{ + queue_work(cache->wq, &cache->deferred_bio_worker); +} + +static void wake_migration_worker(struct cache *cache) +{ + if (passthrough_mode(cache)) + return; + + queue_work(cache->wq, &cache->migration_worker); +} + +/*----------------------------------------------------------------*/ + +static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) +{ + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); +} + +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) +{ + dm_bio_prison_free_cell_v2(cache->prison, cell); +} + +static struct dm_cache_migration *alloc_migration(struct cache *cache) +{ + struct dm_cache_migration *mg; + + mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); + + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + atomic_inc(&cache->nr_allocated_migrations); + + return mg; +} + +static void free_migration(struct dm_cache_migration *mg) +{ + struct cache *cache = mg->cache; + + if (atomic_dec_and_test(&cache->nr_allocated_migrations)) + wake_up(&cache->migration_wait); + + mempool_free(mg, &cache->migration_pool); +} + +/*----------------------------------------------------------------*/ + +static inline dm_oblock_t oblock_succ(dm_oblock_t b) +{ + return to_oblock(from_oblock(b) + 1ull); +} + +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) +{ + key->virtual = 0; + key->dev = 0; + key->block_begin = from_oblock(begin); + key->block_end = from_oblock(end); +} + +/* + * We have two lock levels. Level 0, which is used to prevent WRITEs, and + * level 1 which prevents *both* READs and WRITEs. + */ +#define WRITE_LOCK_LEVEL 0 +#define READ_WRITE_LOCK_LEVEL 1 + +static unsigned int lock_level(struct bio *bio) +{ + return bio_data_dir(bio) == WRITE ? + WRITE_LOCK_LEVEL : + READ_WRITE_LOCK_LEVEL; +} + +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ + +static struct per_bio_data *get_per_bio_data(struct bio *bio) +{ + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + BUG_ON(!pb); + return pb; +} + +static struct per_bio_data *init_per_bio_data(struct bio *bio) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->cell = NULL; + pb->len = 0; + + return pb; +} + +/*----------------------------------------------------------------*/ + +static void defer_bio(struct cache *cache, struct bio *bio) +{ + spin_lock_irq(&cache->lock); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irq(&cache->lock); + + wake_deferred_bio_worker(cache); +} + +static void defer_bios(struct cache *cache, struct bio_list *bios) +{ + spin_lock_irq(&cache->lock); + bio_list_merge(&cache->deferred_bios, bios); + bio_list_init(bios); + spin_unlock_irq(&cache->lock); + + wake_deferred_bio_worker(cache); +} + +/*----------------------------------------------------------------*/ + +static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) +{ + bool r; + struct per_bio_data *pb; + struct dm_cell_key_v2 key; + dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; + + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ + + build_key(oblock, end, &key); + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); + if (!r) { + /* + * Failed to get the lock. + */ + free_prison_cell(cache, cell_prealloc); + return r; + } + + if (cell != cell_prealloc) + free_prison_cell(cache, cell_prealloc); + + pb = get_per_bio_data(bio); + pb->cell = cell; + + return r; +} + +/*----------------------------------------------------------------*/ + +static bool is_dirty(struct cache *cache, dm_cblock_t b) +{ + return test_bit(from_cblock(b), cache->dirty_bitset); +} + +static void set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); + } +} + +/* + * These two are called when setting after migrations to force the policy + * and dirty bitset to be in sync. + */ +static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); +} + +static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { + if (atomic_dec_return(&cache->nr_dirty) == 0) + dm_table_event(cache->ti->table); + } + + policy_clear_dirty(cache->policy, cblock); +} + +/*----------------------------------------------------------------*/ + +static bool block_size_is_power_of_two(struct cache *cache) +{ + return cache->sectors_per_block_shift >= 0; +} + +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + +static dm_block_t oblocks_per_dblock(struct cache *cache) +{ + dm_block_t oblocks = cache->discard_block_size; + + if (block_size_is_power_of_two(cache)) + oblocks >>= cache->sectors_per_block_shift; + else + oblocks = block_div(oblocks, cache->sectors_per_block); + + return oblocks; +} + +static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) +{ + return to_dblock(block_div(from_oblock(oblock), + oblocks_per_dblock(cache))); +} + +static void set_discard(struct cache *cache, dm_dblock_t b) +{ + BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); + atomic_inc(&cache->stats.discard_count); + + spin_lock_irq(&cache->lock); + set_bit(from_dblock(b), cache->discard_bitset); + spin_unlock_irq(&cache->lock); +} + +static void clear_discard(struct cache *cache, dm_dblock_t b) +{ + spin_lock_irq(&cache->lock); + clear_bit(from_dblock(b), cache->discard_bitset); + spin_unlock_irq(&cache->lock); +} + +static bool is_discarded(struct cache *cache, dm_dblock_t b) +{ + int r; + spin_lock_irq(&cache->lock); + r = test_bit(from_dblock(b), cache->discard_bitset); + spin_unlock_irq(&cache->lock); + + return r; +} + +static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) +{ + int r; + spin_lock_irq(&cache->lock); + r = test_bit(from_dblock(oblock_to_dblock(cache, b)), + cache->discard_bitset); + spin_unlock_irq(&cache->lock); + + return r; +} + +/*---------------------------------------------------------------- + * Remapping + *--------------------------------------------------------------*/ +static void remap_to_origin(struct cache *cache, struct bio *bio) +{ + bio_set_dev(bio, cache->origin_dev->bdev); +} + +static void remap_to_cache(struct cache *cache, struct bio *bio, + dm_cblock_t cblock) +{ + sector_t bi_sector = bio->bi_iter.bi_sector; + sector_t block = from_cblock(cblock); + + bio_set_dev(bio, cache->cache_dev->bdev); + if (!block_size_is_power_of_two(cache)) + bio->bi_iter.bi_sector = + (block * cache->sectors_per_block) + + sector_div(bi_sector, cache->sectors_per_block); + else + bio->bi_iter.bi_sector = + (block << cache->sectors_per_block_shift) | + (bi_sector & (cache->sectors_per_block - 1)); +} + +static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) +{ + struct per_bio_data *pb; + + spin_lock_irq(&cache->lock); + if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && + bio_op(bio) != REQ_OP_DISCARD) { + pb = get_per_bio_data(bio); + pb->tick = true; + cache->need_tick_bio = false; + } + spin_unlock_irq(&cache->lock); +} + +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock) +{ + // FIXME: check_if_tick_bio_needed() is called way too much through this interface + check_if_tick_bio_needed(cache, bio); + remap_to_origin(cache, bio); + if (bio_data_dir(bio) == WRITE) + clear_discard(cache, oblock_to_dblock(cache, oblock)); +} + +static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + check_if_tick_bio_needed(cache, bio); + remap_to_cache(cache, bio, cblock); + if (bio_data_dir(bio) == WRITE) { + set_dirty(cache, cblock); + clear_discard(cache, oblock_to_dblock(cache, oblock)); + } +} + +static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) +{ + sector_t block_nr = bio->bi_iter.bi_sector; + + if (!block_size_is_power_of_two(cache)) + (void) sector_div(block_nr, cache->sectors_per_block); + else + block_nr >>= cache->sectors_per_block_shift; + + return to_oblock(block_nr); +} + +static bool accountable_bio(struct cache *cache, struct bio *bio) +{ + return bio_op(bio) != REQ_OP_DISCARD; +} + +static void accounted_begin(struct cache *cache, struct bio *bio) +{ + struct per_bio_data *pb; + + if (accountable_bio(cache, bio)) { + pb = get_per_bio_data(bio); + pb->len = bio_sectors(bio); + dm_iot_io_begin(&cache->tracker, pb->len); + } +} + +static void accounted_complete(struct cache *cache, struct bio *bio) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + dm_iot_io_end(&cache->tracker, pb->len); +} + +static void accounted_request(struct cache *cache, struct bio *bio) +{ + accounted_begin(cache, bio); + dm_submit_bio_remap(bio, NULL); +} + +static void issue_op(struct bio *bio, void *context) +{ + struct cache *cache = context; + accounted_request(cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. Clone the bio and send them in parallel. + */ +static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, + GFP_NOIO, &cache->bs); + + BUG_ON(!origin_bio); + + bio_chain(origin_bio, bio); + + if (bio_data_dir(origin_bio) == WRITE) + clear_discard(cache, oblock_to_dblock(cache, oblock)); + submit_bio(origin_bio); + + remap_to_cache(cache, bio, cblock); +} + +/*---------------------------------------------------------------- + * Failure modes + *--------------------------------------------------------------*/ +static enum cache_metadata_mode get_cache_mode(struct cache *cache) +{ + return cache->features.mode; +} + +static const char *cache_device_name(struct cache *cache) +{ + return dm_table_device_name(cache->ti->table); +} + +static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) +{ + const char *descs[] = { + "write", + "read-only", + "fail" + }; + + dm_table_event(cache->ti->table); + DMINFO("%s: switching cache to %s mode", + cache_device_name(cache), descs[(int)mode]); +} + +static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) +{ + bool needs_check; + enum cache_metadata_mode old_mode = get_cache_mode(cache); + + if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { + DMERR("%s: unable to read needs_check flag, setting failure mode.", + cache_device_name(cache)); + new_mode = CM_FAIL; + } + + if (new_mode == CM_WRITE && needs_check) { + DMERR("%s: unable to switch cache to write mode until repaired.", + cache_device_name(cache)); + if (old_mode != new_mode) + new_mode = old_mode; + else + new_mode = CM_READ_ONLY; + } + + /* Never move out of fail mode */ + if (old_mode == CM_FAIL) + new_mode = CM_FAIL; + + switch (new_mode) { + case CM_FAIL: + case CM_READ_ONLY: + dm_cache_metadata_set_read_only(cache->cmd); + break; + + case CM_WRITE: + dm_cache_metadata_set_read_write(cache->cmd); + break; + } + + cache->features.mode = new_mode; + + if (new_mode != old_mode) + notify_mode_switch(cache, new_mode); +} + +static void abort_transaction(struct cache *cache) +{ + const char *dev_name = cache_device_name(cache); + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); + if (dm_cache_metadata_abort(cache->cmd)) { + DMERR("%s: failed to abort metadata transaction", dev_name); + set_cache_mode(cache, CM_FAIL); + } + + if (dm_cache_metadata_set_needs_check(cache->cmd)) { + DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); + set_cache_mode(cache, CM_FAIL); + } +} + +static void metadata_operation_failed(struct cache *cache, const char *op, int r) +{ + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", + cache_device_name(cache), op, r); + abort_transaction(cache); + set_cache_mode(cache, CM_READ_ONLY); +} + +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +static void update_stats(struct cache_stats *stats, enum policy_operation op) +{ + switch (op) { + case POLICY_PROMOTE: + atomic_inc(&stats->promotion); + break; + + case POLICY_DEMOTE: + atomic_inc(&stats->demotion); + break; + + case POLICY_WRITEBACK: + atomic_inc(&stats->writeback); + break; + } +} + +/*---------------------------------------------------------------- + * Migration processing + * + * Migration covers moving data from the origin device to the cache, or + * vice versa. + *--------------------------------------------------------------*/ + +static void inc_io_migrations(struct cache *cache) +{ + atomic_inc(&cache->nr_io_migrations); +} + +static void dec_io_migrations(struct cache *cache) +{ + atomic_dec(&cache->nr_io_migrations); +} + +static bool discard_or_flush(struct bio *bio) +{ + return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); +} + +static void calc_discard_block_range(struct cache *cache, struct bio *bio, + dm_dblock_t *b, dm_dblock_t *e) +{ + sector_t sb = bio->bi_iter.bi_sector; + sector_t se = bio_end_sector(bio); + + *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); + + if (se - sb < cache->discard_block_size) + *e = *b; + else + *e = to_dblock(block_div(se, cache->discard_block_size)); +} + +/*----------------------------------------------------------------*/ + +static void prevent_background_work(struct cache *cache) +{ + lockdep_off(); + down_write(&cache->background_work_lock); + lockdep_on(); +} + +static void allow_background_work(struct cache *cache) +{ + lockdep_off(); + up_write(&cache->background_work_lock); + lockdep_on(); +} + +static bool background_work_begin(struct cache *cache) +{ + bool r; + + lockdep_off(); + r = down_read_trylock(&cache->background_work_lock); + lockdep_on(); + + return r; +} + +static void background_work_end(struct cache *cache) +{ + lockdep_off(); + up_read(&cache->background_work_lock); + lockdep_on(); +} + +/*----------------------------------------------------------------*/ + +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} + +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) +{ + return writeback_mode(cache) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); +} + +static void quiesce(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + init_continuation(&mg->k, continuation); + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); +} + +static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) +{ + struct continuation *k = container_of(ws, struct continuation, ws); + return container_of(k, struct dm_cache_migration, k); +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); + + if (read_err || write_err) + mg->k.input = BLK_STS_IOERR; + + queue_continuation(mg->cache->wq, &mg->k); +} + +static void copy(struct dm_cache_migration *mg, bool promote) +{ + struct dm_io_region o_region, c_region; + struct cache *cache = mg->cache; + + o_region.bdev = cache->origin_dev->bdev; + o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; + o_region.count = cache->sectors_per_block; + + c_region.bdev = cache->cache_dev->bdev; + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; + c_region.count = cache->sectors_per_block; + + if (promote) + dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); + else + dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); +} + +static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) + free_prison_cell(cache, pb->cell); + pb->cell = NULL; +} + +static void overwrite_endio(struct bio *bio) +{ + struct dm_cache_migration *mg = bio->bi_private; + struct cache *cache = mg->cache; + struct per_bio_data *pb = get_per_bio_data(bio); + + dm_unhook_bio(&pb->hook_info, bio); + + if (bio->bi_status) + mg->k.input = bio->bi_status; + + queue_continuation(cache->wq, &mg->k); +} + +static void overwrite(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + struct bio *bio = mg->overwrite_bio; + struct per_bio_data *pb = get_per_bio_data(bio); + + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + + /* + * The overwrite bio is part of the copy operation, as such it does + * not set/clear discard or dirty flags. + */ + if (mg->op->op == POLICY_PROMOTE) + remap_to_cache(mg->cache, bio, mg->op->cblock); + else + remap_to_origin(mg->cache, bio); + + init_continuation(&mg->k, continuation); + accounted_request(mg->cache, bio); +} + +/* + * Migration steps: + * + * 1) exclusive lock preventing WRITEs + * 2) quiesce + * 3) copy or issue overwrite bio + * 4) upgrade to exclusive lock preventing READs and WRITEs + * 5) quiesce + * 6) update metadata and commit + * 7) unlock + */ +static void mg_complete(struct dm_cache_migration *mg, bool success) +{ + struct bio_list bios; + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + dm_cblock_t cblock = op->cblock; + + if (success) + update_stats(&cache->stats, op->op); + + switch (op->op) { + case POLICY_PROMOTE: + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); + policy_complete_background_work(cache->policy, op, success); + + if (mg->overwrite_bio) { + if (success) + force_set_dirty(cache, cblock); + else if (mg->k.input) + mg->overwrite_bio->bi_status = mg->k.input; + else + mg->overwrite_bio->bi_status = BLK_STS_IOERR; + bio_endio(mg->overwrite_bio); + } else { + if (success) + force_clear_dirty(cache, cblock); + dec_io_migrations(cache); + } + break; + + case POLICY_DEMOTE: + /* + * We clear dirty here to update the nr_dirty counter. + */ + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + + case POLICY_WRITEBACK: + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + } + + bio_list_init(&bios); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } + + free_migration(mg); + defer_bios(cache, &bios); + wake_migration_worker(cache); + + background_work_end(cache); +} + +static void mg_success(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + mg_complete(mg, mg->k.input == 0); +} + +static void mg_update_metadata(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + + switch (op->op) { + case POLICY_PROMOTE: + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); + + mg_complete(mg, false); + return; + } + mg_complete(mg, true); + break; + + case POLICY_DEMOTE: + r = dm_cache_remove_mapping(cache->cmd, op->cblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + + mg_complete(mg, false); + return; + } + + /* + * It would be nice if we only had to commit when a REQ_FLUSH + * comes through. But there's one scenario that we have to + * look out for: + * + * - vblock x in a cache block + * - domotion occurs + * - cache block gets reallocated and over written + * - crash + * + * When we recover, because there was no commit the cache will + * rollback to having the data for vblock x in the cache block. + * But the cache block has since been overwritten, so it'll end + * up pointing to data that was never in 'x' during the history + * of the device. + * + * To avoid this issue we require a commit as part of the + * demotion operation. + */ + init_continuation(&mg->k, mg_success); + continue_after_commit(&cache->committer, &mg->k); + schedule_commit(&cache->committer); + break; + + case POLICY_WRITEBACK: + mg_complete(mg, true); + break; + } +} + +static void mg_update_metadata_after_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + else + mg_update_metadata(ws); +} + +static void mg_upgrade_lock(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + + else { + /* + * Now we want the lock to prevent both reads and writes. + */ + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, + READ_WRITE_LOCK_LEVEL); + if (r < 0) + mg_complete(mg, false); + + else if (r) + quiesce(mg, mg_update_metadata); + + else + mg_update_metadata(ws); + } +} + +static void mg_full_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); + + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } + + init_continuation(&mg->k, mg_upgrade_lock); + copy(mg, is_policy_promote); +} + +static void mg_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + + if (mg->overwrite_bio) { + /* + * No exclusive lock was held when we last checked if the bio + * was optimisable. So we have to check again in case things + * have changed (eg, the block may no longer be discarded). + */ + if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { + /* + * Fallback to a real full copy after doing some tidying up. + */ + bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); + BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ + mg->overwrite_bio = NULL; + inc_io_migrations(mg->cache); + mg_full_copy(ws); + return; + } + + /* + * It's safe to do this here, even though it's new data + * because all IO has been locked out of the block. + * + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL + * so _not_ using mg_upgrade_lock() as continutation. + */ + overwrite(mg, mg_update_metadata_after_copy); + + } else + mg_full_copy(ws); +} + +static int mg_lock_writes(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + + /* + * Prevent writes to the block, but allow reads to continue. + * Unless we're using an overwrite bio, in which case we lock + * everything. + */ + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, + prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + mg_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r == 0) + mg_copy(&mg->k.ws); + else + quiesce(mg, mg_copy); + + return 0; +} + +static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) { + policy_complete_background_work(cache->policy, op, false); + return -EPERM; + } + + mg = alloc_migration(cache); + + mg->op = op; + mg->overwrite_bio = bio; + + if (!bio) + inc_io_migrations(cache); + + return mg_lock_writes(mg); +} + +/*---------------------------------------------------------------- + * invalidation processing + *--------------------------------------------------------------*/ + +static void invalidate_complete(struct dm_cache_migration *mg, bool success) +{ + struct bio_list bios; + struct cache *cache = mg->cache; + + bio_list_init(&bios); + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + + if (!success && mg->overwrite_bio) + bio_io_error(mg->overwrite_bio); + + free_migration(mg); + defer_bios(cache, &bios); + + background_work_end(cache); +} + +static void invalidate_completed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + invalidate_complete(mg, !mg->k.input); +} + +static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) +{ + int r = policy_invalidate_mapping(cache->policy, cblock); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, cblock); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + } + + } else if (r == -ENODATA) { + /* + * Harmless, already unmapped. + */ + r = 0; + + } else + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); + + return r; +} + +static void invalidate_remove(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + + r = invalidate_cblock(cache, mg->invalidate_cblock); + if (r) { + invalidate_complete(mg, false); + return; + } + + init_continuation(&mg->k, invalidate_completed); + continue_after_commit(&cache->committer, &mg->k); + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); + mg->overwrite_bio = NULL; + schedule_commit(&cache->committer); +} + +static int invalidate_lock(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + invalidate_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r) + quiesce(mg, invalidate_remove); + + else { + /* + * We can't call invalidate_remove() directly here because we + * might still be in request context. + */ + init_continuation(&mg->k, invalidate_remove); + queue_work(cache->wq, &mg->k.ws); + } + + return 0; +} + +static int invalidate_start(struct cache *cache, dm_cblock_t cblock, + dm_oblock_t oblock, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) + return -EPERM; + + mg = alloc_migration(cache); + + mg->overwrite_bio = bio; + mg->invalidate_cblock = cblock; + mg->invalidate_oblock = oblock; + + return invalidate_lock(mg); +} + +/*---------------------------------------------------------------- + * bio processing + *--------------------------------------------------------------*/ + +enum busy { + IDLE, + BUSY +}; + +static enum busy spare_migration_bandwidth(struct cache *cache) +{ + bool idle = dm_iot_idle_for(&cache->tracker, HZ); + sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * + cache->sectors_per_block; + + if (idle && current_volume <= cache->migration_threshold) + return IDLE; + else + return BUSY; +} + +static void inc_hit_counter(struct cache *cache, struct bio *bio) +{ + atomic_inc(bio_data_dir(bio) == READ ? + &cache->stats.read_hit : &cache->stats.write_hit); +} + +static void inc_miss_counter(struct cache *cache, struct bio *bio) +{ + atomic_inc(bio_data_dir(bio) == READ ? + &cache->stats.read_miss : &cache->stats.write_miss); +} + +/*----------------------------------------------------------------*/ + +static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, + bool *commit_needed) +{ + int r, data_dir; + bool rb, background_queued; + dm_cblock_t cblock; + + *commit_needed = false; + + rb = bio_detain_shared(cache, block, bio); + if (!rb) { + /* + * An exclusive lock is held for this block, so we have to + * wait. We set the commit_needed flag so the current + * transaction will be committed asap, allowing this lock + * to be dropped. + */ + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } + + data_dir = bio_data_dir(bio); + + if (optimisable_bio(cache, bio, block)) { + struct policy_work *op = NULL; + + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + if (r == -ENOENT && op) { + bio_drop_shared_lock(cache, bio); + BUG_ON(op->op != POLICY_PROMOTE); + mg_start(cache, op, bio); + return DM_MAPIO_SUBMITTED; + } + } else { + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + if (background_queued) + wake_migration_worker(cache); + } + + if (r == -ENOENT) { + struct per_bio_data *pb = get_per_bio_data(bio); + + /* + * Miss. + */ + inc_miss_counter(cache, bio); + if (pb->req_nr == 0) { + accounted_begin(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + } else { + /* + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. + */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + } else { + /* + * Hit. + */ + inc_hit_counter(cache, bio); + + /* + * Passthrough always maps to the origin, invalidating any + * cache blocks that are written to. + */ + if (passthrough_mode(cache)) { + if (bio_data_dir(bio) == WRITE) { + bio_drop_shared_lock(cache, bio); + atomic_inc(&cache->stats.demotion); + invalidate_start(cache, cblock, block, bio); + } else + remap_to_origin_clear_discard(cache, bio, block); + } else { + if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && + !is_dirty(cache, cblock)) { + remap_to_origin_and_cache(cache, bio, block, cblock); + accounted_begin(cache, bio); + } else + remap_to_cache_dirty(cache, bio, block, cblock); + } + } + + /* + * dm core turns FUA requests into a separate payload and FLUSH req. + */ + if (bio->bi_opf & REQ_FUA) { + /* + * issue_after_commit will call accounted_begin a second time. So + * we call accounted_complete() to avoid double accounting. + */ + accounted_complete(cache, bio); + issue_after_commit(&cache->committer, bio); + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } + + return DM_MAPIO_REMAPPED; +} + +static bool process_bio(struct cache *cache, struct bio *bio) +{ + bool commit_needed; + + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) + dm_submit_bio_remap(bio, NULL); + + return commit_needed; +} + +/* + * A non-zero return indicates read_only or fail_io mode. + */ +static int commit(struct cache *cache, bool clean_shutdown) +{ + int r; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return -EINVAL; + + atomic_inc(&cache->stats.commit_count); + r = dm_cache_commit(cache->cmd, clean_shutdown); + if (r) + metadata_operation_failed(cache, "dm_cache_commit", r); + + return r; +} + +/* + * Used by the batcher. + */ +static blk_status_t commit_op(void *context) +{ + struct cache *cache = context; + + if (dm_cache_changed_this_transaction(cache->cmd)) + return errno_to_blk_status(commit(cache, false)); + + return 0; +} + +/*----------------------------------------------------------------*/ + +static bool process_flush_bio(struct cache *cache, struct bio *bio) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); + + issue_after_commit(&cache->committer, bio); + return true; +} + +static bool process_discard_bio(struct cache *cache, struct bio *bio) +{ + dm_dblock_t b, e; + + // FIXME: do we need to lock the region? Or can we just assume the + // user wont be so foolish as to issue discard concurrently with + // other IO? + calc_discard_block_range(cache, bio, &b, &e); + while (b != e) { + set_discard(cache, b); + b = to_dblock(from_dblock(b) + 1); + } + + if (cache->features.discard_passdown) { + remap_to_origin(cache, bio); + dm_submit_bio_remap(bio, NULL); + } else + bio_endio(bio); + + return false; +} + +static void process_deferred_bios(struct work_struct *ws) +{ + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); + + bool commit_needed = false; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irq(&cache->lock); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); + spin_unlock_irq(&cache->lock); + + while ((bio = bio_list_pop(&bios))) { + if (bio->bi_opf & REQ_PREFLUSH) + commit_needed = process_flush_bio(cache, bio) || commit_needed; + + else if (bio_op(bio) == REQ_OP_DISCARD) + commit_needed = process_discard_bio(cache, bio) || commit_needed; + + else + commit_needed = process_bio(cache, bio) || commit_needed; + cond_resched(); + } + + if (commit_needed) + schedule_commit(&cache->committer); +} + +/*---------------------------------------------------------------- + * Main worker loop + *--------------------------------------------------------------*/ + +static void requeue_deferred_bios(struct cache *cache) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); + + while ((bio = bio_list_pop(&bios))) { + bio->bi_status = BLK_STS_DM_REQUEUE; + bio_endio(bio); + cond_resched(); + } +} + +/* + * We want to commit periodically so that not too much + * unwritten metadata builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + + policy_tick(cache->policy, true); + wake_migration_worker(cache); + schedule_commit(&cache->committer); + queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); +} + +static void check_migrations(struct work_struct *ws) +{ + int r; + struct policy_work *op; + struct cache *cache = container_of(ws, struct cache, migration_worker); + enum busy b; + + for (;;) { + b = spare_migration_bandwidth(cache); + + r = policy_get_background_work(cache->policy, b == IDLE, &op); + if (r == -ENODATA) + break; + + if (r) { + DMERR_LIMIT("%s: policy_background_work failed", + cache_device_name(cache)); + break; + } + + r = mg_start(cache, op, NULL); + if (r) + break; + + cond_resched(); + } +} + +/*---------------------------------------------------------------- + * Target methods + *--------------------------------------------------------------*/ + +/* + * This function gets called on the error paths of the constructor, so we + * have to cope with a partially initialised struct. + */ +static void destroy(struct cache *cache) +{ + unsigned int i; + + mempool_exit(&cache->migration_pool); + + if (cache->prison) + dm_bio_prison_destroy_v2(cache->prison); + + cancel_delayed_work_sync(&cache->waker); + if (cache->wq) + destroy_workqueue(cache->wq); + + if (cache->dirty_bitset) + free_bitset(cache->dirty_bitset); + + if (cache->discard_bitset) + free_bitset(cache->discard_bitset); + + if (cache->copier) + dm_kcopyd_client_destroy(cache->copier); + + if (cache->cmd) + dm_cache_metadata_close(cache->cmd); + + if (cache->metadata_dev) + dm_put_device(cache->ti, cache->metadata_dev); + + if (cache->origin_dev) + dm_put_device(cache->ti, cache->origin_dev); + + if (cache->cache_dev) + dm_put_device(cache->ti, cache->cache_dev); + + if (cache->policy) + dm_cache_policy_destroy(cache->policy); + + for (i = 0; i < cache->nr_ctr_args ; i++) + kfree(cache->ctr_args[i]); + kfree(cache->ctr_args); + + bioset_exit(&cache->bs); + + kfree(cache); +} + +static void cache_dtr(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + destroy(cache); +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return bdev_nr_sectors(dev->bdev); +} + +/*----------------------------------------------------------------*/ + +/* + * Construct a cache device mapping. + * + * cache + * <#feature args> []* + * <#policy args> []* + * + * metadata dev : fast device holding the persistent metadata + * cache dev : fast device holding cached data blocks + * origin dev : slow device holding original data blocks + * block size : cache unit size in sectors + * + * #feature args : number of feature arguments passed + * feature args : writethrough. (The default is writeback.) + * + * policy : the replacement policy to use + * #policy args : an even number of policy arguments corresponding + * to key/value pairs passed to the policy + * policy args : key/value pairs passed to the policy + * E.g. 'sequential_threshold 1024' + * See cache-policies.txt for details. + * + * Optional feature arguments are: + * writethrough : write through caching that prohibits cache block + * content from being different from origin block content. + * Without this argument, the default behaviour is to write + * back cache block contents later for performance reasons, + * so they may differ from the corresponding origin blocks. + */ +struct cache_args { + struct dm_target *ti; + + struct dm_dev *metadata_dev; + + struct dm_dev *cache_dev; + sector_t cache_sectors; + + struct dm_dev *origin_dev; + sector_t origin_sectors; + + uint32_t block_size; + + const char *policy_name; + int policy_argc; + const char **policy_argv; + + struct cache_features features; +}; + +static void destroy_cache_args(struct cache_args *ca) +{ + if (ca->metadata_dev) + dm_put_device(ca->ti, ca->metadata_dev); + + if (ca->cache_dev) + dm_put_device(ca->ti, ca->cache_dev); + + if (ca->origin_dev) + dm_put_device(ca->ti, ca->origin_dev); + + kfree(ca); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + sector_t metadata_dev_size; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->metadata_dev); + if (r) { + *error = "Error opening metadata device"; + return r; + } + + metadata_dev_size = get_dev_size(ca->metadata_dev); + if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", + ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); + + return 0; +} + +static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->cache_dev); + if (r) { + *error = "Error opening cache device"; + return r; + } + ca->cache_sectors = get_dev_size(ca->cache_dev); + + return 0; +} + +static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->origin_dev); + if (r) { + *error = "Error opening origin device"; + return r; + } + + ca->origin_sectors = get_dev_size(ca->origin_dev); + if (ca->ti->len > ca->origin_sectors) { + *error = "Device size larger than cached device"; + return -EINVAL; + } + + return 0; +} + +static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + unsigned long block_size; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || + block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || + block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || + block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { + *error = "Invalid data block size"; + return -EINVAL; + } + + if (block_size > ca->cache_sectors) { + *error = "Data block size is larger than the cache device"; + return -EINVAL; + } + + ca->block_size = block_size; + + return 0; +} + +static void init_features(struct cache_features *cf) +{ + cf->mode = CM_WRITE; + cf->io_mode = CM_IO_WRITEBACK; + cf->metadata_version = 1; + cf->discard_passdown = true; +} + +static int parse_features(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + static const struct dm_arg _args[] = { + {0, 3, "Invalid number of cache feature arguments"}, + }; + + int r, mode_ctr = 0; + unsigned int argc; + const char *arg; + struct cache_features *cf = &ca->features; + + init_features(cf); + + r = dm_read_arg_group(_args, as, &argc, error); + if (r) + return -EINVAL; + + while (argc--) { + arg = dm_shift_arg(as); + + if (!strcasecmp(arg, "writeback")) { + cf->io_mode = CM_IO_WRITEBACK; + mode_ctr++; + } + + else if (!strcasecmp(arg, "writethrough")) { + cf->io_mode = CM_IO_WRITETHROUGH; + mode_ctr++; + } + + else if (!strcasecmp(arg, "passthrough")) { + cf->io_mode = CM_IO_PASSTHROUGH; + mode_ctr++; + } + + else if (!strcasecmp(arg, "metadata2")) + cf->metadata_version = 2; + + else if (!strcasecmp(arg, "no_discard_passdown")) + cf->discard_passdown = false; + + else { + *error = "Unrecognised cache feature requested"; + return -EINVAL; + } + } + + if (mode_ctr > 1) { + *error = "Duplicate cache io_mode features requested"; + return -EINVAL; + } + + return 0; +} + +static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + static const struct dm_arg _args[] = { + {0, 1024, "Invalid number of policy arguments"}, + }; + + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ca->policy_name = dm_shift_arg(as); + + r = dm_read_arg_group(_args, as, &ca->policy_argc, error); + if (r) + return -EINVAL; + + ca->policy_argv = (const char **)as->argv; + dm_consume_args(as, ca->policy_argc); + + return 0; +} + +static int parse_cache_args(struct cache_args *ca, int argc, char **argv, + char **error) +{ + int r; + struct dm_arg_set as; + + as.argc = argc; + as.argv = argv; + + r = parse_metadata_dev(ca, &as, error); + if (r) + return r; + + r = parse_cache_dev(ca, &as, error); + if (r) + return r; + + r = parse_origin_dev(ca, &as, error); + if (r) + return r; + + r = parse_block_size(ca, &as, error); + if (r) + return r; + + r = parse_features(ca, &as, error); + if (r) + return r; + + r = parse_policy(ca, &as, error); + if (r) + return r; + + return 0; +} + +/*----------------------------------------------------------------*/ + +static struct kmem_cache *migration_cache; + +#define NOT_CORE_OPTION 1 + +static int process_config_option(struct cache *cache, const char *key, const char *value) +{ + unsigned long tmp; + + if (!strcasecmp(key, "migration_threshold")) { + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + cache->migration_threshold = tmp; + return 0; + } + + return NOT_CORE_OPTION; +} + +static int set_config_value(struct cache *cache, const char *key, const char *value) +{ + int r = process_config_option(cache, key, value); + + if (r == NOT_CORE_OPTION) + r = policy_set_config_value(cache->policy, key, value); + + if (r) + DMWARN("bad config value for %s: %s", key, value); + + return r; +} + +static int set_config_values(struct cache *cache, int argc, const char **argv) +{ + int r = 0; + + if (argc & 1) { + DMWARN("Odd number of policy arguments given but they should be pairs."); + return -EINVAL; + } + + while (argc) { + r = set_config_value(cache, argv[0], argv[1]); + if (r) + break; + + argc -= 2; + argv += 2; + } + + return r; +} + +static int create_cache_policy(struct cache *cache, struct cache_args *ca, + char **error) +{ + struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, + cache->cache_size, + cache->origin_sectors, + cache->sectors_per_block); + if (IS_ERR(p)) { + *error = "Error creating cache's policy"; + return PTR_ERR(p); + } + cache->policy = p; + BUG_ON(!cache->policy); + + return 0; +} + +/* + * We want the discard block size to be at least the size of the cache + * block size and have no more than 2^14 discard blocks across the origin. + */ +#define MAX_DISCARD_BLOCKS (1 << 14) + +static bool too_many_discard_blocks(sector_t discard_block_size, + sector_t origin_size) +{ + (void) sector_div(origin_size, discard_block_size); + + return origin_size > MAX_DISCARD_BLOCKS; +} + +static sector_t calculate_discard_block_size(sector_t cache_block_size, + sector_t origin_size) +{ + sector_t discard_block_size = cache_block_size; + + if (origin_size) + while (too_many_discard_blocks(discard_block_size, origin_size)) + discard_block_size *= 2; + + return discard_block_size; +} + +static void set_cache_size(struct cache *cache, dm_cblock_t size) +{ + dm_block_t nr_blocks = from_cblock(size); + + if (nr_blocks > (1 << 20) && cache->cache_size != size) + DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" + "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" + "Please consider increasing the cache block size to reduce the overall cache block count.", + (unsigned long long) nr_blocks); + + cache->cache_size = size; +} + +#define DEFAULT_MIGRATION_THRESHOLD 2048 + +static int cache_create(struct cache_args *ca, struct cache **result) +{ + int r = 0; + char **error = &ca->ti->error; + struct cache *cache; + struct dm_target *ti = ca->ti; + dm_block_t origin_blocks; + struct dm_cache_metadata *cmd; + bool may_format = ca->features.mode == CM_WRITE; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (!cache) + return -ENOMEM; + + cache->ti = ca->ti; + ti->private = cache; + ti->accounts_remapped_io = true; + ti->num_flush_bios = 2; + ti->flush_supported = true; + + ti->num_discard_bios = 1; + ti->discards_supported = true; + + ti->per_io_data_size = sizeof(struct per_bio_data); + + cache->features = ca->features; + if (writethrough_mode(cache)) { + /* Create bioset for writethrough bios issued to origin */ + r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); + if (r) + goto bad; + } + + cache->metadata_dev = ca->metadata_dev; + cache->origin_dev = ca->origin_dev; + cache->cache_dev = ca->cache_dev; + + ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; + + origin_blocks = cache->origin_sectors = ca->origin_sectors; + origin_blocks = block_div(origin_blocks, ca->block_size); + cache->origin_blocks = to_oblock(origin_blocks); + + cache->sectors_per_block = ca->block_size; + if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { + r = -EINVAL; + goto bad; + } + + if (ca->block_size & (ca->block_size - 1)) { + dm_block_t cache_size = ca->cache_sectors; + + cache->sectors_per_block_shift = -1; + cache_size = block_div(cache_size, ca->block_size); + set_cache_size(cache, to_cblock(cache_size)); + } else { + cache->sectors_per_block_shift = __ffs(ca->block_size); + set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); + } + + r = create_cache_policy(cache, ca, error); + if (r) + goto bad; + + cache->policy_nr_args = ca->policy_argc; + cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; + + r = set_config_values(cache, ca->policy_argc, ca->policy_argv); + if (r) { + *error = "Error setting cache policy's config values"; + goto bad; + } + + cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, + ca->block_size, may_format, + dm_cache_policy_get_hint_size(cache->policy), + ca->features.metadata_version); + if (IS_ERR(cmd)) { + *error = "Error creating metadata object"; + r = PTR_ERR(cmd); + goto bad; + } + cache->cmd = cmd; + set_cache_mode(cache, CM_WRITE); + if (get_cache_mode(cache) != CM_WRITE) { + *error = "Unable to get write access to metadata, please check/repair metadata."; + r = -EINVAL; + goto bad; + } + + if (passthrough_mode(cache)) { + bool all_clean; + + r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); + if (r) { + *error = "dm_cache_metadata_all_clean() failed"; + goto bad; + } + + if (!all_clean) { + *error = "Cannot enter passthrough mode unless all blocks are clean"; + r = -EINVAL; + goto bad; + } + + policy_allow_migrations(cache->policy, false); + } + + spin_lock_init(&cache->lock); + bio_list_init(&cache->deferred_bios); + atomic_set(&cache->nr_allocated_migrations, 0); + atomic_set(&cache->nr_io_migrations, 0); + init_waitqueue_head(&cache->migration_wait); + + r = -ENOMEM; + atomic_set(&cache->nr_dirty, 0); + cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->dirty_bitset) { + *error = "could not allocate dirty bitset"; + goto bad; + } + clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); + + cache->discard_block_size = + calculate_discard_block_size(cache->sectors_per_block, + cache->origin_sectors); + cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, + cache->discard_block_size)); + cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); + if (!cache->discard_bitset) { + *error = "could not allocate discard bitset"; + goto bad; + } + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(cache->copier)) { + *error = "could not create kcopyd client"; + r = PTR_ERR(cache->copier); + goto bad; + } + + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); + if (!cache->wq) { + *error = "could not create workqueue for metadata object"; + goto bad; + } + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); + INIT_WORK(&cache->migration_worker, check_migrations); + INIT_DELAYED_WORK(&cache->waker, do_waker); + + cache->prison = dm_bio_prison_create_v2(cache->wq); + if (!cache->prison) { + *error = "could not create bio prison"; + goto bad; + } + + r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, + migration_cache); + if (r) { + *error = "Error creating cache's migration mempool"; + goto bad; + } + + cache->need_tick_bio = true; + cache->sized = false; + cache->invalidate = false; + cache->commit_requested = false; + cache->loaded_mappings = false; + cache->loaded_discards = false; + + load_stats(cache); + + atomic_set(&cache->stats.demotion, 0); + atomic_set(&cache->stats.promotion, 0); + atomic_set(&cache->stats.copies_avoided, 0); + atomic_set(&cache->stats.cache_cell_clash, 0); + atomic_set(&cache->stats.commit_count, 0); + atomic_set(&cache->stats.discard_count, 0); + + spin_lock_init(&cache->invalidation_lock); + INIT_LIST_HEAD(&cache->invalidation_requests); + + batcher_init(&cache->committer, commit_op, cache, + issue_op, cache, cache->wq); + dm_iot_init(&cache->tracker); + + init_rwsem(&cache->background_work_lock); + prevent_background_work(cache); + + *result = cache; + return 0; +bad: + destroy(cache); + return r; +} + +static int copy_ctr_args(struct cache *cache, int argc, const char **argv) +{ + unsigned int i; + const char **copy; + + copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); + if (!copy) + return -ENOMEM; + for (i = 0; i < argc; i++) { + copy[i] = kstrdup(argv[i], GFP_KERNEL); + if (!copy[i]) { + while (i--) + kfree(copy[i]); + kfree(copy); + return -ENOMEM; + } + } + + cache->nr_ctr_args = argc; + cache->ctr_args = copy; + + return 0; +} + +static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r = -EINVAL; + struct cache_args *ca; + struct cache *cache = NULL; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) { + ti->error = "Error allocating memory for cache"; + return -ENOMEM; + } + ca->ti = ti; + + r = parse_cache_args(ca, argc, argv, &ti->error); + if (r) + goto out; + + r = cache_create(ca, &cache); + if (r) + goto out; + + r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); + if (r) { + destroy(cache); + goto out; + } + + ti->private = cache; +out: + destroy_cache_args(ca); + return r; +} + +/*----------------------------------------------------------------*/ + +static int cache_map(struct dm_target *ti, struct bio *bio) +{ + struct cache *cache = ti->private; + + int r; + bool commit_needed; + dm_oblock_t block = get_bio_block(cache, bio); + + init_per_bio_data(bio); + if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { + /* + * This can only occur if the io goes to a partial block at + * the end of the origin device. We don't cache these. + * Just remap to the origin and carry on. + */ + remap_to_origin(cache, bio); + accounted_begin(cache, bio); + return DM_MAPIO_REMAPPED; + } + + if (discard_or_flush(bio)) { + defer_bio(cache, bio); + return DM_MAPIO_SUBMITTED; + } + + r = map_bio(cache, bio, block, &commit_needed); + if (commit_needed) + schedule_commit(&cache->committer); + + return r; +} + +static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) +{ + struct cache *cache = ti->private; + unsigned long flags; + struct per_bio_data *pb = get_per_bio_data(bio); + + if (pb->tick) { + policy_tick(cache->policy, false); + + spin_lock_irqsave(&cache->lock, flags); + cache->need_tick_bio = true; + spin_unlock_irqrestore(&cache->lock, flags); + } + + bio_drop_shared_lock(cache, bio); + accounted_complete(cache, bio); + + return DM_ENDIO_DONE; +} + +static int write_dirty_bitset(struct cache *cache) +{ + int r; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return -EINVAL; + + r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); + if (r) + metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); + + return r; +} + +static int write_discard_bitset(struct cache *cache) +{ + unsigned int i, r; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return -EINVAL; + + r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, + cache->discard_nr_blocks); + if (r) { + DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); + return r; + } + + for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { + r = dm_cache_set_discard(cache->cmd, to_dblock(i), + is_discarded(cache, to_dblock(i))); + if (r) { + metadata_operation_failed(cache, "dm_cache_set_discard", r); + return r; + } + } + + return 0; +} + +static int write_hints(struct cache *cache) +{ + int r; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return -EINVAL; + + r = dm_cache_write_hints(cache->cmd, cache->policy); + if (r) { + metadata_operation_failed(cache, "dm_cache_write_hints", r); + return r; + } + + return 0; +} + +/* + * returns true on success + */ +static bool sync_metadata(struct cache *cache) +{ + int r1, r2, r3, r4; + + r1 = write_dirty_bitset(cache); + if (r1) + DMERR("%s: could not write dirty bitset", cache_device_name(cache)); + + r2 = write_discard_bitset(cache); + if (r2) + DMERR("%s: could not write discard bitset", cache_device_name(cache)); + + save_stats(cache); + + r3 = write_hints(cache); + if (r3) + DMERR("%s: could not write hints", cache_device_name(cache)); + + /* + * If writing the above metadata failed, we still commit, but don't + * set the clean shutdown flag. This will effectively force every + * dirty bit to be set on reload. + */ + r4 = commit(cache, !r1 && !r2 && !r3); + if (r4) + DMERR("%s: could not write cache metadata", cache_device_name(cache)); + + return !r1 && !r2 && !r3 && !r4; +} + +static void cache_postsuspend(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + prevent_background_work(cache); + BUG_ON(atomic_read(&cache->nr_io_migrations)); + + cancel_delayed_work_sync(&cache->waker); + drain_workqueue(cache->wq); + WARN_ON(cache->tracker.in_flight); + + /* + * If it's a flush suspend there won't be any deferred bios, so this + * call is harmless. + */ + requeue_deferred_bios(cache); + + if (get_cache_mode(cache) == CM_WRITE) + (void) sync_metadata(cache); +} + +static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (dirty) { + set_bit(from_cblock(cblock), cache->dirty_bitset); + atomic_inc(&cache->nr_dirty); + } else + clear_bit(from_cblock(cblock), cache->dirty_bitset); + + return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); +} + +/* + * The discard block size in the on disk metadata is not + * necessarily the same as we're currently using. So we have to + * be careful to only set the discarded attribute if we know it + * covers a complete block of the new size. + */ +struct discard_load_info { + struct cache *cache; + + /* + * These blocks are sized using the on disk dblock size, rather + * than the current one. + */ + dm_block_t block_size; + dm_block_t discard_begin, discard_end; +}; + +static void discard_load_info_init(struct cache *cache, + struct discard_load_info *li) +{ + li->cache = cache; + li->discard_begin = li->discard_end = 0; +} + +static void set_discard_range(struct discard_load_info *li) +{ + sector_t b, e; + + if (li->discard_begin == li->discard_end) + return; + + /* + * Convert to sectors. + */ + b = li->discard_begin * li->block_size; + e = li->discard_end * li->block_size; + + /* + * Then convert back to the current dblock size. + */ + b = dm_sector_div_up(b, li->cache->discard_block_size); + sector_div(e, li->cache->discard_block_size); + + /* + * The origin may have shrunk, so we need to check we're still in + * bounds. + */ + if (e > from_dblock(li->cache->discard_nr_blocks)) + e = from_dblock(li->cache->discard_nr_blocks); + + for (; b < e; b++) + set_discard(li->cache, to_dblock(b)); +} + +static int load_discard(void *context, sector_t discard_block_size, + dm_dblock_t dblock, bool discard) +{ + struct discard_load_info *li = context; + + li->block_size = discard_block_size; + + if (discard) { + if (from_dblock(dblock) == li->discard_end) + /* + * We're already in a discard range, just extend it. + */ + li->discard_end = li->discard_end + 1ULL; + + else { + /* + * Emit the old range and start a new one. + */ + set_discard_range(li); + li->discard_begin = from_dblock(dblock); + li->discard_end = li->discard_begin + 1ULL; + } + } else { + set_discard_range(li); + li->discard_begin = li->discard_end = 0; + } + + return 0; +} + +static dm_cblock_t get_cache_dev_size(struct cache *cache) +{ + sector_t size = get_dev_size(cache->cache_dev); + (void) sector_div(size, cache->sectors_per_block); + return to_cblock(size); +} + +static bool can_resize(struct cache *cache, dm_cblock_t new_size) +{ + if (from_cblock(new_size) > from_cblock(cache->cache_size)) { + if (cache->sized) { + DMERR("%s: unable to extend cache due to missing cache table reload", + cache_device_name(cache)); + return false; + } + } + + /* + * We can't drop a dirty block when shrinking the cache. + */ + while (from_cblock(new_size) < from_cblock(cache->cache_size)) { + new_size = to_cblock(from_cblock(new_size) + 1); + if (is_dirty(cache, new_size)) { + DMERR("%s: unable to shrink cache; cache block %llu is dirty", + cache_device_name(cache), + (unsigned long long) from_cblock(new_size)); + return false; + } + } + + return true; +} + +static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) +{ + int r; + + r = dm_cache_resize(cache->cmd, new_size); + if (r) { + DMERR("%s: could not resize cache metadata", cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_resize", r); + return r; + } + + set_cache_size(cache, new_size); + + return 0; +} + +static int cache_preresume(struct dm_target *ti) +{ + int r = 0; + struct cache *cache = ti->private; + dm_cblock_t csize = get_cache_dev_size(cache); + + /* + * Check to see if the cache has resized. + */ + if (!cache->sized) { + r = resize_cache_dev(cache, csize); + if (r) + return r; + + cache->sized = true; + + } else if (csize != cache->cache_size) { + if (!can_resize(cache, csize)) + return -EINVAL; + + r = resize_cache_dev(cache, csize); + if (r) + return r; + } + + if (!cache->loaded_mappings) { + r = dm_cache_load_mappings(cache->cmd, cache->policy, + load_mapping, cache); + if (r) { + DMERR("%s: could not load cache mappings", cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + cache->loaded_mappings = true; + } + + if (!cache->loaded_discards) { + struct discard_load_info li; + + /* + * The discard bitset could have been resized, or the + * discard block size changed. To be safe we start by + * setting every dblock to not discarded. + */ + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + + discard_load_info_init(cache, &li); + r = dm_cache_load_discards(cache->cmd, load_discard, &li); + if (r) { + DMERR("%s: could not load origin discards", cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_load_discards", r); + return r; + } + set_discard_range(&li); + + cache->loaded_discards = true; + } + + return r; +} + +static void cache_resume(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + cache->need_tick_bio = true; + allow_background_work(cache); + do_waker(&cache->waker.work); +} + +static void emit_flags(struct cache *cache, char *result, + unsigned int maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + struct cache_features *cf = &cache->features; + unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; + + DMEMIT("%u ", count); + + if (cf->metadata_version == 2) + DMEMIT("metadata2 "); + + if (writethrough_mode(cache)) + DMEMIT("writethrough "); + + else if (passthrough_mode(cache)) + DMEMIT("passthrough "); + + else if (writeback_mode(cache)) + DMEMIT("writeback "); + + else { + DMEMIT("unknown "); + DMERR("%s: internal error: unknown io mode: %d", + cache_device_name(cache), (int) cf->io_mode); + } + + if (!cf->discard_passdown) + DMEMIT("no_discard_passdown "); + + *sz_ptr = sz; +} + +/* + * Status format: + * + * <#used metadata blocks>/<#total metadata blocks> + * <#used cache blocks>/<#total cache blocks> + * <#read hits> <#read misses> <#write hits> <#write misses> + * <#demotions> <#promotions> <#dirty> + * <#features> * + * <#core args> + * <#policy args> * + */ +static void cache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + int r = 0; + unsigned int i; + ssize_t sz = 0; + dm_block_t nr_free_blocks_metadata = 0; + dm_block_t nr_blocks_metadata = 0; + char buf[BDEVNAME_SIZE]; + struct cache *cache = ti->private; + dm_cblock_t residency; + bool needs_check; + + switch (type) { + case STATUSTYPE_INFO: + if (get_cache_mode(cache) == CM_FAIL) { + DMEMIT("Fail"); + break; + } + + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) + (void) commit(cache, false); + + r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); + if (r) { + DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", + cache_device_name(cache), r); + goto err; + } + + r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); + if (r) { + DMERR("%s: dm_cache_get_metadata_dev_size returned %d", + cache_device_name(cache), r); + goto err; + } + + residency = policy_residency(cache->policy); + + DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", + (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + (unsigned long long)cache->sectors_per_block, + (unsigned long long) from_cblock(residency), + (unsigned long long) from_cblock(cache->cache_size), + (unsigned int) atomic_read(&cache->stats.read_hit), + (unsigned int) atomic_read(&cache->stats.read_miss), + (unsigned int) atomic_read(&cache->stats.write_hit), + (unsigned int) atomic_read(&cache->stats.write_miss), + (unsigned int) atomic_read(&cache->stats.demotion), + (unsigned int) atomic_read(&cache->stats.promotion), + (unsigned long) atomic_read(&cache->nr_dirty)); + + emit_flags(cache, result, maxlen, &sz); + + DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); + + DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); + if (sz < maxlen) { + r = policy_emit_config_values(cache->policy, result, maxlen, &sz); + if (r) + DMERR("%s: policy_emit_config_values returned %d", + cache_device_name(cache), r); + } + + if (get_cache_mode(cache) == CM_READ_ONLY) + DMEMIT("ro "); + else + DMEMIT("rw "); + + r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); + + if (r || needs_check) + DMEMIT("needs_check "); + else + DMEMIT("- "); + + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, cache->cache_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, cache->origin_dev->bdev->bd_dev); + DMEMIT("%s", buf); + + for (i = 0; i < cache->nr_ctr_args - 1; i++) + DMEMIT(" %s", cache->ctr_args[i]); + if (cache->nr_ctr_args) + DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + if (get_cache_mode(cache) == CM_FAIL) + DMEMIT(",metadata_mode=fail"); + else if (get_cache_mode(cache) == CM_READ_ONLY) + DMEMIT(",metadata_mode=ro"); + else + DMEMIT(",metadata_mode=rw"); + + format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); + DMEMIT(",cache_metadata_device=%s", buf); + format_dev_t(buf, cache->cache_dev->bdev->bd_dev); + DMEMIT(",cache_device=%s", buf); + format_dev_t(buf, cache->origin_dev->bdev->bd_dev); + DMEMIT(",cache_origin_device=%s", buf); + DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); + DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); + DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); + DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); + DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); + DMEMIT(";"); + break; + } + + return; + +err: + DMEMIT("Error"); +} + +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + +/* + * A cache block range can take two forms: + * + * i) A single cblock, eg. '3456' + * ii) A begin and end cblock with a dash between, eg. 123-234 + */ +static int parse_cblock_range(struct cache *cache, const char *str, + struct cblock_range *result) +{ + char dummy; + uint64_t b, e; + int r; + + /* + * Try and parse form (ii) first. + */ + r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); + if (r < 0) + return r; + + if (r == 2) { + result->begin = to_cblock(b); + result->end = to_cblock(e); + return 0; + } + + /* + * That didn't work, try form (i). + */ + r = sscanf(str, "%llu%c", &b, &dummy); + if (r < 0) + return r; + + if (r == 1) { + result->begin = to_cblock(b); + result->end = to_cblock(from_cblock(result->begin) + 1u); + return 0; + } + + DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); + return -EINVAL; +} + +static int validate_cblock_range(struct cache *cache, struct cblock_range *range) +{ + uint64_t b = from_cblock(range->begin); + uint64_t e = from_cblock(range->end); + uint64_t n = from_cblock(cache->cache_size); + + if (b >= n) { + DMERR("%s: begin cblock out of range: %llu >= %llu", + cache_device_name(cache), b, n); + return -EINVAL; + } + + if (e > n) { + DMERR("%s: end cblock out of range: %llu > %llu", + cache_device_name(cache), e, n); + return -EINVAL; + } + + if (b >= e) { + DMERR("%s: invalid cblock range: %llu >= %llu", + cache_device_name(cache), b, e); + return -EINVAL; + } + + return 0; +} + +static inline dm_cblock_t cblock_succ(dm_cblock_t b) +{ + return to_cblock(from_cblock(b) + 1); +} + +static int request_invalidation(struct cache *cache, struct cblock_range *range) +{ + int r = 0; + + /* + * We don't need to do any locking here because we know we're in + * passthrough mode. There's is potential for a race between an + * invalidation triggered by an io and an invalidation message. This + * is harmless, we must not worry if the policy call fails. + */ + while (range->begin != range->end) { + r = invalidate_cblock(cache, range->begin); + if (r) + return r; + + range->begin = cblock_succ(range->begin); + } + + cache->commit_requested = true; + return r; +} + +static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count, + const char **cblock_ranges) +{ + int r = 0; + unsigned int i; + struct cblock_range range; + + if (!passthrough_mode(cache)) { + DMERR("%s: cache has to be in passthrough mode for invalidation", + cache_device_name(cache)); + return -EPERM; + } + + for (i = 0; i < count; i++) { + r = parse_cblock_range(cache, cblock_ranges[i], &range); + if (r) + break; + + r = validate_cblock_range(cache, &range); + if (r) + break; + + /* + * Pass begin and end origin blocks to the worker and wake it. + */ + r = request_invalidation(cache, &range); + if (r) + break; + } + + return r; +} + +/* + * Supports + * " " + * and + * "invalidate_cblocks [()|(-)]* + * + * The key migration_threshold is supported by the cache target core. + */ +static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct cache *cache = ti->private; + + if (!argc) + return -EINVAL; + + if (get_cache_mode(cache) >= CM_READ_ONLY) { + DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", + cache_device_name(cache)); + return -EOPNOTSUPP; + } + + if (!strcasecmp(argv[0], "invalidate_cblocks")) + return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); + + if (argc != 2) + return -EINVAL; + + return set_config_value(cache, argv[0], argv[1]); +} + +static int cache_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + int r = 0; + struct cache *cache = ti->private; + + r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); + if (!r) + r = fn(ti, cache->origin_dev, 0, ti->len, data); + + return r; +} + +/* + * If discard_passdown was enabled verify that the origin device + * supports discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct cache *cache) +{ + struct block_device *origin_bdev = cache->origin_dev->bdev; + struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + const char *reason = NULL; + + if (!cache->features.discard_passdown) + return; + + if (!bdev_max_discard_sectors(origin_bdev)) + reason = "discard unsupported"; + + else if (origin_limits->max_discard_sectors < cache->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + if (reason) { + DMWARN("Origin device (%pg) %s: Disabling discard passdown.", + origin_bdev, reason); + cache->features.discard_passdown = false; + } +} + +static void set_discard_limits(struct cache *cache, struct queue_limits *limits) +{ + struct block_device *origin_bdev = cache->origin_dev->bdev; + struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + + if (!cache->features.discard_passdown) { + /* No passdown is done so setting own virtual limits */ + limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, + cache->origin_sectors); + limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; + return; + } + + /* + * cache_iterate_devices() is stacking both origin and fast device limits + * but discards aren't passed to fast device, so inherit origin's limits. + */ + limits->max_discard_sectors = origin_limits->max_discard_sectors; + limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; + limits->discard_granularity = origin_limits->discard_granularity; + limits->discard_alignment = origin_limits->discard_alignment; + limits->discard_misaligned = origin_limits->discard_misaligned; +} + +static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct cache *cache = ti->private; + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with the + * cache's blocksize (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < cache->sectors_per_block || + do_div(io_opt_sectors, cache->sectors_per_block)) { + blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); + blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); + } + + disable_passdown_if_not_supported(cache); + set_discard_limits(cache, limits); +} + +/*----------------------------------------------------------------*/ + +static struct target_type cache_target = { + .name = "cache", + .version = {2, 2, 0}, + .module = THIS_MODULE, + .ctr = cache_ctr, + .dtr = cache_dtr, + .map = cache_map, + .end_io = cache_end_io, + .postsuspend = cache_postsuspend, + .preresume = cache_preresume, + .resume = cache_resume, + .status = cache_status, + .message = cache_message, + .iterate_devices = cache_iterate_devices, + .io_hints = cache_io_hints, +}; + +static int __init dm_cache_init(void) +{ + int r; + + migration_cache = KMEM_CACHE(dm_cache_migration, 0); + if (!migration_cache) + return -ENOMEM; + + r = dm_register_target(&cache_target); + if (r) { + DMERR("cache target registration failed: %d", r); + kmem_cache_destroy(migration_cache); + return r; + } + + return 0; +} + +static void __exit dm_cache_exit(void) +{ + dm_unregister_target(&cache_target); + kmem_cache_destroy(migration_cache); +} + +module_init(dm_cache_init); +module_exit(dm_cache_exit); + +MODULE_DESCRIPTION(DM_NAME " cache target"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c new file mode 100644 index 000000000..c43d55672 --- /dev/null +++ b/drivers/md/dm-clone-metadata.c @@ -0,0 +1,1028 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-transaction-manager.h" + +#include "dm-clone-metadata.h" + +#define DM_MSG_PREFIX "clone metadata" + +#define SUPERBLOCK_LOCATION 0 +#define SUPERBLOCK_MAGIC 0x8af27f64 +#define SUPERBLOCK_CSUM_XOR 257649492 + +#define DM_CLONE_MAX_CONCURRENT_LOCKS 5 + +#define UUID_LEN 16 + +/* Min and max dm-clone metadata versions supported */ +#define DM_CLONE_MIN_METADATA_VERSION 1 +#define DM_CLONE_MAX_METADATA_VERSION 1 + +/* + * On-disk metadata layout + */ +struct superblock_disk { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[UUID_LEN]; + __le64 magic; + __le32 version; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + __le64 region_size; + __le64 target_size; + + __le64 bitset_root; +} __packed; + +/* + * Region and Dirty bitmaps. + * + * dm-clone logically splits the source and destination devices in regions of + * fixed size. The destination device's regions are gradually hydrated, i.e., + * we copy (clone) the source's regions to the destination device. Eventually, + * all regions will get hydrated and all I/O will be served from the + * destination device. + * + * We maintain an on-disk bitmap which tracks the state of each of the + * destination device's regions, i.e., whether they are hydrated or not. + * + * To save constantly doing look ups on disk we keep an in core copy of the + * on-disk bitmap, the region_map. + * + * In order to track which regions are hydrated during a metadata transaction, + * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two + * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap + * tracks the regions that got hydrated during the current metadata + * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of + * the dirty_regions bitmap. + * + * This allows us to precisely track the regions that were hydrated during the + * current metadata transaction and update the metadata accordingly, when we + * commit the current transaction. This is important because dm-clone should + * only commit the metadata of regions that were properly flushed to the + * destination device beforehand. Otherwise, in case of a crash, we could end + * up with a corrupted dm-clone device. + * + * When a region finishes hydrating dm-clone calls + * dm_clone_set_region_hydrated(), or for discard requests + * dm_clone_cond_set_range(), which sets the corresponding bits in region_map + * and dmap. + * + * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions + * and update the on-disk metadata accordingly. Thus, we don't have to flush to + * disk the whole region_map. We can just flush the dirty region_map bits. + * + * We use the helper dmap->dirty_words bitmap, which is smaller than the + * original region_map, to reduce the amount of memory accesses during a + * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in + * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk + * accesses. + * + * We could update directly the on-disk bitmap, when dm-clone calls either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), buts this + * inserts significant metadata I/O overhead in dm-clone's I/O path. Also, as + * these two functions don't block, we can call them in interrupt context, + * e.g., in a hooked overwrite bio's completion routine, and further reduce the + * I/O completion latency. + * + * We maintain two dirty bitmap sets. During a metadata commit we atomically + * swap the currently used dmap with the unused one. This allows the metadata + * update functions to run concurrently with an ongoing commit. + */ +struct dirty_map { + unsigned long *dirty_words; + unsigned long *dirty_regions; + unsigned int changed; +}; + +struct dm_clone_metadata { + /* The metadata block device */ + struct block_device *bdev; + + sector_t target_size; + sector_t region_size; + unsigned long nr_regions; + unsigned long nr_words; + + /* Spinlock protecting the region and dirty bitmaps. */ + spinlock_t bitmap_lock; + struct dirty_map dmap[2]; + struct dirty_map *current_dmap; + + /* Protected by lock */ + struct dirty_map *committing_dmap; + + /* + * In core copy of the on-disk bitmap to save constantly doing look ups + * on disk. + */ + unsigned long *region_map; + + /* Protected by bitmap_lock */ + unsigned int read_only; + + struct dm_block_manager *bm; + struct dm_space_map *sm; + struct dm_transaction_manager *tm; + + struct rw_semaphore lock; + + struct dm_disk_bitset bitset_info; + dm_block_t bitset_root; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. + */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + bool hydration_done:1; + bool fail_io:1; +}; + +/*---------------------------------------------------------------------------*/ + +/* + * Superblock validation. + */ +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, size_t sb_block_size) +{ + struct superblock_disk *sb; + u32 csum; + + sb = dm_block_data(b); + sb->blocknr = cpu_to_le64(dm_block_location(b)); + + csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR); + sb->csum = cpu_to_le32(csum); +} + +static int sb_check(struct dm_block_validator *v, struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *sb; + u32 csum, metadata_version; + + sb = dm_block_data(b); + + if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) { + DMERR("Superblock check failed: blocknr %llu, expected %llu", + le64_to_cpu(sb->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) { + DMERR("Superblock check failed: magic %llu, expected %llu", + le64_to_cpu(sb->magic), + (unsigned long long)SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR); + if (sb->csum != cpu_to_le32(csum)) { + DMERR("Superblock check failed: checksum %u, expected %u", + csum, le32_to_cpu(sb->csum)); + return -EILSEQ; + } + + /* Check metadata version */ + metadata_version = le32_to_cpu(sb->version); + if (metadata_version < DM_CLONE_MIN_METADATA_VERSION || + metadata_version > DM_CLONE_MAX_METADATA_VERSION) { + DMERR("Clone metadata version %u found, but only versions between %u and %u supported.", + metadata_version, DM_CLONE_MIN_METADATA_VERSION, + DM_CLONE_MAX_METADATA_VERSION); + return -EINVAL; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/* + * Check if the superblock is formatted or not. We consider the superblock to + * be formatted in case we find non-zero bytes in it. + */ +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted) +{ + int r; + unsigned int i, nr_words; + struct dm_block *sblock; + __le64 *data_le, zero = cpu_to_le64(0); + + /* + * We don't use a validator here because the superblock could be all + * zeroes. + */ + r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock); + if (r) { + DMERR("Failed to read_lock superblock"); + return r; + } + + data_le = dm_block_data(sblock); + *formatted = false; + + /* This assumes that the block size is a multiple of 8 bytes */ + BUG_ON(dm_bm_block_size(bm) % sizeof(__le64)); + nr_words = dm_bm_block_size(bm) / sizeof(__le64); + for (i = 0; i < nr_words; i++) { + if (data_le[i] != zero) { + *formatted = true; + break; + } + } + + dm_bm_unlock(sblock); + + return 0; +} + +/*---------------------------------------------------------------------------*/ + +/* + * Low-level metadata handling. + */ +static inline int superblock_read_lock(struct dm_clone_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); +} + +static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); +} + +static int __copy_sm_root(struct dm_clone_metadata *cmd) +{ + int r; + size_t root_size; + + r = dm_sm_root_size(cmd->sm, &root_size); + if (r) + return r; + + return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size); +} + +/* Save dm-clone metadata in superblock */ +static void __prepare_superblock(struct dm_clone_metadata *cmd, + struct superblock_disk *sb) +{ + sb->flags = cpu_to_le32(0UL); + + /* FIXME: UUID is currently unused */ + memset(sb->uuid, 0, sizeof(sb->uuid)); + + sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC); + sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION); + + /* Save the metadata space_map root */ + memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root, + sizeof(cmd->metadata_space_map_root)); + + sb->region_size = cpu_to_le64(cmd->region_size); + sb->target_size = cpu_to_le64(cmd->target_size); + sb->bitset_root = cpu_to_le64(cmd->bitset_root); +} + +static int __open_metadata(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + r = superblock_read_lock(cmd, &sblock); + + if (r) { + DMERR("Failed to read_lock superblock"); + return r; + } + + sb = dm_block_data(sblock); + + /* Verify that target_size and region_size haven't changed. */ + if (cmd->region_size != le64_to_cpu(sb->region_size) || + cmd->target_size != le64_to_cpu(sb->target_size)) { + DMERR("Region and/or target size don't match the ones in metadata"); + r = -EINVAL; + goto out_with_lock; + } + + r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION, + sb->metadata_space_map_root, + sizeof(sb->metadata_space_map_root), + &cmd->tm, &cmd->sm); + + if (r) { + DMERR("dm_tm_open_with_sm failed"); + goto out_with_lock; + } + + dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); + cmd->bitset_root = le64_to_cpu(sb->bitset_root); + +out_with_lock: + dm_bm_unlock(sblock); + + return r; +} + +static int __format_metadata(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm); + if (r) { + DMERR("Failed to create transaction manager"); + return r; + } + + dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); + + r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root); + if (r) { + DMERR("Failed to create empty on-disk bitset"); + goto err_with_tm; + } + + r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0, + cmd->nr_regions, false, &cmd->bitset_root); + if (r) { + DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions); + goto err_with_tm; + } + + /* Flush to disk all blocks, except the superblock */ + r = dm_tm_pre_commit(cmd->tm); + if (r) { + DMERR("dm_tm_pre_commit failed"); + goto err_with_tm; + } + + r = __copy_sm_root(cmd); + if (r) { + DMERR("__copy_sm_root failed"); + goto err_with_tm; + } + + r = superblock_write_lock_zero(cmd, &sblock); + if (r) { + DMERR("Failed to write_lock superblock"); + goto err_with_tm; + } + + sb = dm_block_data(sblock); + __prepare_superblock(cmd, sb); + r = dm_tm_commit(cmd->tm, sblock); + if (r) { + DMERR("Failed to commit superblock"); + goto err_with_tm; + } + + return 0; + +err_with_tm: + dm_sm_destroy(cmd->sm); + dm_tm_destroy(cmd->tm); + + return r; +} + +static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device) +{ + int r; + bool formatted = false; + + r = __superblock_all_zeroes(cmd->bm, &formatted); + if (r) + return r; + + if (!formatted) + return may_format_device ? __format_metadata(cmd) : -EPERM; + + return __open_metadata(cmd); +} + +static int __create_persistent_data_structures(struct dm_clone_metadata *cmd, + bool may_format_device) +{ + int r; + + /* Create block manager */ + cmd->bm = dm_block_manager_create(cmd->bdev, + DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + DM_CLONE_MAX_CONCURRENT_LOCKS); + if (IS_ERR(cmd->bm)) { + DMERR("Failed to create block manager"); + return PTR_ERR(cmd->bm); + } + + r = __open_or_format_metadata(cmd, may_format_device); + if (r) + dm_block_manager_destroy(cmd->bm); + + return r; +} + +static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd) +{ + dm_sm_destroy(cmd->sm); + dm_tm_destroy(cmd->tm); + dm_block_manager_destroy(cmd->bm); +} + +/*---------------------------------------------------------------------------*/ + +static size_t bitmap_size(unsigned long nr_bits) +{ + return BITS_TO_LONGS(nr_bits) * sizeof(long); +} + +static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words, + unsigned long nr_regions) +{ + dmap->changed = 0; + + dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL); + if (!dmap->dirty_words) + return -ENOMEM; + + dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL); + if (!dmap->dirty_regions) { + kvfree(dmap->dirty_words); + return -ENOMEM; + } + + return 0; +} + +static void __dirty_map_exit(struct dirty_map *dmap) +{ + kvfree(dmap->dirty_words); + kvfree(dmap->dirty_regions); +} + +static int dirty_map_init(struct dm_clone_metadata *cmd) +{ + if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) { + DMERR("Failed to allocate dirty bitmap"); + return -ENOMEM; + } + + if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) { + DMERR("Failed to allocate dirty bitmap"); + __dirty_map_exit(&cmd->dmap[0]); + return -ENOMEM; + } + + cmd->current_dmap = &cmd->dmap[0]; + cmd->committing_dmap = NULL; + + return 0; +} + +static void dirty_map_exit(struct dm_clone_metadata *cmd) +{ + __dirty_map_exit(&cmd->dmap[0]); + __dirty_map_exit(&cmd->dmap[1]); +} + +static int __load_bitset_in_core(struct dm_clone_metadata *cmd) +{ + int r; + unsigned long i; + struct dm_bitset_cursor c; + + /* Flush bitset cache */ + r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); + if (r) + return r; + + r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c); + if (r) + return r; + + for (i = 0; ; i++) { + if (dm_bitset_cursor_get_value(&c)) + __set_bit(i, cmd->region_map); + else + __clear_bit(i, cmd->region_map); + + if (i >= (cmd->nr_regions - 1)) + break; + + r = dm_bitset_cursor_next(&c); + + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + return r; +} + +struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, + sector_t target_size, + sector_t region_size) +{ + int r; + struct dm_clone_metadata *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + DMERR("Failed to allocate memory for dm-clone metadata"); + return ERR_PTR(-ENOMEM); + } + + cmd->bdev = bdev; + cmd->target_size = target_size; + cmd->region_size = region_size; + cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size); + cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions); + + init_rwsem(&cmd->lock); + spin_lock_init(&cmd->bitmap_lock); + cmd->read_only = 0; + cmd->fail_io = false; + cmd->hydration_done = false; + + cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL); + if (!cmd->region_map) { + DMERR("Failed to allocate memory for region bitmap"); + r = -ENOMEM; + goto out_with_md; + } + + r = __create_persistent_data_structures(cmd, true); + if (r) + goto out_with_region_map; + + r = __load_bitset_in_core(cmd); + if (r) { + DMERR("Failed to load on-disk region map"); + goto out_with_pds; + } + + r = dirty_map_init(cmd); + if (r) + goto out_with_pds; + + if (bitmap_full(cmd->region_map, cmd->nr_regions)) + cmd->hydration_done = true; + + return cmd; + +out_with_pds: + __destroy_persistent_data_structures(cmd); + +out_with_region_map: + kvfree(cmd->region_map); + +out_with_md: + kfree(cmd); + + return ERR_PTR(r); +} + +void dm_clone_metadata_close(struct dm_clone_metadata *cmd) +{ + if (!cmd->fail_io) + __destroy_persistent_data_structures(cmd); + + dirty_map_exit(cmd); + kvfree(cmd->region_map); + kfree(cmd); +} + +bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd) +{ + return cmd->hydration_done; +} + +bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) +{ + return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map); +} + +bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, + unsigned long start, unsigned long nr_regions) +{ + unsigned long bit; + + if (dm_clone_is_hydration_done(cmd)) + return true; + + bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); + + return (bit >= (start + nr_regions)); +} + +unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd) +{ + return bitmap_weight(cmd->region_map, cmd->nr_regions); +} + +unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, + unsigned long start) +{ + return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); +} + +static int __update_metadata_word(struct dm_clone_metadata *cmd, + unsigned long *dirty_regions, + unsigned long word) +{ + int r; + unsigned long index = word * BITS_PER_LONG; + unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG); + + while (index < max_index) { + if (test_bit(index, dirty_regions)) { + r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root, + index, &cmd->bitset_root); + if (r) { + DMERR("dm_bitset_set_bit failed"); + return r; + } + __clear_bit(index, dirty_regions); + } + index++; + } + + return 0; +} + +static int __metadata_commit(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + /* Flush bitset cache */ + r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); + if (r) { + DMERR("dm_bitset_flush failed"); + return r; + } + + /* Flush to disk all blocks, except the superblock */ + r = dm_tm_pre_commit(cmd->tm); + if (r) { + DMERR("dm_tm_pre_commit failed"); + return r; + } + + /* Save the space map root in cmd->metadata_space_map_root */ + r = __copy_sm_root(cmd); + if (r) { + DMERR("__copy_sm_root failed"); + return r; + } + + /* Lock the superblock */ + r = superblock_write_lock_zero(cmd, &sblock); + if (r) { + DMERR("Failed to write_lock superblock"); + return r; + } + + /* Save the metadata in superblock */ + sb = dm_block_data(sblock); + __prepare_superblock(cmd, sb); + + /* Unlock superblock and commit it to disk */ + r = dm_tm_commit(cmd->tm, sblock); + if (r) { + DMERR("Failed to commit superblock"); + return r; + } + + /* + * FIXME: Find a more efficient way to check if the hydration is done. + */ + if (bitmap_full(cmd->region_map, cmd->nr_regions)) + cmd->hydration_done = true; + + return 0; +} + +static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) +{ + int r; + unsigned long word; + + word = 0; + do { + word = find_next_bit(dmap->dirty_words, cmd->nr_words, word); + + if (word == cmd->nr_words) + break; + + r = __update_metadata_word(cmd, dmap->dirty_regions, word); + + if (r) + return r; + + __clear_bit(word, dmap->dirty_words); + word++; + } while (word < cmd->nr_words); + + r = __metadata_commit(cmd); + + if (r) + return r; + + /* Update the changed flag */ + spin_lock_irq(&cmd->bitmap_lock); + dmap->changed = 0; + spin_unlock_irq(&cmd->bitmap_lock); + + return 0; +} + +int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd) +{ + int r = 0; + struct dirty_map *dmap, *next_dmap; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { + r = -EPERM; + goto out; + } + + /* Get current dirty bitmap */ + dmap = cmd->current_dmap; + + /* Get next dirty bitmap */ + next_dmap = (dmap == &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0]; + + /* + * The last commit failed, so we don't have a clean dirty-bitmap to + * use. + */ + if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) { + r = -EINVAL; + goto out; + } + + /* Swap dirty bitmaps */ + spin_lock_irq(&cmd->bitmap_lock); + cmd->current_dmap = next_dmap; + spin_unlock_irq(&cmd->bitmap_lock); + + /* Set old dirty bitmap as currently committing */ + cmd->committing_dmap = dmap; +out: + up_write(&cmd->lock); + + return r; +} + +int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) +{ + int r = -EPERM; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) + goto out; + + if (WARN_ON(!cmd->committing_dmap)) { + r = -EINVAL; + goto out; + } + + r = __flush_dmap(cmd, cmd->committing_dmap); + if (!r) { + /* Clear committing dmap */ + cmd->committing_dmap = NULL; + } +out: + up_write(&cmd->lock); + + return r; +} + +int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) +{ + int r = 0; + struct dirty_map *dmap; + unsigned long word, flags; + + if (unlikely(region_nr >= cmd->nr_regions)) { + DMERR("Region %lu out of range (total number of regions %lu)", + region_nr, cmd->nr_regions); + return -ERANGE; + } + + word = region_nr / BITS_PER_LONG; + + spin_lock_irqsave(&cmd->bitmap_lock, flags); + + if (cmd->read_only) { + r = -EPERM; + goto out; + } + + dmap = cmd->current_dmap; + + __set_bit(word, dmap->dirty_words); + __set_bit(region_nr, dmap->dirty_regions); + __set_bit(region_nr, cmd->region_map); + dmap->changed = 1; + +out: + spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + + return r; +} + +int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, + unsigned long nr_regions) +{ + int r = 0; + struct dirty_map *dmap; + unsigned long word, region_nr; + + if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start || + (start + nr_regions) > cmd->nr_regions)) { + DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)", + start, nr_regions, cmd->nr_regions); + return -ERANGE; + } + + spin_lock_irq(&cmd->bitmap_lock); + + if (cmd->read_only) { + r = -EPERM; + goto out; + } + + dmap = cmd->current_dmap; + for (region_nr = start; region_nr < (start + nr_regions); region_nr++) { + if (!test_bit(region_nr, cmd->region_map)) { + word = region_nr / BITS_PER_LONG; + __set_bit(word, dmap->dirty_words); + __set_bit(region_nr, dmap->dirty_regions); + __set_bit(region_nr, cmd->region_map); + dmap->changed = 1; + } + } +out: + spin_unlock_irq(&cmd->bitmap_lock); + + return r; +} + +/* + * WARNING: This must not be called concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes + * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only + * exception is after setting the metadata to read-only mode, using + * dm_clone_metadata_set_read_only(). + * + * We don't take the spinlock because __load_bitset_in_core() does I/O, so it + * may block. + */ +int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd) +{ + int r = -EINVAL; + + down_write(&cmd->lock); + + if (cmd->fail_io) + goto out; + + r = __load_bitset_in_core(cmd); +out: + up_write(&cmd->lock); + + return r; +} + +bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&cmd->bitmap_lock, flags); + r = cmd->dmap[0].changed || cmd->dmap[1].changed; + spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + + return r; +} + +int dm_clone_metadata_abort(struct dm_clone_metadata *cmd) +{ + int r = -EPERM; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) + goto out; + + __destroy_persistent_data_structures(cmd); + + r = __create_persistent_data_structures(cmd, false); + if (r) { + /* If something went wrong we can neither write nor read the metadata */ + cmd->fail_io = true; + } +out: + up_write(&cmd->lock); + + return r; +} + +void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) +{ + down_write(&cmd->lock); + + spin_lock_irq(&cmd->bitmap_lock); + cmd->read_only = 1; + spin_unlock_irq(&cmd->bitmap_lock); + + if (!cmd->fail_io) + dm_bm_set_read_only(cmd->bm); + + up_write(&cmd->lock); +} + +void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) +{ + down_write(&cmd->lock); + + spin_lock_irq(&cmd->bitmap_lock); + cmd->read_only = 0; + spin_unlock_irq(&cmd->bitmap_lock); + + if (!cmd->fail_io) + dm_bm_set_read_write(cmd->bm); + + up_write(&cmd->lock); +} + +int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->lock); + + if (!cmd->fail_io) + r = dm_sm_get_nr_free(cmd->sm, result); + + up_read(&cmd->lock); + + return r; +} + +int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->lock); + + if (!cmd->fail_io) + r = dm_sm_get_nr_blocks(cmd->sm, result); + + up_read(&cmd->lock); + + return r; +} diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h new file mode 100644 index 000000000..d848b8799 --- /dev/null +++ b/drivers/md/dm-clone-metadata.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. + */ + +#ifndef DM_CLONE_METADATA_H +#define DM_CLONE_METADATA_H + +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map-metadata.h" + +#define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE + +/* + * The metadata device is currently limited in size. + */ +#define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +#define SPACE_MAP_ROOT_SIZE 128 + +/* dm-clone metadata */ +struct dm_clone_metadata; + +/* + * Set region status to hydrated. + * + * @cmd: The dm-clone metadata + * @region_nr: The region number + * + * This function doesn't block, so it's safe to call it from interrupt context. + */ +int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); + +/* + * Set status of all regions in the provided range to hydrated, if not already + * hydrated. + * + * @cmd: The dm-clone metadata + * @start: Starting region number + * @nr_regions: Number of regions in the range + * + * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq() + * it's NOT safe to call it from any context where interrupts are disabled, e.g., + * from interrupt context. + */ +int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, + unsigned long nr_regions); + +/* + * Read existing or create fresh metadata. + * + * @bdev: The device storing the metadata + * @target_size: The target size + * @region_size: The region size + * + * @returns: The dm-clone metadata + * + * This function reads the superblock of @bdev and checks if it's all zeroes. + * If it is, it formats @bdev and creates fresh metadata. If it isn't, it + * validates the metadata stored in @bdev. + */ +struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, + sector_t target_size, + sector_t region_size); + +/* + * Free the resources related to metadata management. + */ +void dm_clone_metadata_close(struct dm_clone_metadata *cmd); + +/* + * Commit dm-clone metadata to disk. + * + * We use a two phase commit: + * + * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for + * committing. After this is called, all subsequent metadata updates, done + * through either dm_clone_set_region_hydrated() or + * dm_clone_cond_set_range(), will be part of the **next** transaction. + * + * 2. dm_clone_metadata_commit(): Actually commit the current transaction to + * disk and start a new transaction. + * + * This allows dm-clone to flush the destination device after step (1) to + * ensure that all freshly hydrated regions, for which we are updating the + * metadata, are properly written to non-volatile storage and won't be lost in + * case of a crash. + */ +int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd); +int dm_clone_metadata_commit(struct dm_clone_metadata *cmd); + +/* + * Reload the in core copy of the on-disk bitmap. + * + * This should be used after aborting a metadata transaction and setting the + * metadata to read-only, to invalidate the in-core cache and make it match the + * on-disk metadata. + * + * WARNING: It must not be called concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates + * the region bitmap without taking the relevant spinlock. We don't take the + * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block. + * + * But, it's safe to use it after calling dm_clone_metadata_set_read_only(), + * because the latter sets the metadata to read-only mode. Both + * dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse to touch + * the region bitmap, after calling dm_clone_metadata_set_read_only(). + */ +int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd); + +/* + * Check whether dm-clone's metadata changed this transaction. + */ +bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd); + +/* + * Abort current metadata transaction and rollback metadata to the last + * committed transaction. + */ +int dm_clone_metadata_abort(struct dm_clone_metadata *cmd); + +/* + * Switches metadata to a read only mode. Once read-only mode has been entered + * the following functions will return -EPERM: + * + * dm_clone_metadata_pre_commit() + * dm_clone_metadata_commit() + * dm_clone_set_region_hydrated() + * dm_clone_cond_set_range() + * dm_clone_metadata_abort() + */ +void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd); +void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd); + +/* + * Returns true if the hydration of the destination device is finished. + */ +bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd); + +/* + * Returns true if region @region_nr is hydrated. + */ +bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); + +/* + * Returns true if all the regions in the range are hydrated. + */ +bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, + unsigned long start, unsigned long nr_regions); + +/* + * Returns the number of hydrated regions. + */ +unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd); + +/* + * Returns the first unhydrated region with region_nr >= @start + */ +unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, + unsigned long start); + +/* + * Get the number of free metadata blocks. + */ +int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result); + +/* + * Get the total number of metadata blocks. + */ +int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result); + +#endif /* DM_CLONE_METADATA_H */ diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c new file mode 100644 index 000000000..e088081b7 --- /dev/null +++ b/drivers/md/dm-clone-target.c @@ -0,0 +1,2229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-clone-metadata.h" + +#define DM_MSG_PREFIX "clone" + +/* + * Minimum and maximum allowed region sizes + */ +#define MIN_REGION_SIZE (1 << 3) /* 4KB */ +#define MAX_REGION_SIZE (1 << 21) /* 1GB */ + +#define MIN_HYDRATIONS 256 /* Size of hydration mempool */ +#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */ +#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */ + +#define COMMIT_PERIOD HZ /* 1 sec */ + +/* + * Hydration hash table size: 1 << HASH_TABLE_BITS + */ +#define HASH_TABLE_BITS 15 + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle, + "A percentage of time allocated for hydrating regions"); + +/* Slab cache for struct dm_clone_region_hydration */ +static struct kmem_cache *_hydration_cache; + +/* dm-clone metadata modes */ +enum clone_metadata_mode { + CM_WRITE, /* metadata may be changed */ + CM_READ_ONLY, /* metadata may not be changed */ + CM_FAIL, /* all metadata I/O fails */ +}; + +struct hash_table_bucket; + +struct clone { + struct dm_target *ti; + + struct dm_dev *metadata_dev; + struct dm_dev *dest_dev; + struct dm_dev *source_dev; + + unsigned long nr_regions; + sector_t region_size; + unsigned int region_shift; + + /* + * A metadata commit and the actions taken in case it fails should run + * as a single atomic step. + */ + struct mutex commit_lock; + + struct dm_clone_metadata *cmd; + + /* Region hydration hash table */ + struct hash_table_bucket *ht; + + atomic_t ios_in_flight; + + wait_queue_head_t hydration_stopped; + + mempool_t hydration_pool; + + unsigned long last_commit_jiffies; + + /* + * We defer incoming WRITE bios for regions that are not hydrated, + * until after these regions have been hydrated. + * + * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the + * metadata have been committed. + */ + spinlock_t lock; + struct bio_list deferred_bios; + struct bio_list deferred_discard_bios; + struct bio_list deferred_flush_bios; + struct bio_list deferred_flush_completions; + + /* Maximum number of regions being copied during background hydration. */ + unsigned int hydration_threshold; + + /* Number of regions to batch together during background hydration. */ + unsigned int hydration_batch_size; + + /* Which region to hydrate next */ + unsigned long hydration_offset; + + atomic_t hydrations_in_flight; + + /* + * Save a copy of the table line rather than reconstructing it for the + * status. + */ + unsigned int nr_ctr_args; + const char **ctr_args; + + struct workqueue_struct *wq; + struct work_struct worker; + struct delayed_work waker; + + struct dm_kcopyd_client *kcopyd_client; + + enum clone_metadata_mode mode; + unsigned long flags; +}; + +/* + * dm-clone flags + */ +#define DM_CLONE_DISCARD_PASSDOWN 0 +#define DM_CLONE_HYDRATION_ENABLED 1 +#define DM_CLONE_HYDRATION_SUSPENDED 2 + +/*---------------------------------------------------------------------------*/ + +/* + * Metadata failure handling. + */ +static enum clone_metadata_mode get_clone_mode(struct clone *clone) +{ + return READ_ONCE(clone->mode); +} + +static const char *clone_device_name(struct clone *clone) +{ + return dm_table_device_name(clone->ti->table); +} + +static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) +{ + static const char * const descs[] = { + "read-write", + "read-only", + "fail" + }; + + enum clone_metadata_mode old_mode = get_clone_mode(clone); + + /* Never move out of fail mode */ + if (old_mode == CM_FAIL) + new_mode = CM_FAIL; + + switch (new_mode) { + case CM_FAIL: + case CM_READ_ONLY: + dm_clone_metadata_set_read_only(clone->cmd); + break; + + case CM_WRITE: + dm_clone_metadata_set_read_write(clone->cmd); + break; + } + + WRITE_ONCE(clone->mode, new_mode); + + if (new_mode != old_mode) { + dm_table_event(clone->ti->table); + DMINFO("%s: Switching to %s mode", clone_device_name(clone), + descs[(int)new_mode]); + } +} + +static void __abort_transaction(struct clone *clone) +{ + const char *dev_name = clone_device_name(clone); + + if (get_clone_mode(clone) >= CM_READ_ONLY) + return; + + DMERR("%s: Aborting current metadata transaction", dev_name); + if (dm_clone_metadata_abort(clone->cmd)) { + DMERR("%s: Failed to abort metadata transaction", dev_name); + __set_clone_mode(clone, CM_FAIL); + } +} + +static void __reload_in_core_bitset(struct clone *clone) +{ + const char *dev_name = clone_device_name(clone); + + if (get_clone_mode(clone) == CM_FAIL) + return; + + /* Reload the on-disk bitset */ + DMINFO("%s: Reloading on-disk bitmap", dev_name); + if (dm_clone_reload_in_core_bitset(clone->cmd)) { + DMERR("%s: Failed to reload on-disk bitmap", dev_name); + __set_clone_mode(clone, CM_FAIL); + } +} + +static void __metadata_operation_failed(struct clone *clone, const char *op, int r) +{ + DMERR("%s: Metadata operation `%s' failed: error = %d", + clone_device_name(clone), op, r); + + __abort_transaction(clone); + __set_clone_mode(clone, CM_READ_ONLY); + + /* + * dm_clone_reload_in_core_bitset() may run concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but + * it's safe as we have already set the metadata to read-only mode. + */ + __reload_in_core_bitset(clone); +} + +/*---------------------------------------------------------------------------*/ + +/* Wake up anyone waiting for region hydrations to stop */ +static inline void wakeup_hydration_waiters(struct clone *clone) +{ + wake_up_all(&clone->hydration_stopped); +} + +static inline void wake_worker(struct clone *clone) +{ + queue_work(clone->wq, &clone->worker); +} + +/*---------------------------------------------------------------------------*/ + +/* + * bio helper functions. + */ +static inline void remap_to_source(struct clone *clone, struct bio *bio) +{ + bio_set_dev(bio, clone->source_dev->bdev); +} + +static inline void remap_to_dest(struct clone *clone, struct bio *bio) +{ + bio_set_dev(bio, clone->dest_dev->bdev); +} + +static bool bio_triggers_commit(struct clone *clone, struct bio *bio) +{ + return op_is_flush(bio->bi_opf) && + dm_clone_changed_this_transaction(clone->cmd); +} + +/* Get the address of the region in sectors */ +static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) +{ + return ((sector_t)region_nr << clone->region_shift); +} + +/* Get the region number of the bio */ +static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) +{ + return (bio->bi_iter.bi_sector >> clone->region_shift); +} + +/* Get the region range covered by the bio */ +static void bio_region_range(struct clone *clone, struct bio *bio, + unsigned long *rs, unsigned long *nr_regions) +{ + unsigned long end; + + *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); + end = bio_end_sector(bio) >> clone->region_shift; + + if (*rs >= end) + *nr_regions = 0; + else + *nr_regions = end - *rs; +} + +/* Check whether a bio overwrites a region */ +static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size); +} + +static void fail_bios(struct bio_list *bios, blk_status_t status) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bios))) { + bio->bi_status = status; + bio_endio(bio); + } +} + +static void submit_bios(struct bio_list *bios) +{ + struct bio *bio; + struct blk_plug plug; + + blk_start_plug(&plug); + + while ((bio = bio_list_pop(bios))) + submit_bio_noacct(bio); + + blk_finish_plug(&plug); +} + +/* + * Submit bio to the underlying device. + * + * If the bio triggers a commit, delay it, until after the metadata have been + * committed. + * + * NOTE: The bio remapping must be performed by the caller. + */ +static void issue_bio(struct clone *clone, struct bio *bio) +{ + if (!bio_triggers_commit(clone, bio)) { + submit_bio_noacct(bio); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to commit the + * metadata, so we complete the bio with an error. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a single + * commit for them in process_deferred_flush_bios(). + */ + spin_lock_irq(&clone->lock); + bio_list_add(&clone->deferred_flush_bios, bio); + spin_unlock_irq(&clone->lock); + + wake_worker(clone); +} + +/* + * Remap bio to the destination device and submit it. + * + * If the bio triggers a commit, delay it, until after the metadata have been + * committed. + */ +static void remap_and_issue(struct clone *clone, struct bio *bio) +{ + remap_to_dest(clone, bio); + issue_bio(clone, bio); +} + +/* + * Issue bios that have been deferred until after their region has finished + * hydrating. + * + * We delegate the bio submission to the worker thread, so this is safe to call + * from interrupt context. + */ +static void issue_deferred_bios(struct clone *clone, struct bio_list *bios) +{ + struct bio *bio; + unsigned long flags; + struct bio_list flush_bios = BIO_EMPTY_LIST; + struct bio_list normal_bios = BIO_EMPTY_LIST; + + if (bio_list_empty(bios)) + return; + + while ((bio = bio_list_pop(bios))) { + if (bio_triggers_commit(clone, bio)) + bio_list_add(&flush_bios, bio); + else + bio_list_add(&normal_bios, bio); + } + + spin_lock_irqsave(&clone->lock, flags); + bio_list_merge(&clone->deferred_bios, &normal_bios); + bio_list_merge(&clone->deferred_flush_bios, &flush_bios); + spin_unlock_irqrestore(&clone->lock, flags); + + wake_worker(clone); +} + +static void complete_overwrite_bio(struct clone *clone, struct bio *bio) +{ + unsigned long flags; + + /* + * If the bio has the REQ_FUA flag set we must commit the metadata + * before signaling its completion. + * + * complete_overwrite_bio() is only called by hydration_complete(), + * after having successfully updated the metadata. This means we don't + * need to call dm_clone_changed_this_transaction() to check if the + * metadata has changed and thus we can avoid taking the metadata spin + * lock. + */ + if (!(bio->bi_opf & REQ_FUA)) { + bio_endio(bio); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to commit the + * metadata, so we complete the bio with an error. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a single + * commit for them in process_deferred_flush_bios(). + */ + spin_lock_irqsave(&clone->lock, flags); + bio_list_add(&clone->deferred_flush_completions, bio); + spin_unlock_irqrestore(&clone->lock, flags); + + wake_worker(clone); +} + +static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) +{ + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); +} + +static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) +{ + unsigned long rs, nr_regions; + + /* + * If the destination device supports discards, remap and trim the + * discard bio and pass it down. Otherwise complete the bio + * immediately. + */ + if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { + remap_to_dest(clone, bio); + bio_region_range(clone, bio, &rs, &nr_regions); + trim_bio(bio, region_to_sector(clone, rs), + nr_regions << clone->region_shift); + submit_bio_noacct(bio); + } else + bio_endio(bio); +} + +static void process_discard_bio(struct clone *clone, struct bio *bio) +{ + unsigned long rs, nr_regions; + + bio_region_range(clone, bio, &rs, &nr_regions); + if (!nr_regions) { + bio_endio(bio); + return; + } + + if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs || + (rs + nr_regions) > clone->nr_regions)) { + DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)", + clone_device_name(clone), rs, nr_regions, + clone->nr_regions, + (unsigned long long)bio->bi_iter.bi_sector, + bio_sectors(bio)); + bio_endio(bio); + return; + } + + /* + * The covered regions are already hydrated so we just need to pass + * down the discard. + */ + if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) { + complete_discard_bio(clone, bio, true); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to update the + * metadata for the regions covered by the discard so we just ignore + * it. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_endio(bio); + return; + } + + /* + * Defer discard processing. + */ + spin_lock_irq(&clone->lock); + bio_list_add(&clone->deferred_discard_bios, bio); + spin_unlock_irq(&clone->lock); + + wake_worker(clone); +} + +/*---------------------------------------------------------------------------*/ + +/* + * dm-clone region hydrations. + */ +struct dm_clone_region_hydration { + struct clone *clone; + unsigned long region_nr; + + struct bio *overwrite_bio; + bio_end_io_t *overwrite_bio_end_io; + + struct bio_list deferred_bios; + + blk_status_t status; + + /* Used by hydration batching */ + struct list_head list; + + /* Used by hydration hash table */ + struct hlist_node h; +}; + +/* + * Hydration hash table implementation. + * + * Ideally we would like to use list_bl, which uses bit spin locks and employs + * the least significant bit of the list head to lock the corresponding bucket, + * reducing the memory overhead for the locks. But, currently, list_bl and bit + * spin locks don't support IRQ safe versions. Since we have to take the lock + * in both process and interrupt context, we must fall back to using regular + * spin locks; one per hash table bucket. + */ +struct hash_table_bucket { + struct hlist_head head; + + /* Spinlock protecting the bucket */ + spinlock_t lock; +}; + +#define bucket_lock_irqsave(bucket, flags) \ + spin_lock_irqsave(&(bucket)->lock, flags) + +#define bucket_unlock_irqrestore(bucket, flags) \ + spin_unlock_irqrestore(&(bucket)->lock, flags) + +#define bucket_lock_irq(bucket) \ + spin_lock_irq(&(bucket)->lock) + +#define bucket_unlock_irq(bucket) \ + spin_unlock_irq(&(bucket)->lock) + +static int hash_table_init(struct clone *clone) +{ + unsigned int i, sz; + struct hash_table_bucket *bucket; + + sz = 1 << HASH_TABLE_BITS; + + clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); + if (!clone->ht) + return -ENOMEM; + + for (i = 0; i < sz; i++) { + bucket = clone->ht + i; + + INIT_HLIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + return 0; +} + +static void hash_table_exit(struct clone *clone) +{ + kvfree(clone->ht); +} + +static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, + unsigned long region_nr) +{ + return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)]; +} + +/* + * Search hash table for a hydration with hd->region_nr == region_nr + * + * NOTE: Must be called with the bucket lock held + */ +static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, + unsigned long region_nr) +{ + struct dm_clone_region_hydration *hd; + + hlist_for_each_entry(hd, &bucket->head, h) { + if (hd->region_nr == region_nr) + return hd; + } + + return NULL; +} + +/* + * Insert a hydration into the hash table. + * + * NOTE: Must be called with the bucket lock held. + */ +static inline void __insert_region_hydration(struct hash_table_bucket *bucket, + struct dm_clone_region_hydration *hd) +{ + hlist_add_head(&hd->h, &bucket->head); +} + +/* + * This function inserts a hydration into the hash table, unless someone else + * managed to insert a hydration for the same region first. In the latter case + * it returns the existing hydration descriptor for this region. + * + * NOTE: Must be called with the hydration hash table lock held. + */ +static struct dm_clone_region_hydration * +__find_or_insert_region_hydration(struct hash_table_bucket *bucket, + struct dm_clone_region_hydration *hd) +{ + struct dm_clone_region_hydration *hd2; + + hd2 = __hash_find(bucket, hd->region_nr); + if (hd2) + return hd2; + + __insert_region_hydration(bucket, hd); + + return hd; +} + +/*---------------------------------------------------------------------------*/ + +/* Allocate a hydration */ +static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone) +{ + struct dm_clone_region_hydration *hd; + + /* + * Allocate a hydration from the hydration mempool. + * This might block but it can't fail. + */ + hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO); + hd->clone = clone; + + return hd; +} + +static inline void free_hydration(struct dm_clone_region_hydration *hd) +{ + mempool_free(hd, &hd->clone->hydration_pool); +} + +/* Initialize a hydration */ +static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr) +{ + hd->region_nr = region_nr; + hd->overwrite_bio = NULL; + bio_list_init(&hd->deferred_bios); + hd->status = 0; + + INIT_LIST_HEAD(&hd->list); + INIT_HLIST_NODE(&hd->h); +} + +/*---------------------------------------------------------------------------*/ + +/* + * Update dm-clone's metadata after a region has finished hydrating and remove + * hydration from the hash table. + */ +static int hydration_update_metadata(struct dm_clone_region_hydration *hd) +{ + int r = 0; + unsigned long flags; + struct hash_table_bucket *bucket; + struct clone *clone = hd->clone; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + r = -EPERM; + + /* Update the metadata */ + if (likely(!r) && hd->status == BLK_STS_OK) + r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr); + + bucket = get_hash_table_bucket(clone, hd->region_nr); + + /* Remove hydration from hash table */ + bucket_lock_irqsave(bucket, flags); + hlist_del(&hd->h); + bucket_unlock_irqrestore(bucket, flags); + + return r; +} + +/* + * Complete a region's hydration: + * + * 1. Update dm-clone's metadata. + * 2. Remove hydration from hash table. + * 3. Complete overwrite bio. + * 4. Issue deferred bios. + * 5. If this was the last hydration, wake up anyone waiting for + * hydrations to finish. + */ +static void hydration_complete(struct dm_clone_region_hydration *hd) +{ + int r; + blk_status_t status; + struct clone *clone = hd->clone; + + r = hydration_update_metadata(hd); + + if (hd->status == BLK_STS_OK && likely(!r)) { + if (hd->overwrite_bio) + complete_overwrite_bio(clone, hd->overwrite_bio); + + issue_deferred_bios(clone, &hd->deferred_bios); + } else { + status = r ? BLK_STS_IOERR : hd->status; + + if (hd->overwrite_bio) + bio_list_add(&hd->deferred_bios, hd->overwrite_bio); + + fail_bios(&hd->deferred_bios, status); + } + + free_hydration(hd); + + if (atomic_dec_and_test(&clone->hydrations_in_flight)) + wakeup_hydration_waiters(clone); +} + +static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context) +{ + blk_status_t status; + + struct dm_clone_region_hydration *tmp, *hd = context; + struct clone *clone = hd->clone; + + LIST_HEAD(batched_hydrations); + + if (read_err || write_err) { + DMERR_LIMIT("%s: hydration failed", clone_device_name(clone)); + status = BLK_STS_IOERR; + } else { + status = BLK_STS_OK; + } + list_splice_tail(&hd->list, &batched_hydrations); + + hd->status = status; + hydration_complete(hd); + + /* Complete batched hydrations */ + list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) { + hd->status = status; + hydration_complete(hd); + } + + /* Continue background hydration, if there is no I/O in-flight */ + if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && + !atomic_read(&clone->ios_in_flight)) + wake_worker(clone); +} + +static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) +{ + unsigned long region_start, region_end; + sector_t tail_size, region_size, total_size; + struct dm_io_region from, to; + struct clone *clone = hd->clone; + + if (WARN_ON(!nr_regions)) + return; + + region_size = clone->region_size; + region_start = hd->region_nr; + region_end = region_start + nr_regions - 1; + + total_size = region_to_sector(clone, nr_regions - 1); + + if (region_end == clone->nr_regions - 1) { + /* + * The last region of the target might be smaller than + * region_size. + */ + tail_size = clone->ti->len & (region_size - 1); + if (!tail_size) + tail_size = region_size; + } else { + tail_size = region_size; + } + + total_size += tail_size; + + from.bdev = clone->source_dev->bdev; + from.sector = region_to_sector(clone, region_start); + from.count = total_size; + + to.bdev = clone->dest_dev->bdev; + to.sector = from.sector; + to.count = from.count; + + /* Issue copy */ + atomic_add(nr_regions, &clone->hydrations_in_flight); + dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, + hydration_kcopyd_callback, hd); +} + +static void overwrite_endio(struct bio *bio) +{ + struct dm_clone_region_hydration *hd = bio->bi_private; + + bio->bi_end_io = hd->overwrite_bio_end_io; + hd->status = bio->bi_status; + + hydration_complete(hd); +} + +static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio) +{ + /* + * We don't need to save and restore bio->bi_private because device + * mapper core generates a new bio for us to use, with clean + * bi_private. + */ + hd->overwrite_bio = bio; + hd->overwrite_bio_end_io = bio->bi_end_io; + + bio->bi_end_io = overwrite_endio; + bio->bi_private = hd; + + atomic_inc(&hd->clone->hydrations_in_flight); + submit_bio_noacct(bio); +} + +/* + * Hydrate bio's region. + * + * This function starts the hydration of the bio's region and puts the bio in + * the list of deferred bios for this region. In case, by the time this + * function is called, the region has finished hydrating it's submitted to the + * destination device. + * + * NOTE: The bio remapping must be performed by the caller. + */ +static void hydrate_bio_region(struct clone *clone, struct bio *bio) +{ + unsigned long region_nr; + struct hash_table_bucket *bucket; + struct dm_clone_region_hydration *hd, *hd2; + + region_nr = bio_to_region(clone, bio); + bucket = get_hash_table_bucket(clone, region_nr); + + bucket_lock_irq(bucket); + + hd = __hash_find(bucket, region_nr); + if (hd) { + /* Someone else is hydrating the region */ + bio_list_add(&hd->deferred_bios, bio); + bucket_unlock_irq(bucket); + return; + } + + if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { + /* The region has been hydrated */ + bucket_unlock_irq(bucket); + issue_bio(clone, bio); + return; + } + + /* + * We must allocate a hydration descriptor and start the hydration of + * the corresponding region. + */ + bucket_unlock_irq(bucket); + + hd = alloc_hydration(clone); + hydration_init(hd, region_nr); + + bucket_lock_irq(bucket); + + /* Check if the region has been hydrated in the meantime. */ + if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { + bucket_unlock_irq(bucket); + free_hydration(hd); + issue_bio(clone, bio); + return; + } + + hd2 = __find_or_insert_region_hydration(bucket, hd); + if (hd2 != hd) { + /* Someone else started the region's hydration. */ + bio_list_add(&hd2->deferred_bios, bio); + bucket_unlock_irq(bucket); + free_hydration(hd); + return; + } + + /* + * If the metadata mode is RO or FAIL then there is no point starting a + * hydration, since we will not be able to update the metadata when the + * hydration finishes. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + hlist_del(&hd->h); + bucket_unlock_irq(bucket); + free_hydration(hd); + bio_io_error(bio); + return; + } + + /* + * Start region hydration. + * + * If a bio overwrites a region, i.e., its size is equal to the + * region's size, then we don't need to copy the region from the source + * to the destination device. + */ + if (is_overwrite_bio(clone, bio)) { + bucket_unlock_irq(bucket); + hydration_overwrite(hd, bio); + } else { + bio_list_add(&hd->deferred_bios, bio); + bucket_unlock_irq(bucket); + hydration_copy(hd, 1); + } +} + +/*---------------------------------------------------------------------------*/ + +/* + * Background hydrations. + */ + +/* + * Batch region hydrations. + * + * To better utilize device bandwidth we batch together the hydration of + * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which + * is good for small, random write performance (because of the overwriting of + * un-hydrated regions) and at the same time issue big copy requests to kcopyd + * to achieve high hydration bandwidth. + */ +struct batch_info { + struct dm_clone_region_hydration *head; + unsigned int nr_batched_regions; +}; + +static void __batch_hydration(struct batch_info *batch, + struct dm_clone_region_hydration *hd) +{ + struct clone *clone = hd->clone; + unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size); + + if (batch->head) { + /* Try to extend the current batch */ + if (batch->nr_batched_regions < max_batch_size && + (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) { + list_add_tail(&hd->list, &batch->head->list); + batch->nr_batched_regions++; + hd = NULL; + } + + /* Check if we should issue the current batch */ + if (batch->nr_batched_regions >= max_batch_size || hd) { + hydration_copy(batch->head, batch->nr_batched_regions); + batch->head = NULL; + batch->nr_batched_regions = 0; + } + } + + if (!hd) + return; + + /* We treat max batch sizes of zero and one equivalently */ + if (max_batch_size <= 1) { + hydration_copy(hd, 1); + return; + } + + /* Start a new batch */ + BUG_ON(!list_empty(&hd->list)); + batch->head = hd; + batch->nr_batched_regions = 1; +} + +static unsigned long __start_next_hydration(struct clone *clone, + unsigned long offset, + struct batch_info *batch) +{ + struct hash_table_bucket *bucket; + struct dm_clone_region_hydration *hd; + unsigned long nr_regions = clone->nr_regions; + + hd = alloc_hydration(clone); + + /* Try to find a region to hydrate. */ + do { + offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset); + if (offset == nr_regions) + break; + + bucket = get_hash_table_bucket(clone, offset); + bucket_lock_irq(bucket); + + if (!dm_clone_is_region_hydrated(clone->cmd, offset) && + !__hash_find(bucket, offset)) { + hydration_init(hd, offset); + __insert_region_hydration(bucket, hd); + bucket_unlock_irq(bucket); + + /* Batch hydration */ + __batch_hydration(batch, hd); + + return (offset + 1); + } + + bucket_unlock_irq(bucket); + + } while (++offset < nr_regions); + + if (hd) + free_hydration(hd); + + return offset; +} + +/* + * This function searches for regions that still reside in the source device + * and starts their hydration. + */ +static void do_hydration(struct clone *clone) +{ + unsigned int current_volume; + unsigned long offset, nr_regions = clone->nr_regions; + + struct batch_info batch = { + .head = NULL, + .nr_batched_regions = 0, + }; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + return; + + if (dm_clone_is_hydration_done(clone->cmd)) + return; + + /* + * Avoid race with device suspension. + */ + atomic_inc(&clone->hydrations_in_flight); + + /* + * Make sure atomic_inc() is ordered before test_bit(), otherwise we + * might race with clone_postsuspend() and start a region hydration + * after the target has been suspended. + * + * This is paired with the smp_mb__after_atomic() in + * clone_postsuspend(). + */ + smp_mb__after_atomic(); + + offset = clone->hydration_offset; + while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) && + !atomic_read(&clone->ios_in_flight) && + test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && + offset < nr_regions) { + current_volume = atomic_read(&clone->hydrations_in_flight); + current_volume += batch.nr_batched_regions; + + if (current_volume > READ_ONCE(clone->hydration_threshold)) + break; + + offset = __start_next_hydration(clone, offset, &batch); + } + + if (batch.head) + hydration_copy(batch.head, batch.nr_batched_regions); + + if (offset >= nr_regions) + offset = 0; + + clone->hydration_offset = offset; + + if (atomic_dec_and_test(&clone->hydrations_in_flight)) + wakeup_hydration_waiters(clone); +} + +/*---------------------------------------------------------------------------*/ + +static bool need_commit_due_to_time(struct clone *clone) +{ + return !time_in_range(jiffies, clone->last_commit_jiffies, + clone->last_commit_jiffies + COMMIT_PERIOD); +} + +/* + * A non-zero return indicates read-only or fail mode. + */ +static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) +{ + int r = 0; + + if (dest_dev_flushed) + *dest_dev_flushed = false; + + mutex_lock(&clone->commit_lock); + + if (!dm_clone_changed_this_transaction(clone->cmd)) + goto out; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + r = -EPERM; + goto out; + } + + r = dm_clone_metadata_pre_commit(clone->cmd); + if (unlikely(r)) { + __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r); + goto out; + } + + r = blkdev_issue_flush(clone->dest_dev->bdev); + if (unlikely(r)) { + __metadata_operation_failed(clone, "flush destination device", r); + goto out; + } + + if (dest_dev_flushed) + *dest_dev_flushed = true; + + r = dm_clone_metadata_commit(clone->cmd); + if (unlikely(r)) { + __metadata_operation_failed(clone, "dm_clone_metadata_commit", r); + goto out; + } + + if (dm_clone_is_hydration_done(clone->cmd)) + dm_table_event(clone->ti->table); +out: + mutex_unlock(&clone->commit_lock); + + return r; +} + +static void process_deferred_discards(struct clone *clone) +{ + int r = -EPERM; + struct bio *bio; + struct blk_plug plug; + unsigned long rs, nr_regions; + struct bio_list discards = BIO_EMPTY_LIST; + + spin_lock_irq(&clone->lock); + bio_list_merge(&discards, &clone->deferred_discard_bios); + bio_list_init(&clone->deferred_discard_bios); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&discards)) + return; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + goto out; + + /* Update the metadata */ + bio_list_for_each(bio, &discards) { + bio_region_range(clone, bio, &rs, &nr_regions); + /* + * A discard request might cover regions that have been already + * hydrated. There is no need to update the metadata for these + * regions. + */ + r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions); + if (unlikely(r)) + break; + } +out: + blk_start_plug(&plug); + while ((bio = bio_list_pop(&discards))) + complete_discard_bio(clone, bio, r == 0); + blk_finish_plug(&plug); +} + +static void process_deferred_bios(struct clone *clone) +{ + struct bio_list bios = BIO_EMPTY_LIST; + + spin_lock_irq(&clone->lock); + bio_list_merge(&bios, &clone->deferred_bios); + bio_list_init(&clone->deferred_bios); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&bios)) + return; + + submit_bios(&bios); +} + +static void process_deferred_flush_bios(struct clone *clone) +{ + struct bio *bio; + bool dest_dev_flushed; + struct bio_list bios = BIO_EMPTY_LIST; + struct bio_list bio_completions = BIO_EMPTY_LIST; + + /* + * If there are any deferred flush bios, we must commit the metadata + * before issuing them or signaling their completion. + */ + spin_lock_irq(&clone->lock); + bio_list_merge(&bios, &clone->deferred_flush_bios); + bio_list_init(&clone->deferred_flush_bios); + + bio_list_merge(&bio_completions, &clone->deferred_flush_completions); + bio_list_init(&clone->deferred_flush_completions); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && + !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) + return; + + if (commit_metadata(clone, &dest_dev_flushed)) { + bio_list_merge(&bios, &bio_completions); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + + return; + } + + clone->last_commit_jiffies = jiffies; + + while ((bio = bio_list_pop(&bio_completions))) + bio_endio(bio); + + while ((bio = bio_list_pop(&bios))) { + if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) { + /* We just flushed the destination device as part of + * the metadata commit, so there is no reason to send + * another flush. + */ + bio_endio(bio); + } else { + submit_bio_noacct(bio); + } + } +} + +static void do_worker(struct work_struct *work) +{ + struct clone *clone = container_of(work, typeof(*clone), worker); + + process_deferred_bios(clone); + process_deferred_discards(clone); + + /* + * process_deferred_flush_bios(): + * + * - Commit metadata + * + * - Process deferred REQ_FUA completions + * + * - Process deferred REQ_PREFLUSH bios + */ + process_deferred_flush_bios(clone); + + /* Background hydration */ + do_hydration(clone); +} + +/* + * Commit periodically so that not too much unwritten data builds up. + * + * Also, restart background hydration, if it has been stopped by in-flight I/O. + */ +static void do_waker(struct work_struct *work) +{ + struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); + + wake_worker(clone); + queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); +} + +/*---------------------------------------------------------------------------*/ + +/* + * Target methods + */ +static int clone_map(struct dm_target *ti, struct bio *bio) +{ + struct clone *clone = ti->private; + unsigned long region_nr; + + atomic_inc(&clone->ios_in_flight); + + if (unlikely(get_clone_mode(clone) == CM_FAIL)) + return DM_MAPIO_KILL; + + /* + * REQ_PREFLUSH bios carry no data: + * + * - Commit metadata, if changed + * + * - Pass down to destination device + */ + if (bio->bi_opf & REQ_PREFLUSH) { + remap_and_issue(clone, bio); + return DM_MAPIO_SUBMITTED; + } + + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + + /* + * dm-clone interprets discards and performs a fast hydration of the + * discarded regions, i.e., we skip the copy from the source device and + * just mark the regions as hydrated. + */ + if (bio_op(bio) == REQ_OP_DISCARD) { + process_discard_bio(clone, bio); + return DM_MAPIO_SUBMITTED; + } + + /* + * If the bio's region is hydrated, redirect it to the destination + * device. + * + * If the region is not hydrated and the bio is a READ, redirect it to + * the source device. + * + * Else, defer WRITE bio until after its region has been hydrated and + * start the region's hydration immediately. + */ + region_nr = bio_to_region(clone, bio); + if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { + remap_and_issue(clone, bio); + return DM_MAPIO_SUBMITTED; + } else if (bio_data_dir(bio) == READ) { + remap_to_source(clone, bio); + return DM_MAPIO_REMAPPED; + } + + remap_to_dest(clone, bio); + hydrate_bio_region(clone, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) +{ + struct clone *clone = ti->private; + + atomic_dec(&clone->ios_in_flight); + + return DM_ENDIO_DONE; +} + +static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, + ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + unsigned int count; + + count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + + DMEMIT("%u ", count); + + if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) + DMEMIT("no_hydration "); + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) + DMEMIT("no_discard_passdown "); + + *sz_ptr = sz; +} + +static void emit_core_args(struct clone *clone, char *result, + unsigned int maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + unsigned int count = 4; + + DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, + READ_ONCE(clone->hydration_threshold), + READ_ONCE(clone->hydration_batch_size)); + + *sz_ptr = sz; +} + +/* + * Status format: + * + * <#used metadata blocks>/<#total metadata blocks> + * <#hydrated regions>/<#total regions> <#hydrating regions> + * <#features> * <#core args> * + */ +static void clone_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + int r; + unsigned int i; + ssize_t sz = 0; + dm_block_t nr_free_metadata_blocks = 0; + dm_block_t nr_metadata_blocks = 0; + char buf[BDEVNAME_SIZE]; + struct clone *clone = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + if (get_clone_mode(clone) == CM_FAIL) { + DMEMIT("Fail"); + break; + } + + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) + (void) commit_metadata(clone, NULL); + + r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks); + + if (r) { + DMERR("%s: dm_clone_get_free_metadata_block_count returned %d", + clone_device_name(clone), r); + goto error; + } + + r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks); + + if (r) { + DMERR("%s: dm_clone_get_metadata_dev_size returned %d", + clone_device_name(clone), r); + goto error; + } + + DMEMIT("%u %llu/%llu %llu %u/%lu %u ", + DM_CLONE_METADATA_BLOCK_SIZE, + (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), + (unsigned long long)nr_metadata_blocks, + (unsigned long long)clone->region_size, + dm_clone_nr_of_hydrated_regions(clone->cmd), + clone->nr_regions, + atomic_read(&clone->hydrations_in_flight)); + + emit_flags(clone, result, maxlen, &sz); + emit_core_args(clone, result, maxlen, &sz); + + switch (get_clone_mode(clone)) { + case CM_WRITE: + DMEMIT("rw"); + break; + case CM_READ_ONLY: + DMEMIT("ro"); + break; + case CM_FAIL: + DMEMIT("Fail"); + } + + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, clone->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + + format_dev_t(buf, clone->dest_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + + format_dev_t(buf, clone->source_dev->bdev->bd_dev); + DMEMIT("%s", buf); + + for (i = 0; i < clone->nr_ctr_args; i++) + DMEMIT(" %s", clone->ctr_args[i]); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return; + +error: + DMEMIT("Error"); +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return bdev_nr_sectors(dev->bdev); +} + +/*---------------------------------------------------------------------------*/ + +/* + * Construct a clone device mapping: + * + * clone + * [<#feature args> []* [<#core args> [key value]*]] + * + * metadata dev: Fast device holding the persistent metadata + * destination dev: The destination device, which will become a clone of the + * source device + * source dev: The read-only source device that gets cloned + * region size: dm-clone unit size in sectors + * + * #feature args: Number of feature arguments passed + * feature args: E.g. no_hydration, no_discard_passdown + * + * #core arguments: An even number of core arguments + * core arguments: Key/value pairs for tuning the core + * E.g. 'hydration_threshold 256' + */ +static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) +{ + int r; + unsigned int argc; + const char *arg_name; + struct dm_target *ti = clone->ti; + + const struct dm_arg args = { + .min = 0, + .max = 2, + .error = "Invalid number of feature arguments" + }; + + /* No feature arguments supplied */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(&args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "no_hydration")) { + __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + } else { + ti->error = "Invalid feature argument"; + return -EINVAL; + } + } + + return 0; +} + +static int parse_core_args(struct dm_arg_set *as, struct clone *clone) +{ + int r; + unsigned int argc; + unsigned int value; + const char *arg_name; + struct dm_target *ti = clone->ti; + + const struct dm_arg args = { + .min = 0, + .max = 4, + .error = "Invalid number of core arguments" + }; + + /* Initialize core arguments */ + clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE; + clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD; + + /* No core arguments supplied */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(&args, as, &argc, &ti->error); + if (r) + return r; + + if (argc & 1) { + ti->error = "Number of core arguments must be even"; + return -EINVAL; + } + + while (argc) { + arg_name = dm_shift_arg(as); + argc -= 2; + + if (!strcasecmp(arg_name, "hydration_threshold")) { + if (kstrtouint(dm_shift_arg(as), 10, &value)) { + ti->error = "Invalid value for argument `hydration_threshold'"; + return -EINVAL; + } + clone->hydration_threshold = value; + } else if (!strcasecmp(arg_name, "hydration_batch_size")) { + if (kstrtouint(dm_shift_arg(as), 10, &value)) { + ti->error = "Invalid value for argument `hydration_batch_size'"; + return -EINVAL; + } + clone->hydration_batch_size = value; + } else { + ti->error = "Invalid core argument"; + return -EINVAL; + } + } + + return 0; +} + +static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + unsigned int region_size; + struct dm_arg arg; + + arg.min = MIN_REGION_SIZE; + arg.max = MAX_REGION_SIZE; + arg.error = "Invalid region size"; + + r = dm_read_arg(&arg, as, ®ion_size, error); + if (r) + return r; + + /* Check region size is a power of 2 */ + if (!is_power_of_2(region_size)) { + *error = "Region size is not a power of 2"; + return -EINVAL; + } + + /* Validate the region size against the device logical block size */ + if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || + region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { + *error = "Region size is not a multiple of device logical block size"; + return -EINVAL; + } + + clone->region_size = region_size; + + return 0; +} + +static int validate_nr_regions(unsigned long n, char **error) +{ + /* + * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us + * further to 2^31 regions. + */ + if (n > (1UL << 31)) { + *error = "Too many regions. Consider increasing the region size"; + return -EINVAL; + } + + return 0; +} + +static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t metadata_dev_size; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &clone->metadata_dev); + if (r) { + *error = "Error opening metadata device"; + return r; + } + + metadata_dev_size = get_dev_size(clone->metadata_dev); + if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", + clone->metadata_dev->bdev, DM_CLONE_METADATA_MAX_SECTORS); + + return 0; +} + +static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t dest_dev_size; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &clone->dest_dev); + if (r) { + *error = "Error opening destination device"; + return r; + } + + dest_dev_size = get_dev_size(clone->dest_dev); + if (dest_dev_size < clone->ti->len) { + dm_put_device(clone->ti, clone->dest_dev); + *error = "Device size larger than destination device"; + return -EINVAL; + } + + return 0; +} + +static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t source_dev_size; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, + &clone->source_dev); + if (r) { + *error = "Error opening source device"; + return r; + } + + source_dev_size = get_dev_size(clone->source_dev); + if (source_dev_size < clone->ti->len) { + dm_put_device(clone->ti, clone->source_dev); + *error = "Device size larger than source device"; + return -EINVAL; + } + + return 0; +} + +static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) +{ + unsigned int i; + const char **copy; + + copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); + if (!copy) + goto error; + + for (i = 0; i < argc; i++) { + copy[i] = kstrdup(argv[i], GFP_KERNEL); + + if (!copy[i]) { + while (i--) + kfree(copy[i]); + kfree(copy); + goto error; + } + } + + clone->nr_ctr_args = argc; + clone->ctr_args = copy; + return 0; + +error: + *error = "Failed to allocate memory for table line"; + return -ENOMEM; +} + +static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + sector_t nr_regions; + struct clone *clone; + struct dm_arg_set as; + + if (argc < 4) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + as.argc = argc; + as.argv = argv; + + clone = kzalloc(sizeof(*clone), GFP_KERNEL); + if (!clone) { + ti->error = "Failed to allocate clone structure"; + return -ENOMEM; + } + + clone->ti = ti; + + /* Initialize dm-clone flags */ + __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + + r = parse_metadata_dev(clone, &as, &ti->error); + if (r) + goto out_with_clone; + + r = parse_dest_dev(clone, &as, &ti->error); + if (r) + goto out_with_meta_dev; + + r = parse_source_dev(clone, &as, &ti->error); + if (r) + goto out_with_dest_dev; + + r = parse_region_size(clone, &as, &ti->error); + if (r) + goto out_with_source_dev; + + clone->region_shift = __ffs(clone->region_size); + nr_regions = dm_sector_div_up(ti->len, clone->region_size); + + /* Check for overflow */ + if (nr_regions != (unsigned long)nr_regions) { + ti->error = "Too many regions. Consider increasing the region size"; + r = -EOVERFLOW; + goto out_with_source_dev; + } + + clone->nr_regions = nr_regions; + + r = validate_nr_regions(clone->nr_regions, &ti->error); + if (r) + goto out_with_source_dev; + + r = dm_set_target_max_io_len(ti, clone->region_size); + if (r) { + ti->error = "Failed to set max io len"; + goto out_with_source_dev; + } + + r = parse_feature_args(&as, clone); + if (r) + goto out_with_source_dev; + + r = parse_core_args(&as, clone); + if (r) + goto out_with_source_dev; + + /* Load metadata */ + clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len, + clone->region_size); + if (IS_ERR(clone->cmd)) { + ti->error = "Failed to load metadata"; + r = PTR_ERR(clone->cmd); + goto out_with_source_dev; + } + + __set_clone_mode(clone, CM_WRITE); + + if (get_clone_mode(clone) != CM_WRITE) { + ti->error = "Unable to get write access to metadata, please check/repair metadata"; + r = -EPERM; + goto out_with_metadata; + } + + clone->last_commit_jiffies = jiffies; + + /* Allocate hydration hash table */ + r = hash_table_init(clone); + if (r) { + ti->error = "Failed to allocate hydration hash table"; + goto out_with_metadata; + } + + atomic_set(&clone->ios_in_flight, 0); + init_waitqueue_head(&clone->hydration_stopped); + spin_lock_init(&clone->lock); + bio_list_init(&clone->deferred_bios); + bio_list_init(&clone->deferred_discard_bios); + bio_list_init(&clone->deferred_flush_bios); + bio_list_init(&clone->deferred_flush_completions); + clone->hydration_offset = 0; + atomic_set(&clone->hydrations_in_flight, 0); + + clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); + if (!clone->wq) { + ti->error = "Failed to allocate workqueue"; + r = -ENOMEM; + goto out_with_ht; + } + + INIT_WORK(&clone->worker, do_worker); + INIT_DELAYED_WORK(&clone->waker, do_waker); + + clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(clone->kcopyd_client)) { + r = PTR_ERR(clone->kcopyd_client); + goto out_with_wq; + } + + r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS, + _hydration_cache); + if (r) { + ti->error = "Failed to create dm_clone_region_hydration memory pool"; + goto out_with_kcopyd; + } + + /* Save a copy of the table line */ + r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error); + if (r) + goto out_with_mempool; + + mutex_init(&clone->commit_lock); + + /* Enable flushes */ + ti->num_flush_bios = 1; + ti->flush_supported = true; + + /* Enable discards */ + ti->discards_supported = true; + ti->num_discard_bios = 1; + + ti->private = clone; + + return 0; + +out_with_mempool: + mempool_exit(&clone->hydration_pool); +out_with_kcopyd: + dm_kcopyd_client_destroy(clone->kcopyd_client); +out_with_wq: + destroy_workqueue(clone->wq); +out_with_ht: + hash_table_exit(clone); +out_with_metadata: + dm_clone_metadata_close(clone->cmd); +out_with_source_dev: + dm_put_device(ti, clone->source_dev); +out_with_dest_dev: + dm_put_device(ti, clone->dest_dev); +out_with_meta_dev: + dm_put_device(ti, clone->metadata_dev); +out_with_clone: + kfree(clone); + + return r; +} + +static void clone_dtr(struct dm_target *ti) +{ + unsigned int i; + struct clone *clone = ti->private; + + mutex_destroy(&clone->commit_lock); + + for (i = 0; i < clone->nr_ctr_args; i++) + kfree(clone->ctr_args[i]); + kfree(clone->ctr_args); + + mempool_exit(&clone->hydration_pool); + dm_kcopyd_client_destroy(clone->kcopyd_client); + cancel_delayed_work_sync(&clone->waker); + destroy_workqueue(clone->wq); + hash_table_exit(clone); + dm_clone_metadata_close(clone->cmd); + dm_put_device(ti, clone->source_dev); + dm_put_device(ti, clone->dest_dev); + dm_put_device(ti, clone->metadata_dev); + + kfree(clone); +} + +/*---------------------------------------------------------------------------*/ + +static void clone_postsuspend(struct dm_target *ti) +{ + struct clone *clone = ti->private; + + /* + * To successfully suspend the device: + * + * - We cancel the delayed work for periodic commits and wait for + * it to finish. + * + * - We stop the background hydration, i.e. we prevent new region + * hydrations from starting. + * + * - We wait for any in-flight hydrations to finish. + * + * - We flush the workqueue. + * + * - We commit the metadata. + */ + cancel_delayed_work_sync(&clone->waker); + + set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + + /* + * Make sure set_bit() is ordered before atomic_read(), otherwise we + * might race with do_hydration() and miss some started region + * hydrations. + * + * This is paired with smp_mb__after_atomic() in do_hydration(). + */ + smp_mb__after_atomic(); + + wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight)); + flush_workqueue(clone->wq); + + (void) commit_metadata(clone, NULL); +} + +static void clone_resume(struct dm_target *ti) +{ + struct clone *clone = ti->private; + + clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + do_waker(&clone->waker.work); +} + +/* + * If discard_passdown was enabled verify that the destination device supports + * discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct clone *clone) +{ + struct block_device *dest_dev = clone->dest_dev->bdev; + struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; + const char *reason = NULL; + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) + return; + + if (!bdev_max_discard_sectors(dest_dev)) + reason = "discard unsupported"; + else if (dest_limits->max_discard_sectors < clone->region_size) + reason = "max discard sectors smaller than a region"; + + if (reason) { + DMWARN("Destination device (%pg) %s: Disabling discard passdown.", + dest_dev, reason); + clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + } +} + +static void set_discard_limits(struct clone *clone, struct queue_limits *limits) +{ + struct block_device *dest_bdev = clone->dest_dev->bdev; + struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { + /* No passdown is done so we set our own virtual limits */ + limits->discard_granularity = clone->region_size << SECTOR_SHIFT; + limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); + return; + } + + /* + * clone_iterate_devices() is stacking both the source and destination + * device limits but discards aren't passed to the source device, so + * inherit destination's limits. + */ + limits->max_discard_sectors = dest_limits->max_discard_sectors; + limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; + limits->discard_granularity = dest_limits->discard_granularity; + limits->discard_alignment = dest_limits->discard_alignment; + limits->discard_misaligned = dest_limits->discard_misaligned; + limits->max_discard_segments = dest_limits->max_discard_segments; +} + +static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct clone *clone = ti->private; + u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with + * dm-clone's region size (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < clone->region_size || + do_div(io_opt_sectors, clone->region_size)) { + blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); + blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); + } + + disable_passdown_if_not_supported(clone); + set_discard_limits(clone, limits); +} + +static int clone_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + int ret; + struct clone *clone = ti->private; + struct dm_dev *dest_dev = clone->dest_dev; + struct dm_dev *source_dev = clone->source_dev; + + ret = fn(ti, source_dev, 0, ti->len, data); + if (!ret) + ret = fn(ti, dest_dev, 0, ti->len, data); + return ret; +} + +/* + * dm-clone message functions. + */ +static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions) +{ + WRITE_ONCE(clone->hydration_threshold, nr_regions); + + /* + * If user space sets hydration_threshold to zero then the hydration + * will stop. If at a later time the hydration_threshold is increased + * we must restart the hydration process by waking up the worker. + */ + wake_worker(clone); +} + +static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions) +{ + WRITE_ONCE(clone->hydration_batch_size, nr_regions); +} + +static void enable_hydration(struct clone *clone) +{ + if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) + wake_worker(clone); +} + +static void disable_hydration(struct clone *clone) +{ + clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); +} + +static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct clone *clone = ti->private; + unsigned int value; + + if (!argc) + return -EINVAL; + + if (!strcasecmp(argv[0], "enable_hydration")) { + enable_hydration(clone); + return 0; + } + + if (!strcasecmp(argv[0], "disable_hydration")) { + disable_hydration(clone); + return 0; + } + + if (argc != 2) + return -EINVAL; + + if (!strcasecmp(argv[0], "hydration_threshold")) { + if (kstrtouint(argv[1], 10, &value)) + return -EINVAL; + + set_hydration_threshold(clone, value); + + return 0; + } + + if (!strcasecmp(argv[0], "hydration_batch_size")) { + if (kstrtouint(argv[1], 10, &value)) + return -EINVAL; + + set_hydration_batch_size(clone, value); + + return 0; + } + + DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); + return -EINVAL; +} + +static struct target_type clone_target = { + .name = "clone", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = clone_ctr, + .dtr = clone_dtr, + .map = clone_map, + .end_io = clone_endio, + .postsuspend = clone_postsuspend, + .resume = clone_resume, + .status = clone_status, + .message = clone_message, + .io_hints = clone_io_hints, + .iterate_devices = clone_iterate_devices, +}; + +/*---------------------------------------------------------------------------*/ + +/* Module functions */ +static int __init dm_clone_init(void) +{ + int r; + + _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); + if (!_hydration_cache) + return -ENOMEM; + + r = dm_register_target(&clone_target); + if (r < 0) { + DMERR("Failed to register clone target"); + kmem_cache_destroy(_hydration_cache); + return r; + } + + return 0; +} + +static void __exit dm_clone_exit(void) +{ + dm_unregister_target(&clone_target); + + kmem_cache_destroy(_hydration_cache); + _hydration_cache = NULL; +} + +/* Module hooks */ +module_init(dm_clone_init); +module_exit(dm_clone_exit); + +MODULE_DESCRIPTION(DM_NAME " clone target"); +MODULE_AUTHOR("Nikos Tsironis "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h new file mode 100644 index 000000000..71dcd8fd4 --- /dev/null +++ b/drivers/md/dm-core.h @@ -0,0 +1,341 @@ +/* + * Internal header file _only_ for device mapper core + * + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_CORE_INTERNAL_H +#define DM_CORE_INTERNAL_H + +#include +#include +#include +#include +#include + +#include + +#include "dm.h" +#include "dm-ima.h" + +#define DM_RESERVED_MAX_IOS 1024 + +struct dm_io; + +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +/* + * DM core internal structures used directly by dm.c, dm-rq.c and dm-table.c. + * DM targets must _not_ deference a mapped_device or dm_table to directly + * access their members! + */ + +/* + * For mempools pre-allocation at the table loading time. + */ +struct dm_md_mempools { + struct bio_set bs; + struct bio_set io_bs; +}; + +struct mapped_device { + struct mutex suspend_lock; + + struct mutex table_devices_lock; + struct list_head table_devices; + + /* + * The current mapping (struct dm_table *). + * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. + */ + void __rcu *map; + + unsigned long flags; + + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + enum dm_queue_mode type; + + int numa_node_id; + struct request_queue *queue; + + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; + struct target_type *immutable_target_type; + + char name[16]; + struct gendisk *disk; + struct dax_device *dax_dev; + + wait_queue_head_t wait; + unsigned long __percpu *pending_io; + + /* forced geometry settings */ + struct hd_geometry geometry; + + /* + * Processing queue (flush) + */ + struct workqueue_struct *wq; + + /* + * A list of ios that arrived while we were suspended. + */ + struct work_struct work; + spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * requeue work context is needed for cloning one new bio + * to represent the dm_io to be requeued, since each + * dm_io may point to the original bio from FS. + */ + struct work_struct requeue_work; + struct dm_io *requeue_list; + + void *interface_ptr; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* for blk-mq request-based DM support */ + bool init_tio_pdu:1; + struct blk_mq_tag_set *tag_set; + + struct dm_stats stats; + + /* the number of internal suspends */ + unsigned int internal_suspend_count; + + int swap_bios; + struct semaphore swap_bios_semaphore; + struct mutex swap_bios_lock; + + /* + * io objects are allocated from here. + */ + struct dm_md_mempools *mempools; + + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; + + struct srcu_struct io_barrier; + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_zones; + unsigned int *zwp_offset; +#endif + +#ifdef CONFIG_IMA + struct dm_ima_measurements ima; +#endif +}; + +/* + * Bits for the flags field of struct mapped_device. + */ +#define DMF_BLOCK_IO_FOR_SUSPEND 0 +#define DMF_SUSPENDED 1 +#define DMF_FROZEN 2 +#define DMF_FREEING 3 +#define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_DEFERRED_REMOVE 6 +#define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 +#define DMF_EMULATE_ZONE_APPEND 9 + +void disable_discard(struct mapped_device *md); +void disable_write_zeroes(struct mapped_device *md); + +static inline sector_t dm_get_size(struct mapped_device *md) +{ + return get_capacity(md->disk); +} + +static inline struct dm_stats *dm_get_stats(struct mapped_device *md) +{ + return &md->stats; +} + +DECLARE_STATIC_KEY_FALSE(stats_enabled); +DECLARE_STATIC_KEY_FALSE(swap_bios_enabled); +DECLARE_STATIC_KEY_FALSE(zoned_enabled); + +static inline bool dm_emulate_zone_append(struct mapped_device *md) +{ + if (blk_queue_is_zoned(md->queue)) + return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + return false; +} + +#define DM_TABLE_MAX_DEPTH 16 + +struct dm_table { + struct mapped_device *md; + enum dm_queue_mode type; + + /* btree table */ + unsigned int depth; + unsigned int counts[DM_TABLE_MAX_DEPTH]; /* in nodes */ + sector_t *index[DM_TABLE_MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + struct target_type *immutable_target_type; + + bool integrity_supported:1; + bool singleton:1; + unsigned integrity_added:1; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. + */ + fmode_t mode; + + /* a list of devices used by this table */ + struct list_head devices; + struct rw_semaphore devices_lock; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; + + struct dm_md_mempools *mempools; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct blk_crypto_profile *crypto_profile; +#endif +}; + +static inline struct dm_target *dm_table_get_target(struct dm_table *t, + unsigned int index) +{ + BUG_ON(index >= t->num_targets); + return t->targets + index; +} + +/* + * One of these is allocated per clone bio. + */ +#define DM_TIO_MAGIC 28714 +struct dm_target_io { + unsigned short magic; + blk_short_t flags; + unsigned int target_bio_nr; + struct dm_io *io; + struct dm_target *ti; + unsigned int *len_ptr; + sector_t old_sector; + struct bio clone; +}; +#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) +#define DM_IO_BIO_OFFSET \ + (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) + +/* + * dm_target_io flags + */ +enum { + DM_TIO_INSIDE_DM_IO, + DM_TIO_IS_DUPLICATE_BIO +}; + +static inline bool dm_tio_flagged(struct dm_target_io *tio, unsigned int bit) +{ + return (tio->flags & (1U << bit)) != 0; +} + +static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) +{ + tio->flags |= (1U << bit); +} + +static inline bool dm_tio_is_normal(struct dm_target_io *tio) +{ + return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && + !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); +} + +/* + * One of these is allocated per original bio. + * It contains the first clone used for that original. + */ +#define DM_IO_MAGIC 19577 +struct dm_io { + unsigned short magic; + blk_short_t flags; + spinlock_t lock; + unsigned long start_time; + void *data; + struct dm_io *next; + struct dm_stats_aux stats_aux; + blk_status_t status; + atomic_t io_count; + struct mapped_device *md; + + /* The three fields represent mapped part of original bio */ + struct bio *orig_bio; + unsigned int sector_offset; /* offset to end of orig_bio */ + unsigned int sectors; + + /* last member of dm_target_io is 'struct bio' */ + struct dm_target_io tio; +}; + +/* + * dm_io flags + */ +enum { + DM_IO_ACCOUNTED, + DM_IO_WAS_SPLIT +}; + +static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) +{ + return (io->flags & (1U << bit)) != 0; +} + +static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) +{ + io->flags |= (1U << bit); +} + +void dm_io_rewind(struct dm_io *io, struct bio_set *bs); + +static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + +unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max); + +static inline bool dm_message_test_buffer_overflow(char *result, unsigned int maxlen) +{ + return !maxlen || strlen(result) + 1 >= maxlen; +} + +extern atomic_t dm_global_event_nr; +extern wait_queue_head_t dm_global_eventq; +void dm_issue_global_event(void); + +#endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c new file mode 100644 index 000000000..ff515437d --- /dev/null +++ b/drivers/md/dm-crypt.c @@ -0,0 +1,3678 @@ +/* + * Copyright (C) 2003 Jana Saout + * Copyright (C) 2004 Clemens Fruhwirth + * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013-2020 Milan Broz + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for struct rtattr and RTA macros only */ +#include +#include +#include +#include + +#include + +#include "dm-audit.h" + +#define DM_MSG_PREFIX "crypt" + +/* + * context holding the current state of a multi-part conversion + */ +struct convert_context { + struct completion restart; + struct bio *bio_in; + struct bio *bio_out; + struct bvec_iter iter_in; + struct bvec_iter iter_out; + u64 cc_sector; + atomic_t cc_pending; + union { + struct skcipher_request *req; + struct aead_request *req_aead; + } r; + +}; + +/* + * per bio private data + */ +struct dm_crypt_io { + struct crypt_config *cc; + struct bio *base_bio; + u8 *integrity_metadata; + bool integrity_metadata_from_pool:1; + bool in_tasklet:1; + + struct work_struct work; + struct tasklet_struct tasklet; + + struct convert_context ctx; + + atomic_t io_pending; + blk_status_t error; + sector_t sector; + + struct rb_node rb_node; +} CRYPTO_MINALIGN_ATTR; + +struct dm_crypt_request { + struct convert_context *ctx; + struct scatterlist sg_in[4]; + struct scatterlist sg_out[4]; + u64 iv_sector; +}; + +struct crypt_config; + +struct crypt_iv_operations { + int (*ctr)(struct crypt_config *cc, struct dm_target *ti, + const char *opts); + void (*dtr)(struct crypt_config *cc); + int (*init)(struct crypt_config *cc); + int (*wipe)(struct crypt_config *cc); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); +}; + +struct iv_benbi_private { + int shift; +}; + +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + +#define TCW_WHITENING_SIZE 16 +struct iv_tcw_private { + struct crypto_shash *crc32_tfm; + u8 *iv_seed; + u8 *whitening; +}; + +#define ELEPHANT_MAX_KEY_SIZE 32 +struct iv_elephant_private { + struct crypto_skcipher *tfm; +}; + +/* + * Crypt: maps a linear range of a block device + * and encrypts / decrypts at the same time. + */ +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE, + DM_CRYPT_WRITE_INLINE }; + +enum cipher_flags { + CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ + CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ + CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ +}; + +/* + * The fields in here must be read only after initialization. + */ +struct crypt_config { + struct dm_dev *dev; + sector_t start; + + struct percpu_counter n_allocated_pages; + + struct workqueue_struct *io_queue; + struct workqueue_struct *crypt_queue; + + spinlock_t write_thread_lock; + struct task_struct *write_thread; + struct rb_root write_tree; + + char *cipher_string; + char *cipher_auth; + char *key_string; + + const struct crypt_iv_operations *iv_gen_ops; + union { + struct iv_benbi_private benbi; + struct iv_lmk_private lmk; + struct iv_tcw_private tcw; + struct iv_elephant_private elephant; + } iv_gen_private; + u64 iv_offset; + unsigned int iv_size; + unsigned short sector_size; + unsigned char sector_shift; + + union { + struct crypto_skcipher **tfms; + struct crypto_aead **tfms_aead; + } cipher_tfm; + unsigned int tfms_count; + unsigned long cipher_flags; + + /* + * Layout of each crypto request: + * + * struct skcipher_request + * context + * padding + * struct dm_crypt_request + * padding + * IV + * + * The padding is added so that dm_crypt_request and the IV are + * correctly aligned. + */ + unsigned int dmreq_start; + + unsigned int per_bio_data_size; + + unsigned long flags; + unsigned int key_size; + unsigned int key_parts; /* independent parts in key buffer */ + unsigned int key_extra_size; /* additional keys length */ + unsigned int key_mac_size; /* MAC key size for authenc(...) */ + + unsigned int integrity_tag_size; + unsigned int integrity_iv_size; + unsigned int on_disk_tag_size; + + /* + * pool for per bio private data, crypto requests, + * encryption requeusts/buffer pages and integrity tags + */ + unsigned int tag_pool_max_sectors; + mempool_t tag_pool; + mempool_t req_pool; + mempool_t page_pool; + + struct bio_set bs; + struct mutex bio_alloc_lock; + + u8 *authenc_key; /* space for keys in authenc() format (if used) */ + u8 key[]; +}; + +#define MIN_IOS 64 +#define MAX_TAG_SIZE 480 +#define POOL_ENTRY_SIZE 512 + +static DEFINE_SPINLOCK(dm_crypt_clients_lock); +static unsigned int dm_crypt_clients_n = 0; +static volatile unsigned long dm_crypt_pages_per_client; +#define DM_CRYPT_MEMORY_PERCENT 2 +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) + +static void crypt_endio(struct bio *clone); +static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg); + +static bool crypt_integrity_aead(struct crypt_config *cc); + +/* + * Use this to access cipher attributes that are independent of the key. + */ +static struct crypto_skcipher *any_tfm(struct crypt_config *cc) +{ + return cc->cipher_tfm.tfms[0]; +} + +static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) +{ + return cc->cipher_tfm.tfms_aead[0]; +} + +/* + * Different IV generation algorithms: + * + * plain: the initial vector is the 32-bit little-endian version of the sector + * number, padded with zeros if necessary. + * + * plain64: the initial vector is the 64-bit little-endian version of the sector + * number, padded with zeros if necessary. + * + * plain64be: the initial vector is the 64-bit big-endian version of the sector + * number, padded with zeros if necessary. + * + * essiv: "encrypted sector|salt initial vector", the sector number is + * encrypted with the bulk cipher using a salt as key. The salt + * should be derived from the bulk cipher's key via hashing. + * + * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 + * (needed for LRW-32-AES and possible other narrow block modes) + * + * null: the initial vector is always zero. Provides compatibility with + * obsolete loop_fish2 devices. Do not use for new devices. + * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * + * tcw: Compatible implementation of the block chaining mode used + * by the TrueCrypt device encryption system (prior to version 4.1). + * For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from initial key and the sector number. + * In addition, whitening value is applied on every sector, whitening + * is calculated from initial key, sector number and mixed using CRC32. + * Note that this encryption scheme is vulnerable to watermarking attacks + * and should be used for old compatible containers access only. + * + * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode) + * The IV is encrypted little-endian byte-offset (with the same key + * and cipher as the volume). + * + * elephant: The extended version of eboiv with additional Elephant diffuser + * used with Bitlocker CBC mode. + * This mode was used in older Windows systems + * https://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf + */ + +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); + + return 0; +} + +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + + return 0; +} + +static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + /* iv_size is at least of size u64; usually it is 16 bytes */ + *(__be64 *)&iv[cc->iv_size - sizeof(u64)] = cpu_to_be64(dmreq->iv_sector); + + return 0; +} + +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + /* + * ESSIV encryption of the IV is now handled by the crypto API, + * so just pass the plain sector number here. + */ + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + + return 0; +} + +static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + unsigned int bs; + int log; + + if (crypt_integrity_aead(cc)) + bs = crypto_aead_blocksize(any_tfm_aead(cc)); + else + bs = crypto_skcipher_blocksize(any_tfm(cc)); + log = ilog2(bs); + + /* we need to calculate how far we must shift the sector count + * to get the cipher block count, we use this shift in _gen */ + + if (1 << log != bs) { + ti->error = "cypher blocksize is not a power of 2"; + return -EINVAL; + } + + if (log > 9) { + ti->error = "cypher blocksize is > 512"; + return -EINVAL; + } + + cc->iv_gen_private.benbi.shift = 9 - log; + + return 0; +} + +static void crypt_iv_benbi_dtr(struct crypt_config *cc) +{ +} + +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + __be64 val; + + memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ + + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); + put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); + + return 0; +} + +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + + return 0; +} + +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kfree_sensitive(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for LMK"; + return -EINVAL; + } + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); + struct md5_state md5state; + __le32 buf[4]; + int i, r; + + desc->tfm = lmk->hash_tfm; + + r = crypto_shash_init(desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct scatterlist *sg; + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); + kunmap_atomic(src); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct scatterlist *sg; + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + sg->offset, iv, cc->iv_size); + + kunmap_atomic(dst); + return r; +} + +static void crypt_iv_tcw_dtr(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + kfree_sensitive(tcw->iv_seed); + tcw->iv_seed = NULL; + kfree_sensitive(tcw->whitening); + tcw->whitening = NULL; + + if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) + crypto_free_shash(tcw->crc32_tfm); + tcw->crc32_tfm = NULL; +} + +static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for TCW"; + return -EINVAL; + } + + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { + ti->error = "Wrong key size for TCW"; + return -EINVAL; + } + + tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(tcw->crc32_tfm)) { + ti->error = "Error initializing CRC32 in TCW"; + return PTR_ERR(tcw->crc32_tfm); + } + + tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); + tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); + if (!tcw->iv_seed || !tcw->whitening) { + crypt_iv_tcw_dtr(cc); + ti->error = "Error allocating seed storage in TCW"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_tcw_init(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE; + + memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size); + memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size], + TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_wipe(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + memset(tcw->iv_seed, 0, cc->iv_size); + memset(tcw->whitening, 0, TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + __le64 sector = cpu_to_le64(dmreq->iv_sector); + u8 buf[TCW_WHITENING_SIZE]; + SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); + int i, r; + + /* xor whitening with sector number */ + crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); + crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); + + /* calculate crc32 for every 32bit part and xor it */ + desc->tfm = tcw->crc32_tfm; + for (i = 0; i < 4; i++) { + r = crypto_shash_init(desc); + if (r) + goto out; + r = crypto_shash_update(desc, &buf[i * 4], 4); + if (r) + goto out; + r = crypto_shash_final(desc, &buf[i * 4]); + if (r) + goto out; + } + crypto_xor(&buf[0], &buf[12], 4); + crypto_xor(&buf[4], &buf[8], 4); + + /* apply whitening (8 bytes) to whole sector */ + for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) + crypto_xor(data + i * 8, buf, 8); +out: + memzero_explicit(buf, sizeof(buf)); + return r; +} + +static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct scatterlist *sg; + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + __le64 sector = cpu_to_le64(dmreq->iv_sector); + u8 *src; + int r = 0; + + /* Remove whitening from ciphertext */ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + kunmap_atomic(src); + } + + /* Calculate IV */ + crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)§or, 8); + if (cc->iv_size > 8) + crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, + cc->iv_size - 8); + + return r; +} + +static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct scatterlist *sg; + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return 0; + + /* Apply whitening on ciphertext */ + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + kunmap_atomic(dst); + + return r; +} + +static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + /* Used only for writes, there must be an additional space to store IV */ + get_random_bytes(iv, cc->iv_size); + return 0; +} + +static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + if (crypt_integrity_aead(cc)) { + ti->error = "AEAD transforms not supported for EBOIV"; + return -EINVAL; + } + + if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size) { + ti->error = "Block size of EBOIV cipher does not match IV size of block cipher"; + return -EINVAL; + } + + return 0; +} + +static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64)); + struct skcipher_request *req; + struct scatterlist src, dst; + DECLARE_CRYPTO_WAIT(wait); + int err; + + req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO); + if (!req) + return -ENOMEM; + + memset(buf, 0, cc->iv_size); + *(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + + sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size); + sg_init_one(&dst, iv, cc->iv_size); + skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf); + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + + return err; +} + +static void crypt_iv_elephant_dtr(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + + crypto_free_skcipher(elephant->tfm); + elephant->tfm = NULL; +} + +static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int r; + + elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(elephant->tfm)) { + r = PTR_ERR(elephant->tfm); + elephant->tfm = NULL; + return r; + } + + r = crypt_iv_eboiv_ctr(cc, ti, NULL); + if (r) + crypt_iv_elephant_dtr(cc); + return r; +} + +static void diffuser_disk_to_cpu(u32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = le32_to_cpu((__le32)d[i]); +#endif +} + +static void diffuser_cpu_to_disk(__le32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = cpu_to_le32((u32)d[i]); +#endif +} + +static void diffuser_a_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = 0; + i2 = n - 2; + i3 = n - 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1++; i2++; i3++; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + } + } +} + +static void diffuser_a_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = n - 1; + i2 = n - 2 - 1; + i3 = n - 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1--; i2--; i3--; + } + } +} + +static void diffuser_b_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = 0; + i2 = 2; + i3 = 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1++; i2++; i3++; + } + } +} + +static void diffuser_b_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = n - 1; + i2 = 2 - 1; + i3 = 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + } + } +} + +static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 *es, *ks, *data, *data2, *data_offset; + struct skcipher_request *req; + struct scatterlist *sg, *sg2, src, dst; + DECLARE_CRYPTO_WAIT(wait); + int i, r; + + req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); + es = kzalloc(16, GFP_NOIO); /* Key for AES */ + ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ + + if (!req || !es || !ks) { + r = -ENOMEM; + goto out; + } + + *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + + /* E(Ks, e(s)) */ + sg_init_one(&src, es, 16); + sg_init_one(&dst, ks, 16); + skcipher_request_set_crypt(req, &src, &dst, 16, NULL); + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + /* E(Ks, e'(s)) */ + es[15] = 0x80; + sg_init_one(&dst, &ks[16], 16); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + sg = crypt_get_sg_data(cc, dmreq->sg_out); + data = kmap_atomic(sg_page(sg)); + data_offset = data + sg->offset; + + /* Cannot modify original bio, copy to sg_out and apply Elephant to it */ + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + sg2 = crypt_get_sg_data(cc, dmreq->sg_in); + data2 = kmap_atomic(sg_page(sg2)); + memcpy(data_offset, data2 + sg2->offset, cc->sector_size); + kunmap_atomic(data2); + } + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + for (i = 0; i < (cc->sector_size / 32); i++) + crypto_xor(data_offset + i * 32, ks, 32); + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + kunmap_atomic(data); +out: + kfree_sensitive(ks); + kfree_sensitive(es); + skcipher_request_free(req); + return r; +} + +static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + r = crypt_iv_elephant(cc, dmreq); + if (r) + return r; + } + + return crypt_iv_eboiv_gen(cc, iv, dmreq); +} + +static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return crypt_iv_elephant(cc, dmreq); + + return 0; +} + +static int crypt_iv_elephant_init(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int key_offset = cc->key_size - cc->key_extra_size; + + return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); +} + +static int crypt_iv_elephant_wipe(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 key[ELEPHANT_MAX_KEY_SIZE]; + + memset(key, 0, cc->key_extra_size); + return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); +} + +static const struct crypt_iv_operations crypt_iv_plain_ops = { + .generator = crypt_iv_plain_gen +}; + +static const struct crypt_iv_operations crypt_iv_plain64_ops = { + .generator = crypt_iv_plain64_gen +}; + +static const struct crypt_iv_operations crypt_iv_plain64be_ops = { + .generator = crypt_iv_plain64be_gen +}; + +static const struct crypt_iv_operations crypt_iv_essiv_ops = { + .generator = crypt_iv_essiv_gen +}; + +static const struct crypt_iv_operations crypt_iv_benbi_ops = { + .ctr = crypt_iv_benbi_ctr, + .dtr = crypt_iv_benbi_dtr, + .generator = crypt_iv_benbi_gen +}; + +static const struct crypt_iv_operations crypt_iv_null_ops = { + .generator = crypt_iv_null_gen +}; + +static const struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + +static const struct crypt_iv_operations crypt_iv_tcw_ops = { + .ctr = crypt_iv_tcw_ctr, + .dtr = crypt_iv_tcw_dtr, + .init = crypt_iv_tcw_init, + .wipe = crypt_iv_tcw_wipe, + .generator = crypt_iv_tcw_gen, + .post = crypt_iv_tcw_post +}; + +static const struct crypt_iv_operations crypt_iv_random_ops = { + .generator = crypt_iv_random_gen +}; + +static const struct crypt_iv_operations crypt_iv_eboiv_ops = { + .ctr = crypt_iv_eboiv_ctr, + .generator = crypt_iv_eboiv_gen +}; + +static const struct crypt_iv_operations crypt_iv_elephant_ops = { + .ctr = crypt_iv_elephant_ctr, + .dtr = crypt_iv_elephant_dtr, + .init = crypt_iv_elephant_init, + .wipe = crypt_iv_elephant_wipe, + .generator = crypt_iv_elephant_gen, + .post = crypt_iv_elephant_post +}; + +/* + * Integrity extensions + */ +static bool crypt_integrity_aead(struct crypt_config *cc) +{ + return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); +} + +static bool crypt_integrity_hmac(struct crypt_config *cc) +{ + return crypt_integrity_aead(cc) && cc->key_mac_size; +} + +/* Get sg containing data */ +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg) +{ + if (unlikely(crypt_integrity_aead(cc))) + return &sg[2]; + + return sg; +} + +static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) +{ + struct bio_integrity_payload *bip; + unsigned int tag_len; + int ret; + + if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) + return 0; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); + + bip->bip_iter.bi_size = tag_len; + bip->bip_iter.bi_sector = io->cc->start + io->sector; + + ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), + tag_len, offset_in_page(io->integrity_metadata)); + if (unlikely(ret != tag_len)) + return -ENOMEM; + + return 0; +} + +static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + struct mapped_device *md = dm_table_get_md(ti->table); + + /* From now we require underlying device with our integrity profile */ + if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + ti->error = "Integrity profile not supported."; + return -EINVAL; + } + + if (bi->tag_size != cc->on_disk_tag_size || + bi->tuple_size != cc->on_disk_tag_size) { + ti->error = "Integrity profile tag size mismatch."; + return -EINVAL; + } + if (1 << bi->interval_exp != cc->sector_size) { + ti->error = "Integrity profile sector size mismatch."; + return -EINVAL; + } + + if (crypt_integrity_aead(cc)) { + cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; + DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), + cc->integrity_tag_size, cc->integrity_iv_size); + + if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { + ti->error = "Integrity AEAD auth tag size is not supported."; + return -EINVAL; + } + } else if (cc->integrity_iv_size) + DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), + cc->integrity_iv_size); + + if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { + ti->error = "Not enough space for integrity tag in the profile."; + return -EINVAL; + } + + return 0; +#else + ti->error = "Integrity profile not supported."; + return -EINVAL; +#endif +} + +static void crypt_convert_init(struct crypt_config *cc, + struct convert_context *ctx, + struct bio *bio_out, struct bio *bio_in, + sector_t sector) +{ + ctx->bio_in = bio_in; + ctx->bio_out = bio_out; + if (bio_in) + ctx->iter_in = bio_in->bi_iter; + if (bio_out) + ctx->iter_out = bio_out->bi_iter; + ctx->cc_sector = sector + cc->iv_offset; + init_completion(&ctx->restart); +} + +static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, + void *req) +{ + return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); +} + +static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) +{ + return (void *)((char *)dmreq - cc->dmreq_start); +} + +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + if (crypt_integrity_aead(cc)) + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_aead_alignmask(any_tfm_aead(cc)) + 1); + else + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_skcipher_alignmask(any_tfm(cc)) + 1); +} + +static u8 *org_iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return iv_of_dmreq(cc, dmreq) + cc->iv_size; +} + +static __le64 *org_sector_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; + return (__le64 *) ptr; +} + +static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + + cc->iv_size + sizeof(uint64_t); + return (unsigned int*)ptr; +} + +static void *tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + struct convert_context *ctx = dmreq->ctx; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + + return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * + cc->on_disk_tag_size]; +} + +static void *iv_tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size; +} + +static int crypt_convert_block_aead(struct crypt_config *cc, + struct convert_context *ctx, + struct aead_request *req, + unsigned int tag_offset) +{ + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); + struct dm_crypt_request *dmreq; + u8 *iv, *org_iv, *tag_iv, *tag; + __le64 *sector; + int r = 0; + + BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); + + /* Reject unexpected unaligned bio. */ + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) + return -EIO; + + dmreq = dmreq_of_req(cc, req); + dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + dmreq->iv_sector >>= cc->sector_shift; + dmreq->ctx = ctx; + + *org_tag_of_dmreq(cc, dmreq) = tag_offset; + + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag = tag_from_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + /* AEAD request: + * |----- AAD -------|------ DATA -------|-- AUTH TAG --| + * | (authenticated) | (auth+encryption) | | + * | sector_LE | IV | sector in/out | tag in/out | + */ + sg_init_table(dmreq->sg_in, 4); + sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size); + sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset); + sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size); + + sg_init_table(dmreq->sg_out, 4); + sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size); + sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset); + sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size); + + if (cc->iv_gen_ops) { + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, cc->iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); + } + + aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size); + if (bio_data_dir(ctx->bio_in) == WRITE) { + aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + cc->sector_size, iv); + r = crypto_aead_encrypt(req); + if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) + memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, + cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); + } else { + aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + cc->sector_size + cc->integrity_tag_size, iv); + r = crypto_aead_decrypt(req); + } + + if (r == -EBADMSG) { + sector_t s = le64_to_cpu(*sector); + + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + } + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); + + return r; +} + +static int crypt_convert_block_skcipher(struct crypt_config *cc, + struct convert_context *ctx, + struct skcipher_request *req, + unsigned int tag_offset) +{ + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); + struct scatterlist *sg_in, *sg_out; + struct dm_crypt_request *dmreq; + u8 *iv, *org_iv, *tag_iv; + __le64 *sector; + int r = 0; + + /* Reject unexpected unaligned bio. */ + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) + return -EIO; + + dmreq = dmreq_of_req(cc, req); + dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + dmreq->iv_sector >>= cc->sector_shift; + dmreq->ctx = ctx; + + *org_tag_of_dmreq(cc, dmreq) = tag_offset; + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + /* For skcipher we use only the first sg item */ + sg_in = &dmreq->sg_in[0]; + sg_out = &dmreq->sg_out[0]; + + sg_init_table(sg_in, 1); + sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset); + + sg_init_table(sg_out, 1); + sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset); + + if (cc->iv_gen_ops) { + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->integrity_iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Data can be already preprocessed in generator */ + if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags)) + sg_in = sg_out; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, cc->integrity_iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); + } + + skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv); + + if (bio_data_dir(ctx->bio_in) == WRITE) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); + + return r; +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error); + +static int crypt_alloc_req_skcipher(struct crypt_config *cc, + struct convert_context *ctx) +{ + unsigned int key_index = ctx->cc_sector & (cc->tfms_count - 1); + + if (!ctx->r.req) { + ctx->r.req = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO); + if (!ctx->r.req) + return -ENOMEM; + } + + skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); + + /* + * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs + * requests if driver request queue is full. + */ + skcipher_request_set_callback(ctx->r.req, + CRYPTO_TFM_REQ_MAY_BACKLOG, + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); + + return 0; +} + +static int crypt_alloc_req_aead(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (!ctx->r.req_aead) { + ctx->r.req_aead = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO); + if (!ctx->r.req_aead) + return -ENOMEM; + } + + aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); + + /* + * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs + * requests if driver request queue is full. + */ + aead_request_set_callback(ctx->r.req_aead, + CRYPTO_TFM_REQ_MAY_BACKLOG, + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); + + return 0; +} + +static int crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (crypt_integrity_aead(cc)) + return crypt_alloc_req_aead(cc, ctx); + else + return crypt_alloc_req_skcipher(cc, ctx); +} + +static void crypt_free_req_skcipher(struct crypt_config *cc, + struct skcipher_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct skcipher_request *)(io + 1) != req) + mempool_free(req, &cc->req_pool); +} + +static void crypt_free_req_aead(struct crypt_config *cc, + struct aead_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct aead_request *)(io + 1) != req) + mempool_free(req, &cc->req_pool); +} + +static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) +{ + if (crypt_integrity_aead(cc)) + crypt_free_req_aead(cc, req, base_bio); + else + crypt_free_req_skcipher(cc, req, base_bio); +} + +/* + * Encrypt / decrypt data from one bio to another one (can be the same one) + */ +static blk_status_t crypt_convert(struct crypt_config *cc, + struct convert_context *ctx, bool atomic, bool reset_pending) +{ + unsigned int tag_offset = 0; + unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; + int r; + + /* + * if reset_pending is set we are dealing with the bio for the first time, + * else we're continuing to work on the previous bio, so don't mess with + * the cc_pending counter + */ + if (reset_pending) + atomic_set(&ctx->cc_pending, 1); + + while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { + + r = crypt_alloc_req(cc, ctx); + if (r) { + complete(&ctx->restart); + return BLK_STS_DEV_RESOURCE; + } + + atomic_inc(&ctx->cc_pending); + + if (crypt_integrity_aead(cc)) + r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); + else + r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); + + switch (r) { + /* + * The request was queued by a crypto driver + * but the driver request queue is full, let's wait. + */ + case -EBUSY: + if (in_interrupt()) { + if (try_wait_for_completion(&ctx->restart)) { + /* + * we don't have to block to wait for completion, + * so proceed + */ + } else { + /* + * we can't wait for completion without blocking + * exit and continue processing in a workqueue + */ + ctx->r.req = NULL; + ctx->cc_sector += sector_step; + tag_offset++; + return BLK_STS_DEV_RESOURCE; + } + } else { + wait_for_completion(&ctx->restart); + } + reinit_completion(&ctx->restart); + fallthrough; + /* + * The request is queued and processed asynchronously, + * completion function kcryptd_async_done() will be called. + */ + case -EINPROGRESS: + ctx->r.req = NULL; + ctx->cc_sector += sector_step; + tag_offset++; + continue; + /* + * The request was already processed (synchronously). + */ + case 0: + atomic_dec(&ctx->cc_pending); + ctx->cc_sector += sector_step; + tag_offset++; + if (!atomic) + cond_resched(); + continue; + /* + * There was a data integrity error. + */ + case -EBADMSG: + atomic_dec(&ctx->cc_pending); + return BLK_STS_PROTECTION; + /* + * There was an error while processing the request. + */ + default: + atomic_dec(&ctx->cc_pending); + return BLK_STS_IOERR; + } + } + + return 0; +} + +static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); + +/* + * Generate a new unfragmented bio with the given size + * This should never violate the device limitations (but only because + * max_segment_size is being constrained to PAGE_SIZE). + * + * This function may be called concurrently. If we allocate from the mempool + * concurrently, there is a possibility of deadlock. For example, if we have + * mempool of 256 pages, two processes, each wanting 256, pages allocate from + * the mempool concurrently, it may deadlock in a situation where both processes + * have allocated 128 pages and the mempool is exhausted. + * + * In order to avoid this scenario we allocate the pages under a mutex. + * + * In order to not degrade performance with excessive locking, we try + * non-blocking allocations without a mutex first but on failure we fallback + * to blocking allocations with a mutex. + */ +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) +{ + struct crypt_config *cc = io->cc; + struct bio *clone; + unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; + unsigned int i, len, remaining_size; + struct page *page; + +retry: + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) + mutex_lock(&cc->bio_alloc_lock); + + clone = bio_alloc_bioset(cc->dev->bdev, nr_iovecs, io->base_bio->bi_opf, + GFP_NOIO, &cc->bs); + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + + remaining_size = size; + + for (i = 0; i < nr_iovecs; i++) { + page = mempool_alloc(&cc->page_pool, gfp_mask); + if (!page) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + gfp_mask |= __GFP_DIRECT_RECLAIM; + goto retry; + } + + len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; + + bio_add_page(clone, page, len, 0); + + remaining_size -= len; + } + + /* Allocate space for integrity tags */ + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + clone = NULL; + } + + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) + mutex_unlock(&cc->bio_alloc_lock); + + return clone; +} + +static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, clone, iter_all) { + BUG_ON(!bv->bv_page); + mempool_free(bv->bv_page, &cc->page_pool); + } +} + +static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, + struct bio *bio, sector_t sector) +{ + io->cc = cc; + io->base_bio = bio; + io->sector = sector; + io->error = 0; + io->ctx.r.req = NULL; + io->integrity_metadata = NULL; + io->integrity_metadata_from_pool = false; + io->in_tasklet = false; + atomic_set(&io->io_pending, 0); +} + +static void crypt_inc_pending(struct dm_crypt_io *io) +{ + atomic_inc(&io->io_pending); +} + +static void kcryptd_io_bio_endio(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + bio_endio(io->base_bio); +} + +/* + * One of the bios was finished. Check for completion of + * the whole request and correctly clean up the buffer. + */ +static void crypt_dec_pending(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + struct bio *base_bio = io->base_bio; + blk_status_t error = io->error; + + if (!atomic_dec_and_test(&io->io_pending)) + return; + + if (io->ctx.r.req) + crypt_free_req(cc, io->ctx.r.req, base_bio); + + if (unlikely(io->integrity_metadata_from_pool)) + mempool_free(io->integrity_metadata, &io->cc->tag_pool); + else + kfree(io->integrity_metadata); + + base_bio->bi_status = error; + + /* + * If we are running this function from our tasklet, + * we can't call bio_endio() here, because it will call + * clone_endio() from dm.c, which in turn will + * free the current struct dm_crypt_io structure with + * our tasklet. In this case we need to delay bio_endio() + * execution to after the tasklet is done and dequeued. + */ + if (io->in_tasklet) { + INIT_WORK(&io->work, kcryptd_io_bio_endio); + queue_work(cc->io_queue, &io->work); + return; + } + + bio_endio(base_bio); +} + +/* + * kcryptd/kcryptd_io: + * + * Needed because it would be very unwise to do decryption in an + * interrupt context. + * + * kcryptd performs the actual encryption or decryption. + * + * kcryptd_io performs the IO submission. + * + * They must be separated as otherwise the final stages could be + * starved by new requests which can block in the first stages due + * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. + */ +static void crypt_endio(struct bio *clone) +{ + struct dm_crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->cc; + unsigned int rw = bio_data_dir(clone); + blk_status_t error; + + /* + * free the processed pages + */ + if (rw == WRITE) + crypt_free_buffer_pages(cc, clone); + + error = clone->bi_status; + bio_put(clone); + + if (rw == READ && !error) { + kcryptd_queue_crypt(io); + return; + } + + if (unlikely(error)) + io->error = error; + + crypt_dec_pending(io); +} + +#define CRYPT_MAP_READ_GFP GFP_NOWAIT + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) +{ + struct crypt_config *cc = io->cc; + struct bio *clone; + + /* + * We need the original biovec array in order to decrypt the whole bio + * data *afterwards* -- thanks to immutable biovecs we don't need to + * worry about the block layer modifying the biovec array; so leverage + * bio_alloc_clone(). + */ + clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs); + if (!clone) + return 1; + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + + crypt_inc_pending(io); + + clone->bi_iter.bi_sector = cc->start + io->sector; + + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_dec_pending(io); + bio_put(clone); + return 1; + } + + dm_submit_bio_remap(io->base_bio, clone); + return 0; +} + +static void kcryptd_io_read_work(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = BLK_STS_RESOURCE; + crypt_dec_pending(io); +} + +static void kcryptd_queue_read(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + + INIT_WORK(&io->work, kcryptd_io_read_work); + queue_work(cc->io_queue, &io->work); +} + +static void kcryptd_io_write(struct dm_crypt_io *io) +{ + struct bio *clone = io->ctx.bio_out; + + dm_submit_bio_remap(io->base_bio, clone); +} + +#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) + +static int dmcrypt_write(void *data) +{ + struct crypt_config *cc = data; + struct dm_crypt_io *io; + + while (1) { + struct rb_root write_tree; + struct blk_plug plug; + + spin_lock_irq(&cc->write_thread_lock); +continue_locked: + + if (!RB_EMPTY_ROOT(&cc->write_tree)) + goto pop_from_list; + + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock_irq(&cc->write_thread_lock); + + if (unlikely(kthread_should_stop())) { + set_current_state(TASK_RUNNING); + break; + } + + schedule(); + + set_current_state(TASK_RUNNING); + spin_lock_irq(&cc->write_thread_lock); + goto continue_locked; + +pop_from_list: + write_tree = cc->write_tree; + cc->write_tree = RB_ROOT; + spin_unlock_irq(&cc->write_thread_lock); + + BUG_ON(rb_parent(write_tree.rb_node)); + + /* + * Note: we cannot walk the tree here with rb_next because + * the structures may be freed when kcryptd_io_write is called. + */ + blk_start_plug(&plug); + do { + io = crypt_io_from_node(rb_first(&write_tree)); + rb_erase(&io->rb_node, &write_tree); + kcryptd_io_write(io); + cond_resched(); + } while (!RB_EMPTY_ROOT(&write_tree)); + blk_finish_plug(&plug); + } + return 0; +} + +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) +{ + struct bio *clone = io->ctx.bio_out; + struct crypt_config *cc = io->cc; + unsigned long flags; + sector_t sector; + struct rb_node **rbp, *parent; + + if (unlikely(io->error)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + crypt_dec_pending(io); + return; + } + + /* crypt_convert should have filled the clone bio */ + BUG_ON(io->ctx.iter_out.bi_size); + + clone->bi_iter.bi_sector = cc->start + io->sector; + + if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { + dm_submit_bio_remap(io->base_bio, clone); + return; + } + + spin_lock_irqsave(&cc->write_thread_lock, flags); + if (RB_EMPTY_ROOT(&cc->write_tree)) + wake_up_process(cc->write_thread); + rbp = &cc->write_tree.rb_node; + parent = NULL; + sector = io->sector; + while (*rbp) { + parent = *rbp; + if (sector < crypt_io_from_node(parent)->sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + rb_link_node(&io->rb_node, parent, rbp); + rb_insert_color(&io->rb_node, &cc->write_tree); + spin_unlock_irqrestore(&cc->write_thread_lock, flags); +} + +static bool kcryptd_crypt_write_inline(struct crypt_config *cc, + struct convert_context *ctx) + +{ + if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags)) + return false; + + /* + * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering + * constraints so they do not need to be issued inline by + * kcryptd_crypt_write_convert(). + */ + switch (bio_op(ctx->bio_in)) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + return false; + } +} + +static void kcryptd_crypt_write_continue(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; + int crypt_finished; + sector_t sector = io->sector; + blk_status_t r; + + wait_for_completion(&ctx->restart); + reinit_completion(&ctx->restart); + + r = crypt_convert(cc, &io->ctx, true, false); + if (r) + io->error = r; + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } + + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); + io->sector = sector; + } + + crypt_dec_pending(io); +} + +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; + struct bio *clone; + int crypt_finished; + sector_t sector = io->sector; + blk_status_t r; + + /* + * Prevent io from disappearing until this function completes. + */ + crypt_inc_pending(io); + crypt_convert_init(cc, ctx, NULL, io->base_bio, sector); + + clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); + if (unlikely(!clone)) { + io->error = BLK_STS_IOERR; + goto dec; + } + + io->ctx.bio_out = clone; + io->ctx.iter_out = clone->bi_iter; + + sector += bio_sectors(clone); + + crypt_inc_pending(io); + r = crypt_convert(cc, ctx, + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true); + /* + * Crypto API backlogged the request, because its queue was full + * and we're in softirq context, so continue from a workqueue + * (TODO: is it actually possible to be in softirq in the write path?) + */ + if (r == BLK_STS_DEV_RESOURCE) { + INIT_WORK(&io->work, kcryptd_crypt_write_continue); + queue_work(cc->crypt_queue, &io->work); + return; + } + if (r) + io->error = r; + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } + + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); + io->sector = sector; + } + +dec: + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_done(struct dm_crypt_io *io) +{ + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_continue(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = io->cc; + blk_status_t r; + + wait_for_completion(&io->ctx.restart); + reinit_completion(&io->ctx.restart); + + r = crypt_convert(cc, &io->ctx, true, false); + if (r) + io->error = r; + + if (atomic_dec_and_test(&io->ctx.cc_pending)) + kcryptd_crypt_read_done(io); + + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + blk_status_t r; + + crypt_inc_pending(io); + + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); + + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + /* + * Crypto API backlogged the request, because its queue was full + * and we're in softirq context, so continue from a workqueue + */ + if (r == BLK_STS_DEV_RESOURCE) { + INIT_WORK(&io->work, kcryptd_crypt_read_continue); + queue_work(cc->crypt_queue, &io->work); + return; + } + if (r) + io->error = r; + + if (atomic_dec_and_test(&io->ctx.cc_pending)) + kcryptd_crypt_read_done(io); + + crypt_dec_pending(io); +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error) +{ + struct dm_crypt_request *dmreq = async_req->data; + struct convert_context *ctx = dmreq->ctx; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + struct crypt_config *cc = io->cc; + + /* + * A request from crypto driver backlog is going to be processed now, + * finish the completion and continue in crypt_convert(). + * (Callback will be called for the second time for this request.) + */ + if (error == -EINPROGRESS) { + complete(&ctx->restart); + return; + } + + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); + + if (error == -EBADMSG) { + sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); + + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + io->error = BLK_STS_PROTECTION; + } else if (error < 0) + io->error = BLK_STS_IOERR; + + crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); + + if (!atomic_dec_and_test(&ctx->cc_pending)) + return; + + /* + * The request is fully completed: for inline writes, let + * kcryptd_crypt_write_convert() do the IO submission. + */ + if (bio_data_dir(io->base_bio) == READ) { + kcryptd_crypt_read_done(io); + return; + } + + if (kcryptd_crypt_write_inline(cc, ctx)) { + complete(&ctx->restart); + return; + } + + kcryptd_crypt_write_io_submit(io, 1); +} + +static void kcryptd_crypt(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (bio_data_dir(io->base_bio) == READ) + kcryptd_crypt_read_convert(io); + else + kcryptd_crypt_write_convert(io); +} + +static void kcryptd_crypt_tasklet(unsigned long work) +{ + kcryptd_crypt((struct work_struct *)work); +} + +static void kcryptd_queue_crypt(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + + if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) || + (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) { + /* + * in_hardirq(): Crypto API's skcipher_walk_first() refuses to work in hard IRQ context. + * irqs_disabled(): the kernel may run some IO completion from the idle thread, but + * it is being executed with irqs disabled. + */ + if (in_hardirq() || irqs_disabled()) { + io->in_tasklet = true; + tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); + tasklet_schedule(&io->tasklet); + return; + } + + kcryptd_crypt(&io->work); + return; + } + + INIT_WORK(&io->work, kcryptd_crypt); + queue_work(cc->crypt_queue, &io->work); +} + +static void crypt_free_tfms_aead(struct crypt_config *cc) +{ + if (!cc->cipher_tfm.tfms_aead) + return; + + if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + crypto_free_aead(cc->cipher_tfm.tfms_aead[0]); + cc->cipher_tfm.tfms_aead[0] = NULL; + } + + kfree(cc->cipher_tfm.tfms_aead); + cc->cipher_tfm.tfms_aead = NULL; +} + +static void crypt_free_tfms_skcipher(struct crypt_config *cc) +{ + unsigned int i; + + if (!cc->cipher_tfm.tfms) + return; + + for (i = 0; i < cc->tfms_count; i++) + if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) { + crypto_free_skcipher(cc->cipher_tfm.tfms[i]); + cc->cipher_tfm.tfms[i] = NULL; + } + + kfree(cc->cipher_tfm.tfms); + cc->cipher_tfm.tfms = NULL; +} + +static void crypt_free_tfms(struct crypt_config *cc) +{ + if (crypt_integrity_aead(cc)) + crypt_free_tfms_aead(cc); + else + crypt_free_tfms_skcipher(cc); +} + +static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) +{ + unsigned int i; + int err; + + cc->cipher_tfm.tfms = kcalloc(cc->tfms_count, + sizeof(struct crypto_skcipher *), + GFP_KERNEL); + if (!cc->cipher_tfm.tfms) + return -ENOMEM; + + for (i = 0; i < cc->tfms_count; i++) { + cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(cc->cipher_tfm.tfms[i])) { + err = PTR_ERR(cc->cipher_tfm.tfms[i]); + crypt_free_tfms(cc); + return err; + } + } + + /* + * dm-crypt performance can vary greatly depending on which crypto + * algorithm implementation is used. Help people debug performance + * problems by logging the ->cra_driver_name. + */ + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, + crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); + return 0; +} + +static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) +{ + int err; + + cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL); + if (!cc->cipher_tfm.tfms) + return -ENOMEM; + + cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]); + crypt_free_tfms(cc); + return err; + } + + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, + crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); + return 0; +} + +static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) +{ + if (crypt_integrity_aead(cc)) + return crypt_alloc_tfms_aead(cc, ciphermode); + else + return crypt_alloc_tfms_skcipher(cc, ciphermode); +} + +static unsigned int crypt_subkey_size(struct crypt_config *cc) +{ + return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); +} + +static unsigned int crypt_authenckey_size(struct crypt_config *cc) +{ + return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param)); +} + +/* + * If AEAD is composed like authenc(hmac(sha256),xts(aes)), + * the key must be for some reason in special format. + * This funcion converts cc->key to this special format. + */ +static void crypt_copy_authenckey(char *p, const void *key, + unsigned int enckeylen, unsigned int authkeylen) +{ + struct crypto_authenc_key_param *param; + struct rtattr *rta; + + rta = (struct rtattr *)p; + param = RTA_DATA(rta); + param->enckeylen = cpu_to_be32(enckeylen); + rta->rta_len = RTA_LENGTH(sizeof(*param)); + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + p += RTA_SPACE(sizeof(*param)); + memcpy(p, key + enckeylen, authkeylen); + p += authkeylen; + memcpy(p, key, enckeylen); +} + +static int crypt_setkey(struct crypt_config *cc) +{ + unsigned int subkey_size; + int err = 0, i, r; + + /* Ignore extra keys (which are used for IV etc) */ + subkey_size = crypt_subkey_size(cc); + + if (crypt_integrity_hmac(cc)) { + if (subkey_size < cc->key_mac_size) + return -EINVAL; + + crypt_copy_authenckey(cc->authenc_key, cc->key, + subkey_size - cc->key_mac_size, + cc->key_mac_size); + } + + for (i = 0; i < cc->tfms_count; i++) { + if (crypt_integrity_hmac(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->authenc_key, crypt_authenckey_size(cc)); + else if (crypt_integrity_aead(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->key + (i * subkey_size), + subkey_size); + else + r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i], + cc->key + (i * subkey_size), + subkey_size); + if (r) + err = r; + } + + if (crypt_integrity_hmac(cc)) + memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc)); + + return err; +} + +#ifdef CONFIG_KEYS + +static bool contains_whitespace(const char *str) +{ + while (*str) + if (isspace(*str++)) + return true; + return false; +} + +static int set_key_user(struct crypt_config *cc, struct key *key) +{ + const struct user_key_payload *ukp; + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (cc->key_size != ukp->datalen) + return -EINVAL; + + memcpy(cc->key, ukp->data, cc->key_size); + + return 0; +} + +static int set_key_encrypted(struct crypt_config *cc, struct key *key) +{ + const struct encrypted_key_payload *ekp; + + ekp = key->payload.data[0]; + if (!ekp) + return -EKEYREVOKED; + + if (cc->key_size != ekp->decrypted_datalen) + return -EINVAL; + + memcpy(cc->key, ekp->decrypted_data, cc->key_size); + + return 0; +} + +static int set_key_trusted(struct crypt_config *cc, struct key *key) +{ + const struct trusted_key_payload *tkp; + + tkp = key->payload.data[0]; + if (!tkp) + return -EKEYREVOKED; + + if (cc->key_size != tkp->key_len) + return -EINVAL; + + memcpy(cc->key, tkp->key, cc->key_size); + + return 0; +} + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + char *new_key_string, *key_desc; + int ret; + struct key_type *type; + struct key *key; + int (*set_key)(struct crypt_config *cc, struct key *key); + + /* + * Reject key_string with whitespace. dm core currently lacks code for + * proper whitespace escaping in arguments on DM_TABLE_STATUS path. + */ + if (contains_whitespace(key_string)) { + DMERR("whitespace chars not allowed in key string"); + return -EINVAL; + } + + /* look for next ':' separating key_type from key_description */ + key_desc = strpbrk(key_string, ":"); + if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) + return -EINVAL; + + if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) { + type = &key_type_logon; + set_key = set_key_user; + } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { + type = &key_type_user; + set_key = set_key_user; + } else if (IS_ENABLED(CONFIG_ENCRYPTED_KEYS) && + !strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + type = &key_type_encrypted; + set_key = set_key_encrypted; + } else if (IS_ENABLED(CONFIG_TRUSTED_KEYS) && + !strncmp(key_string, "trusted:", key_desc - key_string + 1)) { + type = &key_type_trusted; + set_key = set_key_trusted; + } else { + return -EINVAL; + } + + new_key_string = kstrdup(key_string, GFP_KERNEL); + if (!new_key_string) + return -ENOMEM; + + key = request_key(type, key_desc + 1, NULL); + if (IS_ERR(key)) { + kfree_sensitive(new_key_string); + return PTR_ERR(key); + } + + down_read(&key->sem); + + ret = set_key(cc, key); + if (ret < 0) { + up_read(&key->sem); + key_put(key); + kfree_sensitive(new_key_string); + return ret; + } + + up_read(&key->sem); + key_put(key); + + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + ret = crypt_setkey(cc); + + if (!ret) { + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + kfree_sensitive(cc->key_string); + cc->key_string = new_key_string; + } else + kfree_sensitive(new_key_string); + + return ret; +} + +static int get_key_size(char **key_string) +{ + char *colon, dummy; + int ret; + + if (*key_string[0] != ':') + return strlen(*key_string) >> 1; + + /* look for next ':' in key string */ + colon = strpbrk(*key_string + 1, ":"); + if (!colon) + return -EINVAL; + + if (sscanf(*key_string + 1, "%u%c", &ret, &dummy) != 2 || dummy != ':') + return -EINVAL; + + *key_string = colon; + + /* remaining key string should be :: */ + + return ret; +} + +#else + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + return -EINVAL; +} + +static int get_key_size(char **key_string) +{ + return (*key_string[0] == ':') ? -EINVAL : (int)(strlen(*key_string) >> 1); +} + +#endif /* CONFIG_KEYS */ + +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + int r = -EINVAL; + int key_string_len = strlen(key); + + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + goto out; + + /* ':' means the key is in kernel keyring, short-circuit normal key processing */ + if (key[0] == ':') { + r = crypt_set_keyring_key(cc, key + 1); + goto out; + } + + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + /* wipe references to any kernel keyring key */ + kfree_sensitive(cc->key_string); + cc->key_string = NULL; + + /* Decode key from its hex representation. */ + if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0) + goto out; + + r = crypt_setkey(cc); + if (!r) + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + +out: + /* Hex key string not needed after here, so wipe it. */ + memset(key, '0', key_string_len); + + return r; +} + +static int crypt_wipe_key(struct crypt_config *cc) +{ + int r; + + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + get_random_bytes(&cc->key, cc->key_size); + + /* Wipe IV private keys */ + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + r = cc->iv_gen_ops->wipe(cc); + if (r) + return r; + } + + kfree_sensitive(cc->key_string); + cc->key_string = NULL; + r = crypt_setkey(cc); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); + + return r; +} + +static void crypt_calculate_pages_per_client(void) +{ + unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100; + + if (!dm_crypt_clients_n) + return; + + pages /= dm_crypt_clients_n; + if (pages < DM_CRYPT_MIN_PAGES_PER_CLIENT) + pages = DM_CRYPT_MIN_PAGES_PER_CLIENT; + dm_crypt_pages_per_client = pages; +} + +static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct crypt_config *cc = pool_data; + struct page *page; + + /* + * Note, percpu_counter_read_positive() may over (and under) estimate + * the current usage by at most (batch - 1) * num_online_cpus() pages, + * but avoids potential spinlock contention of an exact result. + */ + if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) && + likely(gfp_mask & __GFP_NORETRY)) + return NULL; + + page = alloc_page(gfp_mask); + if (likely(page != NULL)) + percpu_counter_add(&cc->n_allocated_pages, 1); + + return page; +} + +static void crypt_page_free(void *page, void *pool_data) +{ + struct crypt_config *cc = pool_data; + + __free_page(page); + percpu_counter_sub(&cc->n_allocated_pages, 1); +} + +static void crypt_dtr(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + ti->private = NULL; + + if (!cc) + return; + + if (cc->write_thread) + kthread_stop(cc->write_thread); + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + crypt_free_tfms(cc); + + bioset_exit(&cc->bs); + + mempool_exit(&cc->page_pool); + mempool_exit(&cc->req_pool); + mempool_exit(&cc->tag_pool); + + WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0); + percpu_counter_destroy(&cc->n_allocated_pages); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + kfree_sensitive(cc->cipher_string); + kfree_sensitive(cc->key_string); + kfree_sensitive(cc->cipher_auth); + kfree_sensitive(cc->authenc_key); + + mutex_destroy(&cc->bio_alloc_lock); + + /* Must zero key material before freeing */ + kfree_sensitive(cc); + + spin_lock(&dm_crypt_clients_lock); + WARN_ON(!dm_crypt_clients_n); + dm_crypt_clients_n--; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); + + dm_audit_log_dtr(DM_MSG_PREFIX, ti, 1); +} + +static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) +{ + struct crypt_config *cc = ti->private; + + if (crypt_integrity_aead(cc)) + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + + /* Choose ivmode, see comments at iv code. */ + if (ivmode == NULL) + cc->iv_gen_ops = NULL; + else if (strcmp(ivmode, "plain") == 0) + cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; + else if (strcmp(ivmode, "plain64be") == 0) + cc->iv_gen_ops = &crypt_iv_plain64be_ops; + else if (strcmp(ivmode, "essiv") == 0) + cc->iv_gen_ops = &crypt_iv_essiv_ops; + else if (strcmp(ivmode, "benbi") == 0) + cc->iv_gen_ops = &crypt_iv_benbi_ops; + else if (strcmp(ivmode, "null") == 0) + cc->iv_gen_ops = &crypt_iv_null_ops; + else if (strcmp(ivmode, "eboiv") == 0) + cc->iv_gen_ops = &crypt_iv_eboiv_ops; + else if (strcmp(ivmode, "elephant") == 0) { + cc->iv_gen_ops = &crypt_iv_elephant_ops; + cc->key_parts = 2; + cc->key_extra_size = cc->key_size / 2; + if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE) + return -EINVAL; + set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags); + } else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* + * Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + * All keys (including IV seed) are always the same size. + */ + if (cc->key_size % cc->key_parts) { + cc->key_parts++; + cc->key_extra_size = cc->key_size / cc->key_parts; + } + } else if (strcmp(ivmode, "tcw") == 0) { + cc->iv_gen_ops = &crypt_iv_tcw_ops; + cc->key_parts += 2; /* IV + whitening */ + cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; + } else if (strcmp(ivmode, "random") == 0) { + cc->iv_gen_ops = &crypt_iv_random_ops; + /* Need storage space in integrity fields. */ + cc->integrity_iv_size = cc->iv_size; + } else { + ti->error = "Invalid IV mode"; + return -EINVAL; + } + + return 0; +} + +/* + * Workaround to parse HMAC algorithm from AEAD crypto API spec. + * The HMAC is needed to calculate tag size (HMAC digest size). + * This should be probably done by crypto-api calls (once available...) + */ +static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) +{ + char *start, *end, *mac_alg = NULL; + struct crypto_ahash *mac; + + if (!strstarts(cipher_api, "authenc(")) + return 0; + + start = strchr(cipher_api, '('); + end = strchr(cipher_api, ','); + if (!start || !end || ++start > end) + return -EINVAL; + + mac_alg = kzalloc(end - start + 1, GFP_KERNEL); + if (!mac_alg) + return -ENOMEM; + strncpy(mac_alg, start, end - start); + + mac = crypto_alloc_ahash(mac_alg, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + kfree(mac_alg); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + cc->key_mac_size = crypto_ahash_digestsize(mac); + crypto_free_ahash(mac); + + cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); + if (!cc->authenc_key) + return -ENOMEM; + + return 0; +} + +static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME]; + int ret = -EINVAL; + + cc->tfms_count = 1; + + /* + * New format (capi: prefix) + * capi:cipher_api_spec-iv:ivopts + */ + tmp = &cipher_in[strlen("capi:")]; + + /* Separate IV options if present, it can contain another '-' in hash name */ + *ivopts = strrchr(tmp, ':'); + if (*ivopts) { + **ivopts = '\0'; + (*ivopts)++; + } + /* Parse IV mode */ + *ivmode = strrchr(tmp, '-'); + if (*ivmode) { + **ivmode = '\0'; + (*ivmode)++; + } + /* The rest is crypto API spec */ + cipher_api = tmp; + + /* Alloc AEAD, can be used only in new format. */ + if (crypt_integrity_aead(cc)) { + ret = crypt_ctr_auth_cipher(cc, cipher_api); + if (ret < 0) { + ti->error = "Invalid AEAD cipher spec"; + return -ENOMEM; + } + } + + if (*ivmode && !strcmp(*ivmode, "lmk")) + cc->tfms_count = 64; + + if (*ivmode && !strcmp(*ivmode, "essiv")) { + if (!*ivopts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + return -EINVAL; + } + ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)", + cipher_api, *ivopts); + if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { + ti->error = "Cannot allocate cipher string"; + return -ENOMEM; + } + cipher_api = buf; + } + + cc->key_parts = cc->tfms_count; + + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + return ret; + } + + if (crypt_integrity_aead(cc)) + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + return 0; +} + +static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *keycount; + char *cipher_api = NULL; + int ret = -EINVAL; + char dummy; + + if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) { + ti->error = "Bad cipher specification"; + return -EINVAL; + } + + /* + * Legacy dm-crypt cipher specification + * cipher[:keycount]-mode-iv:ivopts + */ + tmp = cipher_in; + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; + + chainmode = strsep(&tmp, "-"); + *ivmode = strsep(&tmp, ":"); + *ivopts = tmp; + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ + if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) { + chainmode = "cbc"; + *ivmode = "plain"; + } + + if (strcmp(chainmode, "ecb") && !*ivmode) { + ti->error = "IV mechanism required"; + return -EINVAL; + } + + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + if (*ivmode && !strcmp(*ivmode, "essiv")) { + if (!*ivopts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + kfree(cipher_api); + return -EINVAL; + } + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "essiv(%s(%s),%s)", chainmode, cipher, *ivopts); + } else { + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + } + if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { + kfree(cipher_api); + goto bad_mem; + } + + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + kfree(cipher_api); + return ret; + } + kfree(cipher_api); + + return 0; +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *ivmode = NULL, *ivopts = NULL; + int ret; + + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) { + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; + } + + if (strstarts(cipher_in, "capi:")) + ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts); + else + ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts); + if (ret) + return ret; + + /* Initialize IV */ + ret = crypt_ctr_ivmode(ti, ivmode); + if (ret < 0) + return ret; + + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { + ti->error = "Error decoding and setting key"; + return ret; + } + + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + return ret; + } + } + + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + return ret; + } + } + + /* wipe the kernel key payload copy */ + if (cc->key_string) + memset(cc->key, 0, cc->key_size * sizeof(u8)); + + return ret; +} + +static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc = ti->private; + struct dm_arg_set as; + static const struct dm_arg _args[] = { + {0, 8, "Invalid number of feature args"}, + }; + unsigned int opt_params, val; + const char *opt_string, *sval; + char dummy; + int ret; + + /* Optional parameters */ + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + return ret; + + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + return -EINVAL; + } + + if (!strcasecmp(opt_string, "allow_discards")) + ti->num_discard_bios = 1; + + else if (!strcasecmp(opt_string, "same_cpu_crypt")) + set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + + else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) + set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (!strcasecmp(opt_string, "no_read_workqueue")) + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + else if (!strcasecmp(opt_string, "no_write_workqueue")) + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { + if (val == 0 || val > MAX_TAG_SIZE) { + ti->error = "Invalid integrity arguments"; + return -EINVAL; + } + cc->on_disk_tag_size = val; + sval = strchr(opt_string + strlen("integrity:"), ':') + 1; + if (!strcasecmp(sval, "aead")) { + set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); + } else if (strcasecmp(sval, "none")) { + ti->error = "Unknown integrity profile"; + return -EINVAL; + } + + cc->cipher_auth = kstrdup(sval, GFP_KERNEL); + if (!cc->cipher_auth) + return -ENOMEM; + } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) { + if (cc->sector_size < (1 << SECTOR_SHIFT) || + cc->sector_size > 4096 || + (cc->sector_size & (cc->sector_size - 1))) { + ti->error = "Invalid feature value for sector_size"; + return -EINVAL; + } + if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) { + ti->error = "Device size is not multiple of sector_size feature"; + return -EINVAL; + } + cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; + } else if (!strcasecmp(opt_string, "iv_large_sectors")) + set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); + else { + ti->error = "Invalid feature arguments"; + return -EINVAL; + } + } + + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int crypt_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct crypt_config *cc = ti->private; + + return dm_report_zones(cc->dev->bdev, cc->start, + cc->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define crypt_report_zones NULL +#endif + +/* + * Construct an encryption mapping: + * [|:::] + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + const char *devname = dm_table_device_name(ti->table); + int key_size; + unsigned int align_mask; + unsigned long long tmpll; + int ret; + size_t iv_size_padding, additional_req_size; + char dummy; + + if (argc < 5) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + key_size = get_key_size(&argv[1]); + if (key_size < 0) { + ti->error = "Cannot parse key size"; + return -EINVAL; + } + + cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + cc->key_size = key_size; + cc->sector_size = (1 << SECTOR_SHIFT); + cc->sector_shift = 0; + + ti->private = cc; + + spin_lock(&dm_crypt_clients_lock); + dm_crypt_clients_n++; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); + + ret = percpu_counter_init(&cc->n_allocated_pages, 0, GFP_KERNEL); + if (ret < 0) + goto bad; + + /* Optional parameters need to be read before cipher constructor */ + if (argc > 5) { + ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); + if (ret) + goto bad; + } + + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + + if (crypt_integrity_aead(cc)) { + cc->dmreq_start = sizeof(struct aead_request); + cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc)); + align_mask = crypto_aead_alignmask(any_tfm_aead(cc)); + } else { + cc->dmreq_start = sizeof(struct skcipher_request); + cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); + align_mask = crypto_skcipher_alignmask(any_tfm(cc)); + } + cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); + + if (align_mask < CRYPTO_MINALIGN) { + /* Allocate the padding exactly */ + iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) + & align_mask; + } else { + /* + * If the cipher requires greater alignment than kmalloc + * alignment, we don't know the exact position of the + * initialization vector. We must assume worst case. + */ + iv_size_padding = align_mask; + } + + /* ...| IV + padding | original IV | original sec. number | bio tag offset | */ + additional_req_size = sizeof(struct dm_crypt_request) + + iv_size_padding + cc->iv_size + + cc->iv_size + + sizeof(uint64_t) + + sizeof(unsigned int); + + ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size); + if (ret) { + ti->error = "Cannot allocate crypt request mempool"; + goto bad; + } + + cc->per_bio_data_size = ti->per_io_data_size = + ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, + ARCH_KMALLOC_MINALIGN); + + ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc); + if (ret) { + ti->error = "Cannot allocate page mempool"; + goto bad; + } + + ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS); + if (ret) { + ti->error = "Cannot allocate crypt bioset"; + goto bad; + } + + mutex_init(&cc->bio_alloc_lock); + + ret = -EINVAL; + if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) || + (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) { + ti->error = "Invalid iv_offset sector"; + goto bad; + } + cc->iv_offset = tmpll; + + ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + ret = -EINVAL; + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { + ti->error = "Invalid device sector"; + goto bad; + } + cc->start = tmpll; + + if (bdev_is_zoned(cc->dev->bdev)) { + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + + /* + * All zone append writes to a zone of a zoned block device will + * have the same BIO sector, the start of the zone. When the + * cypher IV mode uses sector values, all data targeting a + * zone will be encrypted using the first sector numbers of the + * zone. This will not result in write errors but will + * cause most reads to fail as reads will use the sector values + * for the actual data locations, resulting in IV mismatch. + * To avoid this problem, ask DM core to emulate zone append + * operations with regular writes. + */ + DMDEBUG("Zone append operations will be emulated"); + ti->emulate_zone_append = true; + } + + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { + ret = crypt_integrity_ctr(cc, ti); + if (ret) + goto bad; + + cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; + if (!cc->tag_pool_max_sectors) + cc->tag_pool_max_sectors = 1; + + ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS, + cc->tag_pool_max_sectors * cc->on_disk_tag_size); + if (ret) { + ti->error = "Cannot allocate integrity tags mempool"; + goto bad; + } + + cc->tag_pool_max_sectors <<= cc->sector_shift; + } + + ret = -ENOMEM; + cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); + if (!cc->io_queue) { + ti->error = "Couldn't create kcryptd io queue"; + goto bad; + } + + if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + 1, devname); + else + cc->crypt_queue = alloc_workqueue("kcryptd/%s", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + num_online_cpus(), devname); + if (!cc->crypt_queue) { + ti->error = "Couldn't create kcryptd queue"; + goto bad; + } + + spin_lock_init(&cc->write_thread_lock); + cc->write_tree = RB_ROOT; + + cc->write_thread = kthread_run(dmcrypt_write, cc, "dmcrypt_write/%s", devname); + if (IS_ERR(cc->write_thread)) { + ret = PTR_ERR(cc->write_thread); + cc->write_thread = NULL; + ti->error = "Couldn't spawn write thread"; + goto bad; + } + + ti->num_flush_bios = 1; + ti->limit_swap_bios = true; + ti->accounts_remapped_io = true; + + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); + return 0; + +bad: + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 0); + crypt_dtr(ti); + return ret; +} + +static int crypt_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_crypt_io *io; + struct crypt_config *cc = ti->private; + + /* + * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. + * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_OP_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_opf & REQ_PREFLUSH || + bio_op(bio) == REQ_OP_DISCARD)) { + bio_set_dev(bio, cc->dev->bdev); + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = cc->start + + dm_target_offset(ti, bio->bi_iter.bi_sector); + return DM_MAPIO_REMAPPED; + } + + /* + * Check if bio is too large, split as needed. + */ + if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && + (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) + dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); + + /* + * Ensure that bio is a multiple of internal sector encryption size + * and is aligned to this size as defined in IO hints. + */ + if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) + return DM_MAPIO_KILL; + + if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) + return DM_MAPIO_KILL; + + io = dm_per_bio_data(bio, cc->per_bio_data_size); + crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + + if (cc->on_disk_tag_size) { + unsigned int tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); + + if (unlikely(tag_len > KMALLOC_MAX_SIZE) || + unlikely(!(io->integrity_metadata = kmalloc(tag_len, + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { + if (bio_sectors(bio) > cc->tag_pool_max_sectors) + dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); + io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO); + io->integrity_metadata_from_pool = true; + } + } + + if (crypt_integrity_aead(cc)) + io->ctx.r.req_aead = (struct aead_request *)(io + 1); + else + io->ctx.r.req = (struct skcipher_request *)(io + 1); + + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, CRYPT_MAP_READ_GFP)) + kcryptd_queue_read(io); + } else + kcryptd_queue_crypt(io); + + return DM_MAPIO_SUBMITTED; +} + +static char hex2asc(unsigned char c) +{ + return c + '0' + ((unsigned int)(9 - c) >> 4 & 0x27); +} + +static void crypt_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct crypt_config *cc = ti->private; + unsigned int i, sz = 0; + int num_feature_args = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s ", cc->cipher_string); + + if (cc->key_size > 0) { + if (cc->key_string) + DMEMIT(":%u:%s", cc->key_size, cc->key_string); + else { + for (i = 0; i < cc->key_size; i++) { + DMEMIT("%c%c", hex2asc(cc->key[i] >> 4), + hex2asc(cc->key[i] & 0xf)); + } + } + } else + DMEMIT("-"); + + DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, + cc->dev->name, (unsigned long long)cc->start); + + num_feature_args += !!ti->num_discard_bios; + num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); + num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); + if (cc->on_disk_tag_size) + num_feature_args++; + if (num_feature_args) { + DMEMIT(" %d", num_feature_args); + if (ti->num_discard_bios) + DMEMIT(" allow_discards"); + if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) + DMEMIT(" same_cpu_crypt"); + if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) + DMEMIT(" submit_from_crypt_cpus"); + if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) + DMEMIT(" no_read_workqueue"); + if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) + DMEMIT(" no_write_workqueue"); + if (cc->on_disk_tag_size) + DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); + if (cc->sector_size != (1 << SECTOR_SHIFT)) + DMEMIT(" sector_size:%d", cc->sector_size); + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + DMEMIT(" iv_large_sectors"); + } + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",allow_discards=%c", ti->num_discard_bios ? 'y' : 'n'); + DMEMIT(",same_cpu_crypt=%c", test_bit(DM_CRYPT_SAME_CPU, &cc->flags) ? 'y' : 'n'); + DMEMIT(",submit_from_crypt_cpus=%c", test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags) ? + 'y' : 'n'); + DMEMIT(",no_read_workqueue=%c", test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags) ? + 'y' : 'n'); + DMEMIT(",no_write_workqueue=%c", test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags) ? + 'y' : 'n'); + DMEMIT(",iv_large_sectors=%c", test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags) ? + 'y' : 'n'); + + if (cc->on_disk_tag_size) + DMEMIT(",integrity_tag_size=%u,cipher_auth=%s", + cc->on_disk_tag_size, cc->cipher_auth); + if (cc->sector_size != (1 << SECTOR_SHIFT)) + DMEMIT(",sector_size=%d", cc->sector_size); + if (cc->cipher_string) + DMEMIT(",cipher_string=%s", cc->cipher_string); + + DMEMIT(",key_size=%u", cc->key_size); + DMEMIT(",key_parts=%u", cc->key_parts); + DMEMIT(",key_extra_size=%u", cc->key_extra_size); + DMEMIT(",key_mac_size=%u", cc->key_mac_size); + DMEMIT(";"); + break; + } +} + +static void crypt_postsuspend(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + set_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +static int crypt_preresume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { + DMERR("aborting resume - crypt key is not set."); + return -EAGAIN; + } + + return 0; +} + +static void crypt_resume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +/* Message interface + * key set + * key wipe + */ +static int crypt_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct crypt_config *cc = ti->private; + int key_size, ret = -EINVAL; + + if (argc < 2) + goto error; + + if (!strcasecmp(argv[0], "key")) { + if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { + DMWARN("not suspended during key manipulation."); + return -EINVAL; + } + if (argc == 3 && !strcasecmp(argv[1], "set")) { + /* The key size may not be changed. */ + key_size = get_key_size(&argv[2]); + if (key_size < 0 || cc->key_size != key_size) { + memset(argv[2], '0', strlen(argv[2])); + return -EINVAL; + } + + ret = crypt_set_key(cc, argv[2]); + if (ret) + return ret; + if (cc->iv_gen_ops && cc->iv_gen_ops->init) + ret = cc->iv_gen_ops->init(cc); + /* wipe the kernel key payload copy */ + if (cc->key_string) + memset(cc->key, 0, cc->key_size * sizeof(u8)); + return ret; + } + if (argc == 2 && !strcasecmp(argv[1], "wipe")) + return crypt_wipe_key(cc); + } + +error: + DMWARN("unrecognised message received."); + return -EINVAL; +} + +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, ti->len, data); +} + +static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct crypt_config *cc = ti->private; + + /* + * Unfortunate constraint that is required to avoid the potential + * for exceeding underlying device's max_segments limits -- due to + * crypt_alloc_buffer() possibly allocating pages for the encryption + * bio that are not as physically contiguous as the original bio. + */ + limits->max_segment_size = PAGE_SIZE; + + limits->logical_block_size = + max_t(unsigned int, limits->logical_block_size, cc->sector_size); + limits->physical_block_size = + max_t(unsigned int, limits->physical_block_size, cc->sector_size); + limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); + limits->dma_alignment = limits->logical_block_size - 1; +} + +static struct target_type crypt_target = { + .name = "crypt", + .version = {1, 24, 0}, + .module = THIS_MODULE, + .ctr = crypt_ctr, + .dtr = crypt_dtr, + .features = DM_TARGET_ZONED_HM, + .report_zones = crypt_report_zones, + .map = crypt_map, + .status = crypt_status, + .postsuspend = crypt_postsuspend, + .preresume = crypt_preresume, + .resume = crypt_resume, + .message = crypt_message, + .iterate_devices = crypt_iterate_devices, + .io_hints = crypt_io_hints, +}; + +static int __init dm_crypt_init(void) +{ + int r; + + r = dm_register_target(&crypt_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_crypt_exit(void) +{ + dm_unregister_target(&crypt_target); +} + +module_init(dm_crypt_init); +module_exit(dm_crypt_exit); + +MODULE_AUTHOR("Jana Saout "); +MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c new file mode 100644 index 000000000..43541c8e2 --- /dev/null +++ b/drivers/md/dm-delay.c @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2005-2007 Red Hat GmbH + * + * A target that delays reads and/or writes and can send + * them to different devices. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include + +#include + +#define DM_MSG_PREFIX "delay" + +struct delay_class { + struct dm_dev *dev; + sector_t start; + unsigned int delay; + unsigned int ops; +}; + +struct delay_c { + struct timer_list delay_timer; + struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; + struct work_struct flush_expired_bios; + struct list_head delayed_bios; + bool may_delay; + + struct delay_class read; + struct delay_class write; + struct delay_class flush; + + int argc; +}; + +struct dm_delay_info { + struct delay_c *context; + struct delay_class *class; + struct list_head list; + unsigned long expires; +}; + +static DEFINE_MUTEX(delayed_bios_lock); + +static void handle_delayed_timer(struct timer_list *t) +{ + struct delay_c *dc = from_timer(dc, t, delay_timer); + + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); +} + +static void queue_timeout(struct delay_c *dc, unsigned long expires) +{ + mutex_lock(&dc->timer_lock); + + if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires) + mod_timer(&dc->delay_timer, expires); + + mutex_unlock(&dc->timer_lock); +} + +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + dm_submit_bio_remap(bio, NULL); + bio = n; + } +} + +static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) +{ + struct dm_delay_info *delayed, *next; + unsigned long next_expires = 0; + unsigned long start_timer = 0; + struct bio_list flush_bios = { }; + + mutex_lock(&delayed_bios_lock); + list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + if (flush_all || time_after_eq(jiffies, delayed->expires)) { + struct bio *bio = dm_bio_from_per_bio_data(delayed, + sizeof(struct dm_delay_info)); + list_del(&delayed->list); + bio_list_add(&flush_bios, bio); + delayed->class->ops--; + continue; + } + + if (!start_timer) { + start_timer = 1; + next_expires = delayed->expires; + } else + next_expires = min(next_expires, delayed->expires); + } + mutex_unlock(&delayed_bios_lock); + + if (start_timer) + queue_timeout(dc, next_expires); + + return bio_list_get(&flush_bios); +} + +static void flush_expired_bios(struct work_struct *work) +{ + struct delay_c *dc; + + dc = container_of(work, struct delay_c, flush_expired_bios); + flush_bios(flush_delayed_bios(dc, 0)); +} + +static void delay_dtr(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + if (dc->kdelayd_wq) + destroy_workqueue(dc->kdelayd_wq); + + if (dc->read.dev) + dm_put_device(ti, dc->read.dev); + if (dc->write.dev) + dm_put_device(ti, dc->write.dev); + if (dc->flush.dev) + dm_put_device(ti, dc->flush.dev); + + mutex_destroy(&dc->timer_lock); + + kfree(dc); +} + +static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv) +{ + int ret; + unsigned long long tmpll; + char dummy; + + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { + ti->error = "Invalid device sector"; + return -EINVAL; + } + c->start = tmpll; + + if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) { + ti->error = "Invalid delay"; + return -EINVAL; + } + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev); + if (ret) { + ti->error = "Device lookup failed"; + return ret; + } + + return 0; +} + +/* + * Mapping parameters: + * [ ] + * + * With separate write parameters, the first set is only used for reads. + * Offsets are specified in sectors. + * Delays are specified in milliseconds. + */ +static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct delay_c *dc; + int ret; + + if (argc != 3 && argc != 6 && argc != 9) { + ti->error = "Requires exactly 3, 6 or 9 arguments"; + return -EINVAL; + } + + dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + ti->private = dc; + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); + INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); + dc->may_delay = true; + dc->argc = argc; + + ret = delay_class_ctr(ti, &dc->read, argv); + if (ret) + goto bad; + + if (argc == 3) { + ret = delay_class_ctr(ti, &dc->write, argv); + if (ret) + goto bad; + ret = delay_class_ctr(ti, &dc->flush, argv); + if (ret) + goto bad; + goto out; + } + + ret = delay_class_ctr(ti, &dc->write, argv + 3); + if (ret) + goto bad; + if (argc == 6) { + ret = delay_class_ctr(ti, &dc->flush, argv + 3); + if (ret) + goto bad; + goto out; + } + + ret = delay_class_ctr(ti, &dc->flush, argv + 6); + if (ret) + goto bad; + +out: + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + ret = -EINVAL; + DMERR("Couldn't start kdelayd"); + goto bad; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->accounts_remapped_io = true; + ti->per_io_data_size = sizeof(struct dm_delay_info); + return 0; + +bad: + delay_dtr(ti); + return ret; +} + +static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) +{ + struct dm_delay_info *delayed; + unsigned long expires = 0; + + if (!c->delay) + return DM_MAPIO_REMAPPED; + + delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); + + delayed->context = dc; + delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); + + mutex_lock(&delayed_bios_lock); + if (unlikely(!dc->may_delay)) { + mutex_unlock(&delayed_bios_lock); + return DM_MAPIO_REMAPPED; + } + c->ops++; + list_add_tail(&delayed->list, &dc->delayed_bios); + mutex_unlock(&delayed_bios_lock); + + queue_timeout(dc, expires); + + return DM_MAPIO_SUBMITTED; +} + +static void delay_presuspend(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + mutex_lock(&delayed_bios_lock); + dc->may_delay = false; + mutex_unlock(&delayed_bios_lock); + + del_timer_sync(&dc->delay_timer); + flush_bios(flush_delayed_bios(dc, 1)); +} + +static void delay_resume(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + dc->may_delay = true; +} + +static int delay_map(struct dm_target *ti, struct bio *bio) +{ + struct delay_c *dc = ti->private; + struct delay_class *c; + struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); + + if (bio_data_dir(bio) == WRITE) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + c = &dc->flush; + else + c = &dc->write; + } else { + c = &dc->read; + } + delayed->class = c; + bio_set_dev(bio, c->dev->bdev); + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + return delay_bio(dc, c, bio); +} + +#define DMEMIT_DELAY_CLASS(c) \ + DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) + +static void delay_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct delay_c *dc = ti->private; + int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops); + break; + + case STATUSTYPE_TABLE: + DMEMIT_DELAY_CLASS(&dc->read); + if (dc->argc >= 6) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->write); + } + if (dc->argc >= 9) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->flush); + } + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data); + if (ret) + goto out; + ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data); + if (ret) + goto out; + ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data); + if (ret) + goto out; + +out: + return ret; +} + +static struct target_type delay_target = { + .name = "delay", + .version = {1, 3, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = delay_ctr, + .dtr = delay_dtr, + .map = delay_map, + .presuspend = delay_presuspend, + .resume = delay_resume, + .status = delay_status, + .iterate_devices = delay_iterate_devices, +}; + +static int __init dm_delay_init(void) +{ + int r; + + r = dm_register_target(&delay_target); + if (r < 0) { + DMERR("register failed %d", r); + goto bad_register; + } + + return 0; + +bad_register: + return r; +} + +static void __exit dm_delay_exit(void) +{ + dm_unregister_target(&delay_target); +} + +/* Module hooks */ +module_init(dm_delay_init); +module_exit(dm_delay_exit); + +MODULE_DESCRIPTION(DM_NAME " delay target"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c new file mode 100644 index 000000000..03672204b --- /dev/null +++ b/drivers/md/dm-dust.c @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + * + * This is a test "dust" device, which fails reads on specified + * sectors, emulating the behavior of a hard disk drive sending + * a "Read Medium Error" sense. + * + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "dust" + +struct badblock { + struct rb_node node; + sector_t bb; + unsigned char wr_fail_cnt; +}; + +struct dust_device { + struct dm_dev *dev; + struct rb_root badblocklist; + unsigned long long badblock_count; + spinlock_t dust_lock; + unsigned int blksz; + int sect_per_block_shift; + unsigned int sect_per_block; + sector_t start; + bool fail_read_on_bb:1; + bool quiet_mode:1; +}; + +static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct badblock *bblk = rb_entry(node, struct badblock, node); + + if (bblk->bb > blk) + node = node->rb_left; + else if (bblk->bb < blk) + node = node->rb_right; + else + return bblk; + } + + return NULL; +} + +static bool dust_rb_insert(struct rb_root *root, struct badblock *new) +{ + struct badblock *bblk; + struct rb_node **link = &root->rb_node, *parent = NULL; + sector_t value = new->bb; + + while (*link) { + parent = *link; + bblk = rb_entry(parent, struct badblock, node); + + if (bblk->bb > value) + link = &(*link)->rb_left; + else if (bblk->bb < value) + link = &(*link)->rb_right; + else + return false; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); + + return true; +} + +static int dust_remove_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block); + + if (bblock == NULL) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu not found in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + return -EINVAL; + } + + rb_erase(&bblock->node, &dd->badblocklist); + dd->badblock_count--; + if (!dd->quiet_mode) + DMINFO("%s: badblock removed at block %llu", __func__, block); + kfree(bblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_add_block(struct dust_device *dd, unsigned long long block, + unsigned char wr_fail_cnt) +{ + struct badblock *bblock; + unsigned long flags; + + bblock = kmalloc(sizeof(*bblock), GFP_KERNEL); + if (bblock == NULL) { + if (!dd->quiet_mode) + DMERR("%s: badblock allocation failed", __func__); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock->bb = block; + bblock->wr_fail_cnt = wr_fail_cnt; + if (!dust_rb_insert(&dd->badblocklist, bblock)) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu already in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + kfree(bblock); + return -EINVAL; + } + + dd->badblock_count++; + if (!dd->quiet_mode) { + DMINFO("%s: badblock added at block %llu with write fail count %u", + __func__, block, wr_fail_cnt); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result, + unsigned int maxlen, unsigned int *sz_ptr) +{ + struct badblock *bblock; + unsigned long flags; + unsigned int sz = *sz_ptr; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block); + if (bblock != NULL) + DMEMIT("%s: block %llu found in badblocklist", __func__, block); + else + DMEMIT("%s: block %llu not found in badblocklist", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 1; +} + +static int __dust_map_read(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) + return DM_MAPIO_KILL; + + return DM_MAPIO_REMAPPED; +} + +static int dust_map_read(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + int r = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; + spin_lock_irqsave(&dd->dust_lock, flags); + r = __dust_map_read(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return r; +} + +static int __dust_map_write(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk && bblk->wr_fail_cnt > 0) { + bblk->wr_fail_cnt--; + return DM_MAPIO_KILL; + } + + if (bblk) { + rb_erase(&bblk->node, &dd->badblocklist); + dd->badblock_count--; + kfree(bblk); + if (!dd->quiet_mode) { + sector_div(thisblock, dd->sect_per_block); + DMINFO("block %llu removed from badblocklist by write", + (unsigned long long)thisblock); + } + } + + return DM_MAPIO_REMAPPED; +} + +static int dust_map_write(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + int r = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; + spin_lock_irqsave(&dd->dust_lock, flags); + r = __dust_map_write(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return r; +} + +static int dust_map(struct dm_target *ti, struct bio *bio) +{ + struct dust_device *dd = ti->private; + int r; + + bio_set_dev(bio, dd->dev->bdev); + bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (bio_data_dir(bio) == READ) + r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + else + r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + + return r; +} + +static bool __dust_clear_badblocks(struct rb_root *tree, + unsigned long long count) +{ + struct rb_node *node = NULL, *nnode = NULL; + + nnode = rb_first(tree); + if (nnode == NULL) { + BUG_ON(count != 0); + return false; + } + + while (nnode) { + node = nnode; + nnode = rb_next(node); + rb_erase(node, tree); + count--; + kfree(node); + } + BUG_ON(count != 0); + BUG_ON(tree->rb_node != NULL); + + return true; +} + +static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) +{ + unsigned long flags; + struct rb_root badblocklist; + unsigned long long badblock_count; + unsigned int sz = *sz_ptr; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + badblock_count = dd->badblock_count; + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_unlock_irqrestore(&dd->dust_lock, flags); + + if (!__dust_clear_badblocks(&badblocklist, badblock_count)) + DMEMIT("%s: no badblocks found", __func__); + else + DMEMIT("%s: badblocks cleared", __func__); + + return 1; +} + +static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) +{ + unsigned long flags; + struct rb_root badblocklist; + struct rb_node *node; + struct badblock *bblk; + unsigned int sz = *sz_ptr; + unsigned long long num = 0; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + for (node = rb_first(&badblocklist); node; node = rb_next(node)) { + bblk = rb_entry(node, struct badblock, node); + DMEMIT("%llu\n", bblk->bb); + num++; + } + + spin_unlock_irqrestore(&dd->dust_lock, flags); + if (!num) + DMEMIT("No blocks in badblocklist"); + + return 1; +} + +/* + * Target parameters: + * + * + * + * device_path: path to the block device + * offset: offset to data area from start of device_path + * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2) + */ +static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dust_device *dd; + unsigned long long tmp; + char dummy; + unsigned int blksz; + unsigned int sect_per_block; + sector_t DUST_MAX_BLKSZ_SECTORS = 2097152; + sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS); + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + if (kstrtouint(argv[2], 10, &blksz) || !blksz) { + ti->error = "Invalid block size parameter"; + return -EINVAL; + } + + if (blksz < 512) { + ti->error = "Block size must be at least 512"; + return -EINVAL; + } + + if (!is_power_of_2(blksz)) { + ti->error = "Block size must be a power of 2"; + return -EINVAL; + } + + if (to_sector(blksz) > max_block_sectors) { + ti->error = "Block size is too large"; + return -EINVAL; + } + + sect_per_block = (blksz >> SECTOR_SHIFT); + + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device offset sector"; + return -EINVAL; + } + + dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL); + if (dd == NULL) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) { + ti->error = "Device lookup failed"; + kfree(dd); + return -EINVAL; + } + + dd->sect_per_block = sect_per_block; + dd->blksz = blksz; + dd->start = tmp; + + dd->sect_per_block_shift = __ffs(sect_per_block); + + /* + * Whether to fail a read on a "bad" block. + * Defaults to false; enabled later by message. + */ + dd->fail_read_on_bb = false; + + /* + * Initialize bad block list rbtree. + */ + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_lock_init(&dd->dust_lock); + + dd->quiet_mode = false; + + BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0); + + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + ti->private = dd; + + return 0; +} + +static void dust_dtr(struct dm_target *ti) +{ + struct dust_device *dd = ti->private; + + __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count); + dm_put_device(ti, dd->dev); + kfree(dd); +} + +static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + sector_t size = bdev_nr_sectors(dd->dev->bdev); + bool invalid_msg = false; + int r = -EINVAL; + unsigned long long tmp, block; + unsigned char wr_fail_cnt; + unsigned int tmp_ui; + unsigned long flags; + unsigned int sz = 0; + char dummy; + + if (argc == 1) { + if (!strcasecmp(argv[0], "addbadblock") || + !strcasecmp(argv[0], "removebadblock") || + !strcasecmp(argv[0], "queryblock")) { + DMERR("%s requires an additional argument", argv[0]); + } else if (!strcasecmp(argv[0], "disable")) { + DMINFO("disabling read failures on bad sectors"); + dd->fail_read_on_bb = false; + r = 0; + } else if (!strcasecmp(argv[0], "enable")) { + DMINFO("enabling read failures on bad sectors"); + dd->fail_read_on_bb = true; + r = 0; + } else if (!strcasecmp(argv[0], "countbadblocks")) { + spin_lock_irqsave(&dd->dust_lock, flags); + DMEMIT("countbadblocks: %llu badblock(s) found", + dd->badblock_count); + spin_unlock_irqrestore(&dd->dust_lock, flags); + r = 1; + } else if (!strcasecmp(argv[0], "clearbadblocks")) { + r = dust_clear_badblocks(dd, result, maxlen, &sz); + } else if (!strcasecmp(argv[0], "quiet")) { + if (!dd->quiet_mode) + dd->quiet_mode = true; + else + dd->quiet_mode = false; + r = 0; + } else if (!strcasecmp(argv[0], "listbadblocks")) { + r = dust_list_badblocks(dd, result, maxlen, &sz); + } else { + invalid_msg = true; + } + } else if (argc == 2) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + block = tmp; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, 0); + else if (!strcasecmp(argv[0], "removebadblock")) + r = dust_remove_block(dd, block); + else if (!strcasecmp(argv[0], "queryblock")) + r = dust_query_block(dd, block, result, maxlen, &sz); + else + invalid_msg = true; + + } else if (argc == 3) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1) + return r; + + block = tmp; + if (tmp_ui > 255) { + DMERR("selected write fail count out of range"); + return r; + } + wr_fail_cnt = tmp_ui; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, wr_fail_cnt); + else + invalid_msg = true; + + } else + DMERR("invalid number of arguments '%d'", argc); + + if (invalid_msg) + DMERR("unrecognized message '%s' received", argv[0]); + + return r; +} + +static void dust_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %s %s", dd->dev->name, + dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass", + dd->quiet_mode ? "quiet" : "verbose"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dd->dev->name, + (unsigned long long)dd->start, dd->blksz); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dust_device *dd = ti->private; + struct dm_dev *dev = dd->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (dd->start || ti->len != bdev_nr_sectors(dev->bdev)) + return 1; + + return 0; +} + +static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, + void *data) +{ + struct dust_device *dd = ti->private; + + return fn(ti, dd->dev, dd->start, ti->len, data); +} + +static struct target_type dust_target = { + .name = "dust", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = dust_ctr, + .dtr = dust_dtr, + .iterate_devices = dust_iterate_devices, + .map = dust_map, + .message = dust_message, + .status = dust_status, + .prepare_ioctl = dust_prepare_ioctl, +}; + +static int __init dm_dust_init(void) +{ + int r = dm_register_target(&dust_target); + + if (r < 0) + DMERR("dm_register_target failed %d", r); + + return r; +} + +static void __exit dm_dust_exit(void) +{ + dm_unregister_target(&dust_target); +} + +module_init(dm_dust_init); +module_exit(dm_dust_exit); + +MODULE_DESCRIPTION(DM_NAME " dust test target"); +MODULE_AUTHOR("Bryan Gurney "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c new file mode 100644 index 000000000..7606c6695 --- /dev/null +++ b/drivers/md/dm-ebs-target.c @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2020 Red Hat GmbH + * + * This file is released under the GPL. + * + * Device-mapper target to emulate smaller logical block + * size on backing devices exposing (natively) larger ones. + * + * E.g. 512 byte sector emulation on 4K native disks. + */ + +#include "dm.h" +#include +#include +#include + +#define DM_MSG_PREFIX "ebs" + +static void ebs_dtr(struct dm_target *ti); + +/* Emulated block size context. */ +struct ebs_c { + struct dm_dev *dev; /* Underlying device to emulate block size on. */ + struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ + struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ + struct work_struct ws; /* Work item used for ^. */ + struct bio_list bios_in; /* Worker bios input list. */ + spinlock_t lock; /* Guard bios input list above. */ + sector_t start; /* table line argument, see ebs_ctr below. */ + unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ + unsigned int u_bs; /* Underlying block size in sectors retrieved from/set on lower layer device. */ + unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ + bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ +}; + +static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) +{ + return sector >> ec->block_shift; +} + +static inline sector_t __block_mod(sector_t sector, unsigned int bs) +{ + return sector & (bs - 1); +} + +/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */ +static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) +{ + sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); + + return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); +} + +static inline bool __ebs_check_bs(unsigned int bs) +{ + return bs && is_power_of_2(bs); +} + +/* + * READ/WRITE: + * + * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. + */ +static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, + struct bvec_iter *iter) +{ + int r = 0; + unsigned char *ba, *pa; + unsigned int cur_len; + unsigned int bv_len = bv->bv_len; + unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); + sector_t block = __sector_to_block(ec, iter->bi_sector); + struct dm_buffer *b; + + if (unlikely(!bv->bv_page || !bv_len)) + return -EIO; + + pa = bvec_virt(bv); + + /* Handle overlapping page <-> blocks */ + while (bv_len) { + cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); + + /* Avoid reading for writes in case bio vector's page overwrites block completely. */ + if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) + ba = dm_bufio_read(ec->bufio, block, &b); + else + ba = dm_bufio_new(ec->bufio, block, &b); + + if (IS_ERR(ba)) { + /* + * Carry on with next buffer, if any, to issue all possible + * data but return error. + */ + r = PTR_ERR(ba); + } else { + /* Copy data to/from bio to buffer if read/new was successful above. */ + ba += buf_off; + if (op == REQ_OP_READ) { + memcpy(pa, ba, cur_len); + flush_dcache_page(bv->bv_page); + } else { + flush_dcache_page(bv->bv_page); + memcpy(ba, pa, cur_len); + dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + } + + dm_bufio_release(b); + } + + pa += cur_len; + bv_len -= cur_len; + buf_off = 0; + block++; + } + + return r; +} + +/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ +static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio) +{ + int r = 0, rr; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_bvec(bv, bio, iter) { + rr = __ebs_rw_bvec(ec, op, &bv, &iter); + if (rr) + r = rr; + } + + return r; +} + +/* + * Discard bio's blocks, i.e. pass discards down. + * + * Avoid discarding partial blocks at beginning and end; + * return 0 in case no blocks can be discarded as a result. + */ +static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t block, blocks, sector = bio->bi_iter.bi_sector; + + block = __sector_to_block(ec, sector); + blocks = __nr_blocks(ec, bio); + + /* + * Partial first underlying block (__nr_blocks() may have + * resulted in one block). + */ + if (__block_mod(sector, ec->u_bs)) { + block++; + blocks--; + } + + /* Partial last underlying block if any. */ + if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) + blocks--; + + return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; +} + +/* Release blocks them from the bufio cache. */ +static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t blocks, sector = bio->bi_iter.bi_sector; + + blocks = __nr_blocks(ec, bio); + + dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); +} + +/* Worker function to process incoming bios. */ +static void __ebs_process_bios(struct work_struct *ws) +{ + int r; + bool write = false; + sector_t block1, block2; + struct ebs_c *ec = container_of(ws, struct ebs_c, ws); + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&ec->lock); + bios = ec->bios_in; + bio_list_init(&ec->bios_in); + spin_unlock_irq(&ec->lock); + + /* Prefetch all read and any mis-aligned write buffers */ + bio_list_for_each(bio, &bios) { + block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); + if (bio_op(bio) == REQ_OP_READ) + dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); + else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { + block2 = __sector_to_block(ec, bio_end_sector(bio)); + if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) + dm_bufio_prefetch(ec->bufio, block1, 1); + if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) + dm_bufio_prefetch(ec->bufio, block2, 1); + } + } + + bio_list_for_each(bio, &bios) { + r = -EIO; + if (bio_op(bio) == REQ_OP_READ) + r = __ebs_rw_bio(ec, REQ_OP_READ, bio); + else if (bio_op(bio) == REQ_OP_WRITE) { + write = true; + r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio); + } else if (bio_op(bio) == REQ_OP_DISCARD) { + __ebs_forget_bio(ec, bio); + r = __ebs_discard_bio(ec, bio); + } + + if (r < 0) + bio->bi_status = errno_to_blk_status(r); + } + + /* + * We write dirty buffers after processing I/O on them + * but before we endio thus addressing REQ_FUA/REQ_SYNC. + */ + r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; + + while ((bio = bio_list_pop(&bios))) { + /* Any other request is endioed. */ + if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) + bio_io_error(bio); + else + bio_endio(bio); + } +} + +/* + * Construct an emulated block size mapping: [] + * + * : path of the underlying device + * : offset in 512 bytes sectors into + * : emulated block size in units of 512 bytes exposed to the upper layer + * []: underlying block size in units of 512 bytes imposed on the lower layer; + * optional, if not supplied, retrieve logical block size from underlying device + */ +static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned short tmp1; + unsigned long long tmp; + char dummy; + struct ebs_c *ec; + + if (argc < 3 || argc > 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); + if (!ec) { + ti->error = "Cannot allocate ebs context"; + return -ENOMEM; + } + + r = -EINVAL; + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || + tmp != (sector_t)tmp || + (sector_t)tmp >= ti->len) { + ti->error = "Invalid device offset sector"; + goto bad; + } + ec->start = tmp; + + if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || + !__ebs_check_bs(tmp1) || + to_bytes(tmp1) > PAGE_SIZE) { + ti->error = "Invalid emulated block size"; + goto bad; + } + ec->e_bs = tmp1; + + if (argc > 3) { + if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { + ti->error = "Invalid underlying block size"; + goto bad; + } + ec->u_bs = tmp1; + ec->u_bs_set = true; + } else + ec->u_bs_set = false; + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); + if (r) { + ti->error = "Device lookup failed"; + ec->dev = NULL; + goto bad; + } + + r = -EINVAL; + if (!ec->u_bs_set) { + ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); + if (!__ebs_check_bs(ec->u_bs)) { + ti->error = "Invalid retrieved underlying block size"; + goto bad; + } + } + + if (!ec->u_bs_set && ec->e_bs == ec->u_bs) + DMINFO("Emulation superfluous: emulated equal to underlying block size"); + + if (__block_mod(ec->start, ec->u_bs)) { + ti->error = "Device offset must be multiple of underlying block size"; + goto bad; + } + + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, + 0, NULL, NULL, 0); + if (IS_ERR(ec->bufio)) { + ti->error = "Cannot create dm bufio client"; + r = PTR_ERR(ec->bufio); + ec->bufio = NULL; + goto bad; + } + + ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!ec->wq) { + ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; + r = -ENOMEM; + goto bad; + } + + ec->block_shift = __ffs(ec->u_bs); + INIT_WORK(&ec->ws, &__ebs_process_bios); + bio_list_init(&ec->bios_in); + spin_lock_init(&ec->lock); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 0; + ti->num_write_zeroes_bios = 0; + return 0; +bad: + ebs_dtr(ti); + return r; +} + +static void ebs_dtr(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + + if (ec->wq) + destroy_workqueue(ec->wq); + if (ec->bufio) + dm_bufio_client_destroy(ec->bufio); + if (ec->dev) + dm_put_device(ti, ec->dev); + kfree(ec); +} + +static int ebs_map(struct dm_target *ti, struct bio *bio) +{ + struct ebs_c *ec = ti->private; + + bio_set_dev(bio, ec->dev->bdev); + bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) + return DM_MAPIO_REMAPPED; + /* + * Only queue for bufio processing in case of partial or overlapping buffers + * -or- + * emulation with ebs == ubs aiming for tests of dm-bufio overhead. + */ + if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || + __block_mod(bio_end_sector(bio), ec->u_bs) || + ec->e_bs == ec->u_bs)) { + spin_lock_irq(&ec->lock); + bio_list_add(&ec->bios_in, bio); + spin_unlock_irq(&ec->lock); + + queue_work(ec->wq, &ec->ws); + + return DM_MAPIO_SUBMITTED; + } + + /* Forget any buffer content relative to this direct backing device I/O. */ + __ebs_forget_bio(ec, bio); + + return DM_MAPIO_REMAPPED; +} + +static void ebs_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct ebs_c *ec = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + *result = '\0'; + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", + ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct ebs_c *ec = ti->private; + struct dm_dev *dev = ec->dev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + *bdev = dev->bdev; + return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev)); +} + +static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct ebs_c *ec = ti->private; + + limits->logical_block_size = to_bytes(ec->e_bs); + limits->physical_block_size = to_bytes(ec->u_bs); + limits->alignment_offset = limits->physical_block_size; + blk_limits_io_min(limits, limits->logical_block_size); +} + +static int ebs_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct ebs_c *ec = ti->private; + + return fn(ti, ec->dev, ec->start, ti->len, data); +} + +static struct target_type ebs_target = { + .name = "ebs", + .version = {1, 0, 1}, + .features = DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = ebs_ctr, + .dtr = ebs_dtr, + .map = ebs_map, + .status = ebs_status, + .io_hints = ebs_io_hints, + .prepare_ioctl = ebs_prepare_ioctl, + .iterate_devices = ebs_iterate_devices, +}; + +static int __init dm_ebs_init(void) +{ + int r = dm_register_target(&ebs_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void dm_ebs_exit(void) +{ + dm_unregister_target(&ebs_target); +} + +module_init(dm_ebs_init); +module_exit(dm_ebs_exit); + +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_DESCRIPTION(DM_NAME " emulated block size target"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c new file mode 100644 index 000000000..a96290103 --- /dev/null +++ b/drivers/md/dm-era-target.c @@ -0,0 +1,1756 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "dm.h" +#include "persistent-data/dm-transaction-manager.h" +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" + +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "era" + +#define SUPERBLOCK_LOCATION 0 +#define SUPERBLOCK_MAGIC 2126579579 +#define SUPERBLOCK_CSUM_XOR 146538381 +#define MIN_ERA_VERSION 1 +#define MAX_ERA_VERSION 1 +#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION +#define MIN_BLOCK_SIZE 8 + +/*---------------------------------------------------------------- + * Writeset + *--------------------------------------------------------------*/ +struct writeset_metadata { + uint32_t nr_bits; + dm_block_t root; +}; + +struct writeset { + struct writeset_metadata md; + + /* + * An in core copy of the bits to save constantly doing look ups on + * disk. + */ + unsigned long *bits; +}; + +/* + * This does not free off the on disk bitset as this will normally be done + * after digesting into the era array. + */ +static void writeset_free(struct writeset *ws) +{ + vfree(ws->bits); + ws->bits = NULL; +} + +static int setup_on_disk_bitset(struct dm_disk_bitset *info, + unsigned int nr_bits, dm_block_t *root) +{ + int r; + + r = dm_bitset_empty(info, root); + if (r) + return r; + + return dm_bitset_resize(info, *root, 0, nr_bits, false, root); +} + +static size_t bitset_size(unsigned int nr_bits) +{ + return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG); +} + +/* + * Allocates memory for the in core bitset. + */ +static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) +{ + ws->bits = vzalloc(bitset_size(nr_blocks)); + if (!ws->bits) { + DMERR("%s: couldn't allocate in memory bitset", __func__); + return -ENOMEM; + } + + return 0; +} + +/* + * Wipes the in-core bitset, and creates a new on disk bitset. + */ +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws, + dm_block_t nr_blocks) +{ + int r; + + memset(ws->bits, 0, bitset_size(nr_blocks)); + + ws->md.nr_bits = nr_blocks; + r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); + if (r) { + DMERR("%s: setup_on_disk_bitset failed", __func__); + return r; + } + + return 0; +} + +static bool writeset_marked(struct writeset *ws, dm_block_t block) +{ + return test_bit(block, ws->bits); +} + +static int writeset_marked_on_disk(struct dm_disk_bitset *info, + struct writeset_metadata *m, dm_block_t block, + bool *result) +{ + dm_block_t old = m->root; + + /* + * The bitset was flushed when it was archived, so we know there'll + * be no change to the root. + */ + int r = dm_bitset_test_bit(info, m->root, block, &m->root, result); + if (r) { + DMERR("%s: dm_bitset_test_bit failed", __func__); + return r; + } + + BUG_ON(m->root != old); + + return r; +} + +/* + * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was. + */ +static int writeset_test_and_set(struct dm_disk_bitset *info, + struct writeset *ws, uint32_t block) +{ + int r; + + if (!test_bit(block, ws->bits)) { + r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); + if (r) { + /* FIXME: fail mode */ + return r; + } + + return 0; + } + + return 1; +} + +/*---------------------------------------------------------------- + * On disk metadata layout + *--------------------------------------------------------------*/ +#define SPACE_MAP_ROOT_SIZE 128 +#define UUID_LEN 16 + +struct writeset_disk { + __le32 nr_bits; + __le64 root; +} __packed; + +struct superblock_disk { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[UUID_LEN]; + __le64 magic; + __le32 version; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + __le32 data_block_size; + __le32 metadata_block_size; + __le32 nr_blocks; + + __le32 current_era; + struct writeset_disk current_writeset; + + /* + * Only these two fields are valid within the metadata snapshot. + */ + __le64 writeset_tree_root; + __le64 era_array_root; + + __le64 metadata_snap; +} __packed; + +/*---------------------------------------------------------------- + * Superblock validation + *--------------------------------------------------------------*/ +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *disk = dm_block_data(b); + + disk->blocknr = cpu_to_le64(dm_block_location(b)); + disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int check_metadata_version(struct superblock_disk *disk) +{ + uint32_t metadata_version = le32_to_cpu(disk->version); + if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) { + DMERR("Era metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION); + return -EINVAL; + } + + return 0; +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *disk = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk->magic), + (unsigned long long) SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk->csum)); + return -EILSEQ; + } + + return check_metadata_version(disk); +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*---------------------------------------------------------------- + * Low level metadata handling + *--------------------------------------------------------------*/ +#define DM_ERA_METADATA_BLOCK_SIZE 4096 +#define ERA_MAX_CONCURRENT_LOCKS 5 + +struct era_metadata { + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *sm; + struct dm_transaction_manager *tm; + + dm_block_t block_size; + uint32_t nr_blocks; + + uint32_t current_era; + + /* + * We preallocate 2 writesets. When an era rolls over we + * switch between them. This means the allocation is done at + * preresume time, rather than on the io path. + */ + struct writeset writesets[2]; + struct writeset *current_writeset; + + dm_block_t writeset_tree_root; + dm_block_t era_array_root; + + struct dm_disk_bitset bitset_info; + struct dm_btree_info writeset_tree_info; + struct dm_array_info era_array_info; + + dm_block_t metadata_snap; + + /* + * A flag that is set whenever a writeset has been archived. + */ + bool archived_writesets; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. + */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; +}; + +static int superblock_read_lock(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock_zero(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +/* FIXME: duplication with cache and thin */ +static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) +{ + int r; + unsigned int i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned int sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. + */ + r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = true; + for (i = 0; i < sb_block_size; i++) { + if (data_le[i] != zero) { + *result = false; + break; + } + } + + dm_bm_unlock(b); + + return 0; +} + +/*----------------------------------------------------------------*/ + +static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk) +{ + disk->nr_bits = cpu_to_le32(core->nr_bits); + disk->root = cpu_to_le64(core->root); +} + +static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core) +{ + core->nr_bits = le32_to_cpu(disk->nr_bits); + core->root = le64_to_cpu(disk->root); +} + +static void ws_inc(void *context, const void *value, unsigned int count) +{ + struct era_metadata *md = context; + struct writeset_disk ws_d; + dm_block_t b; + unsigned int i; + + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_tm_inc(md->tm, b); + } +} + +static void ws_dec(void *context, const void *value, unsigned int count) +{ + struct era_metadata *md = context; + struct writeset_disk ws_d; + dm_block_t b; + unsigned int i; + + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_bitset_del(&md->bitset_info, b); + } +} + +static int ws_eq(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(struct writeset_disk)); +} + +/*----------------------------------------------------------------*/ + +static void setup_writeset_tree_info(struct era_metadata *md) +{ + struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type; + md->writeset_tree_info.tm = md->tm; + md->writeset_tree_info.levels = 1; + vt->context = md; + vt->size = sizeof(struct writeset_disk); + vt->inc = ws_inc; + vt->dec = ws_dec; + vt->equal = ws_eq; +} + +static void setup_era_array_info(struct era_metadata *md) + +{ + struct dm_btree_value_type vt; + vt.context = NULL; + vt.size = sizeof(__le32); + vt.inc = NULL; + vt.dec = NULL; + vt.equal = NULL; + + dm_array_info_init(&md->era_array_info, md->tm, &vt); +} + +static void setup_infos(struct era_metadata *md) +{ + dm_disk_bitset_init(md->tm, &md->bitset_info); + setup_writeset_tree_info(md); + setup_era_array_info(md); +} + +/*----------------------------------------------------------------*/ + +static int create_fresh_metadata(struct era_metadata *md) +{ + int r; + + r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION, + &md->tm, &md->sm); + if (r < 0) { + DMERR("dm_tm_create_with_sm failed"); + return r; + } + + setup_infos(md); + + r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root); + if (r) { + DMERR("couldn't create new writeset tree"); + goto bad; + } + + r = dm_array_empty(&md->era_array_info, &md->era_array_root); + if (r) { + DMERR("couldn't create era array"); + goto bad; + } + + return 0; + +bad: + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + + return r; +} + +static int save_sm_root(struct era_metadata *md) +{ + int r; + size_t metadata_len; + + r = dm_sm_root_size(md->sm, &metadata_len); + if (r < 0) + return r; + + return dm_sm_copy_root(md->sm, &md->metadata_space_map_root, + metadata_len); +} + +static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk) +{ + memcpy(&disk->metadata_space_map_root, + &md->metadata_space_map_root, + sizeof(md->metadata_space_map_root)); +} + +/* + * Writes a superblock, including the static fields that don't get updated + * with every commit (possible optimisation here). 'md' should be fully + * constructed when this is called. + */ +static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk) +{ + disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC); + disk->flags = cpu_to_le32(0ul); + + /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */ + memset(disk->uuid, 0, sizeof(disk->uuid)); + disk->version = cpu_to_le32(MAX_ERA_VERSION); + + copy_sm_root(md, disk); + + disk->data_block_size = cpu_to_le32(md->block_size); + disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk->nr_blocks = cpu_to_le32(md->nr_blocks); + disk->current_era = cpu_to_le32(md->current_era); + + ws_pack(&md->current_writeset->md, &disk->current_writeset); + disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root); + disk->era_array_root = cpu_to_le64(md->era_array_root); + disk->metadata_snap = cpu_to_le64(md->metadata_snap); +} + +static int write_superblock(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *disk; + + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + + r = superblock_lock_zero(md, &sblock); + if (r) + return r; + + disk = dm_block_data(sblock); + prepare_superblock(md, disk); + + return dm_tm_commit(md->tm, sblock); +} + +/* + * Assumes block_size and the infos are set. + */ +static int format_metadata(struct era_metadata *md) +{ + int r; + + r = create_fresh_metadata(md); + if (r) + return r; + + r = write_superblock(md); + if (r) { + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + return r; + } + + return 0; +} + +static int open_metadata(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *disk; + + r = superblock_read_lock(md, &sblock); + if (r) { + DMERR("couldn't read_lock superblock"); + return r; + } + + disk = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk->data_block_size) != md->block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk->data_block_size), md->block_size); + r = -EINVAL; + goto bad; + } + + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, + disk->metadata_space_map_root, + sizeof(disk->metadata_space_map_root), + &md->tm, &md->sm); + if (r) { + DMERR("dm_tm_open_with_sm failed"); + goto bad; + } + + setup_infos(md); + + md->nr_blocks = le32_to_cpu(disk->nr_blocks); + md->current_era = le32_to_cpu(disk->current_era); + + ws_unpack(&disk->current_writeset, &md->current_writeset->md); + md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); + md->era_array_root = le64_to_cpu(disk->era_array_root); + md->metadata_snap = le64_to_cpu(disk->metadata_snap); + md->archived_writesets = true; + + dm_bm_unlock(sblock); + + return 0; + +bad: + dm_bm_unlock(sblock); + return r; +} + +static int open_or_format_metadata(struct era_metadata *md, + bool may_format) +{ + int r; + bool unformatted = false; + + r = superblock_all_zeroes(md->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return may_format ? format_metadata(md) : -EPERM; + + return open_metadata(md); +} + +static int create_persistent_data_objects(struct era_metadata *md, + bool may_format) +{ + int r; + + md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, + ERA_MAX_CONCURRENT_LOCKS); + if (IS_ERR(md->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(md->bm); + } + + r = open_or_format_metadata(md, may_format); + if (r) + dm_block_manager_destroy(md->bm); + + return r; +} + +static void destroy_persistent_data_objects(struct era_metadata *md) +{ + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + dm_block_manager_destroy(md->bm); +} + +/* + * This waits until all era_map threads have picked up the new filter. + */ +static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset) +{ + rcu_assign_pointer(md->current_writeset, new_writeset); + synchronize_rcu(); +} + +/*---------------------------------------------------------------- + * Writesets get 'digested' into the main era array. + * + * We're using a coroutine here so the worker thread can do the digestion, + * thus avoiding synchronisation of the metadata. Digesting a whole + * writeset in one go would cause too much latency. + *--------------------------------------------------------------*/ +struct digest { + uint32_t era; + unsigned int nr_bits, current_bit; + struct writeset_metadata writeset; + __le32 value; + struct dm_disk_bitset info; + + int (*step)(struct era_metadata *, struct digest *); +}; + +static int metadata_digest_lookup_writeset(struct era_metadata *md, + struct digest *d); + +static int metadata_digest_remove_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + uint64_t key = d->era; + + r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root, + &key, &md->writeset_tree_root); + if (r) { + DMERR("%s: dm_btree_remove failed", __func__); + return r; + } + + d->step = metadata_digest_lookup_writeset; + return 0; +} + +#define INSERTS_PER_STEP 100 + +static int metadata_digest_transcribe_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + bool marked; + unsigned int b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits); + + for (b = d->current_bit; b < e; b++) { + r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked); + if (r) { + DMERR("%s: writeset_marked_on_disk failed", __func__); + return r; + } + + if (!marked) + continue; + + __dm_bless_for_disk(&d->value); + r = dm_array_set_value(&md->era_array_info, md->era_array_root, + b, &d->value, &md->era_array_root); + if (r) { + DMERR("%s: dm_array_set_value failed", __func__); + return r; + } + } + + if (b == d->nr_bits) + d->step = metadata_digest_remove_writeset; + else + d->current_bit = b; + + return 0; +} + +static int metadata_digest_lookup_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + uint64_t key; + struct writeset_disk disk; + + r = dm_btree_find_lowest_key(&md->writeset_tree_info, + md->writeset_tree_root, &key); + if (r < 0) + return r; + + d->era = key; + + r = dm_btree_lookup(&md->writeset_tree_info, + md->writeset_tree_root, &key, &disk); + if (r) { + if (r == -ENODATA) { + d->step = NULL; + return 0; + } + + DMERR("%s: dm_btree_lookup failed", __func__); + return r; + } + + ws_unpack(&disk, &d->writeset); + d->value = cpu_to_le32(key); + + /* + * We initialise another bitset info to avoid any caching side effects + * with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); + d->current_bit = 0; + d->step = metadata_digest_transcribe_writeset; + + return 0; +} + +static int metadata_digest_start(struct era_metadata *md, struct digest *d) +{ + if (d->step) + return 0; + + memset(d, 0, sizeof(*d)); + d->step = metadata_digest_lookup_writeset; + + return 0; +} + +/*---------------------------------------------------------------- + * High level metadata interface. Target methods should use these, and not + * the lower level ones. + *--------------------------------------------------------------*/ +static struct era_metadata *metadata_open(struct block_device *bdev, + sector_t block_size, + bool may_format) +{ + int r; + struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL); + + if (!md) + return NULL; + + md->bdev = bdev; + md->block_size = block_size; + + md->writesets[0].md.root = INVALID_WRITESET_ROOT; + md->writesets[1].md.root = INVALID_WRITESET_ROOT; + md->current_writeset = &md->writesets[0]; + + r = create_persistent_data_objects(md, may_format); + if (r) { + kfree(md); + return ERR_PTR(r); + } + + return md; +} + +static void metadata_close(struct era_metadata *md) +{ + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); + destroy_persistent_data_objects(md); + kfree(md); +} + +static bool valid_nr_blocks(dm_block_t n) +{ + /* + * dm_bitset restricts us to 2^32. test_bit & co. restrict us + * further to 2^31 - 1 + */ + return n < (1ull << 31); +} + +static int metadata_resize(struct era_metadata *md, void *arg) +{ + int r; + dm_block_t *new_size = arg; + __le32 value; + + if (!valid_nr_blocks(*new_size)) { + DMERR("Invalid number of origin blocks %llu", + (unsigned long long) *new_size); + return -EINVAL; + } + + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); + + r = writeset_alloc(&md->writesets[0], *new_size); + if (r) { + DMERR("%s: writeset_alloc failed for writeset 0", __func__); + return r; + } + + r = writeset_alloc(&md->writesets[1], *new_size); + if (r) { + DMERR("%s: writeset_alloc failed for writeset 1", __func__); + writeset_free(&md->writesets[0]); + return r; + } + + value = cpu_to_le32(0u); + __dm_bless_for_disk(&value); + r = dm_array_resize(&md->era_array_info, md->era_array_root, + md->nr_blocks, *new_size, + &value, &md->era_array_root); + if (r) { + DMERR("%s: dm_array_resize failed", __func__); + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); + return r; + } + + md->nr_blocks = *new_size; + return 0; +} + +static int metadata_era_archive(struct era_metadata *md) +{ + int r; + uint64_t keys[1]; + struct writeset_disk value; + + r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, + &md->current_writeset->md.root); + if (r) { + DMERR("%s: dm_bitset_flush failed", __func__); + return r; + } + + ws_pack(&md->current_writeset->md, &value); + + keys[0] = md->current_era; + __dm_bless_for_disk(&value); + r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root, + keys, &value, &md->writeset_tree_root); + if (r) { + DMERR("%s: couldn't insert writeset into btree", __func__); + /* FIXME: fail mode */ + return r; + } + + md->current_writeset->md.root = INVALID_WRITESET_ROOT; + md->archived_writesets = true; + + return 0; +} + +static struct writeset *next_writeset(struct era_metadata *md) +{ + return (md->current_writeset == &md->writesets[0]) ? + &md->writesets[1] : &md->writesets[0]; +} + +static int metadata_new_era(struct era_metadata *md) +{ + int r; + struct writeset *new_writeset = next_writeset(md); + + r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks); + if (r) { + DMERR("%s: writeset_init failed", __func__); + return r; + } + + swap_writeset(md, new_writeset); + md->current_era++; + + return 0; +} + +static int metadata_era_rollover(struct era_metadata *md) +{ + int r; + + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { + r = metadata_era_archive(md); + if (r) { + DMERR("%s: metadata_archive_era failed", __func__); + /* FIXME: fail mode? */ + return r; + } + } + + r = metadata_new_era(md); + if (r) { + DMERR("%s: new era failed", __func__); + /* FIXME: fail mode */ + return r; + } + + return 0; +} + +static bool metadata_current_marked(struct era_metadata *md, dm_block_t block) +{ + bool r; + struct writeset *ws; + + rcu_read_lock(); + ws = rcu_dereference(md->current_writeset); + r = writeset_marked(ws, block); + rcu_read_unlock(); + + return r; +} + +static int metadata_commit(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { + r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, + &md->current_writeset->md.root); + if (r) { + DMERR("%s: bitset flush failed", __func__); + return r; + } + } + + r = dm_tm_pre_commit(md->tm); + if (r) { + DMERR("%s: pre commit failed", __func__); + return r; + } + + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + + r = superblock_lock(md, &sblock); + if (r) { + DMERR("%s: superblock lock failed", __func__); + return r; + } + + prepare_superblock(md, dm_block_data(sblock)); + + return dm_tm_commit(md->tm, sblock); +} + +static int metadata_checkpoint(struct era_metadata *md) +{ + /* + * For now we just rollover, but later I want to put a check in to + * avoid this if the filter is still pretty fresh. + */ + return metadata_era_rollover(md); +} + +/* + * Metadata snapshots allow userland to access era data. + */ +static int metadata_take_snap(struct era_metadata *md) +{ + int r, inc; + struct dm_block *clone; + + if (md->metadata_snap != SUPERBLOCK_LOCATION) { + DMERR("%s: metadata snapshot already exists", __func__); + return -EINVAL; + } + + r = metadata_era_rollover(md); + if (r) { + DMERR("%s: era rollover failed", __func__); + return r; + } + + r = metadata_commit(md); + if (r) { + DMERR("%s: pre commit failed", __func__); + return r; + } + + r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION); + if (r) { + DMERR("%s: couldn't increment superblock", __func__); + return r; + } + + r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION, + &sb_validator, &clone, &inc); + if (r) { + DMERR("%s: couldn't shadow superblock", __func__); + dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION); + return r; + } + BUG_ON(!inc); + + r = dm_sm_inc_block(md->sm, md->writeset_tree_root); + if (r) { + DMERR("%s: couldn't inc writeset tree root", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + r = dm_sm_inc_block(md->sm, md->era_array_root); + if (r) { + DMERR("%s: couldn't inc era tree root", __func__); + dm_sm_dec_block(md->sm, md->writeset_tree_root); + dm_tm_unlock(md->tm, clone); + return r; + } + + md->metadata_snap = dm_block_location(clone); + + dm_tm_unlock(md->tm, clone); + + return 0; +} + +static int metadata_drop_snap(struct era_metadata *md) +{ + int r; + dm_block_t location; + struct dm_block *clone; + struct superblock_disk *disk; + + if (md->metadata_snap == SUPERBLOCK_LOCATION) { + DMERR("%s: no snap to drop", __func__); + return -EINVAL; + } + + r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone); + if (r) { + DMERR("%s: couldn't read lock superblock clone", __func__); + return r; + } + + /* + * Whatever happens now we'll commit with no record of the metadata + * snap. + */ + md->metadata_snap = SUPERBLOCK_LOCATION; + + disk = dm_block_data(clone); + r = dm_btree_del(&md->writeset_tree_info, + le64_to_cpu(disk->writeset_tree_root)); + if (r) { + DMERR("%s: error deleting writeset tree clone", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root)); + if (r) { + DMERR("%s: error deleting era array clone", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + location = dm_block_location(clone); + dm_tm_unlock(md->tm, clone); + + return dm_sm_dec_block(md->sm, location); +} + +struct metadata_stats { + dm_block_t used; + dm_block_t total; + dm_block_t snap; + uint32_t era; +}; + +static int metadata_get_stats(struct era_metadata *md, void *ptr) +{ + int r; + struct metadata_stats *s = ptr; + dm_block_t nr_free, nr_total; + + r = dm_sm_get_nr_free(md->sm, &nr_free); + if (r) { + DMERR("dm_sm_get_nr_free returned %d", r); + return r; + } + + r = dm_sm_get_nr_blocks(md->sm, &nr_total); + if (r) { + DMERR("dm_pool_get_metadata_dev_size returned %d", r); + return r; + } + + s->used = nr_total - nr_free; + s->total = nr_total; + s->snap = md->metadata_snap; + s->era = md->current_era; + + return 0; +} + +/*----------------------------------------------------------------*/ + +struct era { + struct dm_target *ti; + + struct dm_dev *metadata_dev; + struct dm_dev *origin_dev; + + dm_block_t nr_blocks; + uint32_t sectors_per_block; + int sectors_per_block_shift; + struct era_metadata *md; + + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t deferred_lock; + struct bio_list deferred_bios; + + spinlock_t rpc_lock; + struct list_head rpc_calls; + + struct digest digest; + atomic_t suspended; +}; + +struct rpc { + struct list_head list; + + int (*fn0)(struct era_metadata *); + int (*fn1)(struct era_metadata *, void *); + void *arg; + int result; + + struct completion complete; +}; + +/*---------------------------------------------------------------- + * Remapping. + *---------------------------------------------------------------*/ +static bool block_size_is_power_of_two(struct era *era) +{ + return era->sectors_per_block_shift >= 0; +} + +static dm_block_t get_block(struct era *era, struct bio *bio) +{ + sector_t block_nr = bio->bi_iter.bi_sector; + + if (!block_size_is_power_of_two(era)) + (void) sector_div(block_nr, era->sectors_per_block); + else + block_nr >>= era->sectors_per_block_shift; + + return block_nr; +} + +static void remap_to_origin(struct era *era, struct bio *bio) +{ + bio_set_dev(bio, era->origin_dev->bdev); +} + +/*---------------------------------------------------------------- + * Worker thread + *--------------------------------------------------------------*/ +static void wake_worker(struct era *era) +{ + if (!atomic_read(&era->suspended)) + queue_work(era->wq, &era->worker); +} + +static void process_old_eras(struct era *era) +{ + int r; + + if (!era->digest.step) + return; + + r = era->digest.step(era->md, &era->digest); + if (r < 0) { + DMERR("%s: digest step failed, stopping digestion", __func__); + era->digest.step = NULL; + + } else if (era->digest.step) + wake_worker(era); +} + +static void process_deferred_bios(struct era *era) +{ + int r; + struct bio_list deferred_bios, marked_bios; + struct bio *bio; + struct blk_plug plug; + bool commit_needed = false; + bool failed = false; + struct writeset *ws = era->md->current_writeset; + + bio_list_init(&deferred_bios); + bio_list_init(&marked_bios); + + spin_lock(&era->deferred_lock); + bio_list_merge(&deferred_bios, &era->deferred_bios); + bio_list_init(&era->deferred_bios); + spin_unlock(&era->deferred_lock); + + if (bio_list_empty(&deferred_bios)) + return; + + while ((bio = bio_list_pop(&deferred_bios))) { + r = writeset_test_and_set(&era->md->bitset_info, ws, + get_block(era, bio)); + if (r < 0) { + /* + * This is bad news, we need to rollback. + * FIXME: finish. + */ + failed = true; + } else if (r == 0) + commit_needed = true; + + bio_list_add(&marked_bios, bio); + } + + if (commit_needed) { + r = metadata_commit(era->md); + if (r) + failed = true; + } + + if (failed) + while ((bio = bio_list_pop(&marked_bios))) + bio_io_error(bio); + else { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&marked_bios))) { + /* + * Only update the in-core writeset if the on-disk one + * was updated too. + */ + if (commit_needed) + set_bit(get_block(era, bio), ws->bits); + submit_bio_noacct(bio); + } + blk_finish_plug(&plug); + } +} + +static void process_rpc_calls(struct era *era) +{ + int r; + bool need_commit = false; + struct list_head calls; + struct rpc *rpc, *tmp; + + INIT_LIST_HEAD(&calls); + spin_lock(&era->rpc_lock); + list_splice_init(&era->rpc_calls, &calls); + spin_unlock(&era->rpc_lock); + + list_for_each_entry_safe(rpc, tmp, &calls, list) { + rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg); + need_commit = true; + } + + if (need_commit) { + r = metadata_commit(era->md); + if (r) + list_for_each_entry_safe(rpc, tmp, &calls, list) + rpc->result = r; + } + + list_for_each_entry_safe(rpc, tmp, &calls, list) + complete(&rpc->complete); +} + +static void kick_off_digest(struct era *era) +{ + if (era->md->archived_writesets) { + era->md->archived_writesets = false; + metadata_digest_start(era->md, &era->digest); + } +} + +static void do_work(struct work_struct *ws) +{ + struct era *era = container_of(ws, struct era, worker); + + kick_off_digest(era); + process_old_eras(era); + process_deferred_bios(era); + process_rpc_calls(era); +} + +static void defer_bio(struct era *era, struct bio *bio) +{ + spin_lock(&era->deferred_lock); + bio_list_add(&era->deferred_bios, bio); + spin_unlock(&era->deferred_lock); + + wake_worker(era); +} + +/* + * Make an rpc call to the worker to change the metadata. + */ +static int perform_rpc(struct era *era, struct rpc *rpc) +{ + rpc->result = 0; + init_completion(&rpc->complete); + + spin_lock(&era->rpc_lock); + list_add(&rpc->list, &era->rpc_calls); + spin_unlock(&era->rpc_lock); + + wake_worker(era); + wait_for_completion(&rpc->complete); + + return rpc->result; +} + +static int in_worker0(struct era *era, int (*fn)(struct era_metadata *)) +{ + struct rpc rpc; + rpc.fn0 = fn; + rpc.fn1 = NULL; + + return perform_rpc(era, &rpc); +} + +static int in_worker1(struct era *era, + int (*fn)(struct era_metadata *, void *), void *arg) +{ + struct rpc rpc; + rpc.fn0 = NULL; + rpc.fn1 = fn; + rpc.arg = arg; + + return perform_rpc(era, &rpc); +} + +static void start_worker(struct era *era) +{ + atomic_set(&era->suspended, 0); +} + +static void stop_worker(struct era *era) +{ + atomic_set(&era->suspended, 1); + drain_workqueue(era->wq); +} + +/*---------------------------------------------------------------- + * Target methods + *--------------------------------------------------------------*/ +static void era_destroy(struct era *era) +{ + if (era->md) + metadata_close(era->md); + + if (era->wq) + destroy_workqueue(era->wq); + + if (era->origin_dev) + dm_put_device(era->ti, era->origin_dev); + + if (era->metadata_dev) + dm_put_device(era->ti, era->metadata_dev); + + kfree(era); +} + +static dm_block_t calc_nr_blocks(struct era *era) +{ + return dm_sector_div_up(era->ti->len, era->sectors_per_block); +} + +static bool valid_block_size(dm_block_t block_size) +{ + bool greater_than_zero = block_size > 0; + bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0; + + return greater_than_zero && multiple_of_min_block_size; +} + +/* + * + */ +static int era_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + char dummy; + struct era *era; + struct era_metadata *md; + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + era = kzalloc(sizeof(*era), GFP_KERNEL); + if (!era) { + ti->error = "Error allocating era structure"; + return -ENOMEM; + } + + era->ti = ti; + + r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); + if (r) { + ti->error = "Error opening metadata device"; + era_destroy(era); + return -EINVAL; + } + + r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); + if (r) { + ti->error = "Error opening data device"; + era_destroy(era); + return -EINVAL; + } + + r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy); + if (r != 1) { + ti->error = "Error parsing block size"; + era_destroy(era); + return -EINVAL; + } + + r = dm_set_target_max_io_len(ti, era->sectors_per_block); + if (r) { + ti->error = "could not set max io len"; + era_destroy(era); + return -EINVAL; + } + + if (!valid_block_size(era->sectors_per_block)) { + ti->error = "Invalid block size"; + era_destroy(era); + return -EINVAL; + } + if (era->sectors_per_block & (era->sectors_per_block - 1)) + era->sectors_per_block_shift = -1; + else + era->sectors_per_block_shift = __ffs(era->sectors_per_block); + + md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true); + if (IS_ERR(md)) { + ti->error = "Error reading metadata"; + era_destroy(era); + return PTR_ERR(md); + } + era->md = md; + + era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!era->wq) { + ti->error = "could not create workqueue for metadata object"; + era_destroy(era); + return -ENOMEM; + } + INIT_WORK(&era->worker, do_work); + + spin_lock_init(&era->deferred_lock); + bio_list_init(&era->deferred_bios); + + spin_lock_init(&era->rpc_lock); + INIT_LIST_HEAD(&era->rpc_calls); + + ti->private = era; + ti->num_flush_bios = 1; + ti->flush_supported = true; + + ti->num_discard_bios = 1; + + return 0; +} + +static void era_dtr(struct dm_target *ti) +{ + era_destroy(ti->private); +} + +static int era_map(struct dm_target *ti, struct bio *bio) +{ + struct era *era = ti->private; + dm_block_t block = get_block(era, bio); + + /* + * All bios get remapped to the origin device. We do this now, but + * it may not get issued until later. Depending on whether the + * block is marked in this era. + */ + remap_to_origin(era, bio); + + /* + * REQ_PREFLUSH bios carry no data, so we're not interested in them. + */ + if (!(bio->bi_opf & REQ_PREFLUSH) && + (bio_data_dir(bio) == WRITE) && + !metadata_current_marked(era->md, block)) { + defer_bio(era, bio); + return DM_MAPIO_SUBMITTED; + } + + return DM_MAPIO_REMAPPED; +} + +static void era_postsuspend(struct dm_target *ti) +{ + int r; + struct era *era = ti->private; + + r = in_worker0(era, metadata_era_archive); + if (r) { + DMERR("%s: couldn't archive current era", __func__); + /* FIXME: fail mode */ + } + + stop_worker(era); + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); + /* FIXME: fail mode */ + } +} + +static int era_preresume(struct dm_target *ti) +{ + int r; + struct era *era = ti->private; + dm_block_t new_size = calc_nr_blocks(era); + + if (era->nr_blocks != new_size) { + r = metadata_resize(era->md, &new_size); + if (r) { + DMERR("%s: metadata_resize failed", __func__); + return r; + } + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); + return r; + } + + era->nr_blocks = new_size; + } + + start_worker(era); + + r = in_worker0(era, metadata_era_rollover); + if (r) { + DMERR("%s: metadata_era_rollover failed", __func__); + return r; + } + + return 0; +} + +/* + * Status format: + * + * <#used metadata blocks>/<#total metadata blocks> + * + */ +static void era_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + int r; + struct era *era = ti->private; + ssize_t sz = 0; + struct metadata_stats stats; + char buf[BDEVNAME_SIZE]; + + switch (type) { + case STATUSTYPE_INFO: + r = in_worker1(era, metadata_get_stats, &stats); + if (r) + goto err; + + DMEMIT("%u %llu/%llu %u", + (unsigned int) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned long long) stats.used, + (unsigned long long) stats.total, + (unsigned int) stats.era); + + if (stats.snap != SUPERBLOCK_LOCATION) + DMEMIT(" %llu", stats.snap); + else + DMEMIT(" -"); + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, era->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, era->origin_dev->bdev->bd_dev); + DMEMIT("%s %u", buf, era->sectors_per_block); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return; + +err: + DMEMIT("Error"); +} + +static int era_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct era *era = ti->private; + + if (argc != 1) { + DMERR("incorrect number of message arguments"); + return -EINVAL; + } + + if (!strcasecmp(argv[0], "checkpoint")) + return in_worker0(era, metadata_checkpoint); + + if (!strcasecmp(argv[0], "take_metadata_snap")) + return in_worker0(era, metadata_take_snap); + + if (!strcasecmp(argv[0], "drop_metadata_snap")) + return in_worker0(era, metadata_drop_snap); + + DMERR("unsupported message '%s'", argv[0]); + return -EINVAL; +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return bdev_nr_sectors(dev->bdev); +} + +static int era_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct era *era = ti->private; + return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data); +} + +static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct era *era = ti->private; + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with the + * era device's blocksize (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < era->sectors_per_block || + do_div(io_opt_sectors, era->sectors_per_block)) { + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); + } +} + +/*----------------------------------------------------------------*/ + +static struct target_type era_target = { + .name = "era", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = era_ctr, + .dtr = era_dtr, + .map = era_map, + .postsuspend = era_postsuspend, + .preresume = era_preresume, + .status = era_status, + .message = era_message, + .iterate_devices = era_iterate_devices, + .io_hints = era_io_hints +}; + +static int __init dm_era_init(void) +{ + int r; + + r = dm_register_target(&era_target); + if (r) { + DMERR("era target registration failed: %d", r); + return r; + } + + return 0; +} + +static void __exit dm_era_exit(void) +{ + dm_unregister_target(&era_target); +} + +module_init(dm_era_init); +module_exit(dm_era_exit); + +MODULE_DESCRIPTION(DM_NAME " era target"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c new file mode 100644 index 000000000..cc3987c97 --- /dev/null +++ b/drivers/md/dm-exception-store.c @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-exception-store.h" + +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "snapshot exception stores" + +static LIST_HEAD(_exception_store_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_exception_store_type *__find_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + list_for_each_entry(type, &_exception_store_types, list) + if (!strcmp(name, type->name)) + return type; + + return NULL; +} + +static struct dm_exception_store_type *_get_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + spin_lock(&_lock); + + type = __find_exception_store_type(name); + + if (type && !try_module_get(type->module)) + type = NULL; + + spin_unlock(&_lock); + + return type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_exception_store_type by name. If not already + * available, attempt to load the appropriate module. + * + * Exstore modules are named "dm-exstore-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-exstore-", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-shared", it would search + * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. + * + * 'dm-exception-store-' is too long of a name in my + * opinion, which is why I've chosen to have the files + * containing exception store implementations be 'dm-exstore-'. + * If you want your module to be autoloaded, you will follow this + * naming convention. + * + * Returns: dm_exception_store_type* on success, NULL on failure + */ +static struct dm_exception_store_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_exception_store_type *type; + + type = _get_exception_store_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMERR("No memory left to attempt load for \"%s\"", type_name); + return NULL; + } + + while (request_module("dm-exstore-%s", type_name_dup) || + !(type = _get_exception_store_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for exstore type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + +static void put_type(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + module_put(type->module); + spin_unlock(&_lock); +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_exception_store_type(type->name)) + list_add(&type->list, &_exception_store_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_exception_store_type_register); + +int dm_exception_store_type_unregister(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + + if (!__find_exception_store_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_exception_store_type_unregister); + +static int set_chunk_size(struct dm_exception_store *store, + const char *chunk_size_arg, char **error) +{ + unsigned int chunk_size; + + if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size) { + store->chunk_size = store->chunk_mask = store->chunk_shift = 0; + return 0; + } + + return dm_exception_store_set_chunk_size(store, chunk_size, error); +} + +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned int chunk_size, + char **error) +{ + /* Check chunk_size is a power of 2 */ + if (!is_power_of_2(chunk_size)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } + + /* Validate the chunk size against the device block size */ + if (chunk_size % + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } + + if (chunk_size > INT_MAX >> SECTOR_SHIFT) { + *error = "Chunk size is too high"; + return -EINVAL; + } + + store->chunk_size = chunk_size; + store->chunk_mask = chunk_size - 1; + store->chunk_shift = __ffs(chunk_size); + + return 0; +} + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned int *args_used, + struct dm_exception_store **store) +{ + int r = 0; + struct dm_exception_store_type *type = NULL; + struct dm_exception_store *tmp_store; + char persistent; + + if (argc < 2) { + ti->error = "Insufficient exception store arguments"; + return -EINVAL; + } + + tmp_store = kzalloc(sizeof(*tmp_store), GFP_KERNEL); + if (!tmp_store) { + ti->error = "Exception store allocation failed"; + return -ENOMEM; + } + + persistent = toupper(*argv[0]); + if (persistent == 'P') + type = get_type("P"); + else if (persistent == 'N') + type = get_type("N"); + else { + ti->error = "Exception store type is not P or N"; + r = -EINVAL; + goto bad_type; + } + + if (!type) { + ti->error = "Exception store type not recognised"; + r = -EINVAL; + goto bad_type; + } + + tmp_store->type = type; + tmp_store->snap = snap; + + r = set_chunk_size(tmp_store, argv[1], &ti->error); + if (r) + goto bad; + + r = type->ctr(tmp_store, (strlen(argv[0]) > 1 ? &argv[0][1] : NULL)); + if (r) { + ti->error = "Exception store type constructor failed"; + goto bad; + } + + *args_used = 2; + *store = tmp_store; + return 0; + +bad: + put_type(type); +bad_type: + kfree(tmp_store); + return r; +} +EXPORT_SYMBOL(dm_exception_store_create); + +void dm_exception_store_destroy(struct dm_exception_store *store) +{ + store->type->dtr(store); + put_type(store->type); + kfree(store); +} +EXPORT_SYMBOL(dm_exception_store_destroy); + +int dm_exception_store_init(void) +{ + int r; + + r = dm_transient_snapshot_init(); + if (r) { + DMERR("Unable to register transient exception store type."); + goto transient_fail; + } + + r = dm_persistent_snapshot_init(); + if (r) { + DMERR("Unable to register persistent exception store type"); + goto persistent_fail; + } + + return 0; + +persistent_fail: + dm_transient_snapshot_exit(); +transient_fail: + return r; +} + +void dm_exception_store_exit(void) +{ + dm_persistent_snapshot_exit(); + dm_transient_snapshot_exit(); +} diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h new file mode 100644 index 000000000..862df68a7 --- /dev/null +++ b/drivers/md/dm-exception-store.h @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * Device-mapper snapshot exception store. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_EXCEPTION_STORE +#define _LINUX_DM_EXCEPTION_STORE + +#include +#include +#include + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 32k - 512k. + */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number + * of chunks that follow contiguously. Remaining bits hold the number of the + * chunk within the device. + */ +struct dm_exception { + struct hlist_bl_node hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct dm_exception_store; +struct dm_exception_store_type { + const char *name; + struct module *module; + + int (*ctr) (struct dm_exception_store *store, char *options); + + /* + * Destroys this object when you've finished with it. + */ + void (*dtr) (struct dm_exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. As exceptions are read from the COW, they are + * reported back via the callback. + */ + int (*read_metadata) (struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct dm_exception_store *store, + struct dm_exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct dm_exception_store *store, + struct dm_exception *e, int valid, + void (*callback) (void *, int success), + void *callback_context); + + /* + * Returns 0 if the exception store is empty. + * + * If there are exceptions still to be merged, sets + * *last_old_chunk and *last_new_chunk to the most recent + * still-to-be-merged chunk and returns the number of + * consecutive previous ones. + */ + int (*prepare_merge) (struct dm_exception_store *store, + chunk_t *last_old_chunk, chunk_t *last_new_chunk); + + /* + * Clear the last n exceptions. + * nr_merged must be <= the value returned by prepare_merge. + */ + int (*commit_merge) (struct dm_exception_store *store, int nr_merged); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct dm_exception_store *store); + + unsigned int (*status) (struct dm_exception_store *store, + status_type_t status, char *result, + unsigned int maxlen); + + /* + * Return how full the snapshot is. + */ + void (*usage) (struct dm_exception_store *store, + sector_t *total_sectors, sector_t *sectors_allocated, + sector_t *metadata_sectors); + + /* For internal device-mapper use only. */ + struct list_head list; +}; + +struct dm_snapshot; + +struct dm_exception_store { + struct dm_exception_store_type *type; + struct dm_snapshot *snap; + + /* Size of data blocks saved - must be a power of 2 */ + unsigned int chunk_size; + unsigned int chunk_mask; + unsigned int chunk_shift; + + void *context; + + bool userspace_supports_overflow; +}; + +/* + * Obtain the origin or cow device used by a given snapshot. + */ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); +struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); + +/* + * Funtions to manipulate consecutive chunks + */ +#define DM_CHUNK_CONSECUTIVE_BITS 8 +#define DM_CHUNK_NUMBER_BITS 56 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); +} + +static inline unsigned int dm_consecutive_chunk_count(struct dm_exception *e) +{ + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); + + BUG_ON(!dm_consecutive_chunk_count(e)); +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ + BUG_ON(!dm_consecutive_chunk_count(e)); + + e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); +} + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return bdev_nr_sectors(bdev); +} + +static inline chunk_t sector_to_chunk(struct dm_exception_store *store, + sector_t sector) +{ + return sector >> store->chunk_shift; +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type); +int dm_exception_store_type_unregister(struct dm_exception_store_type *type); + +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned int chunk_size, + char **error); + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned int *args_used, + struct dm_exception_store **store); +void dm_exception_store_destroy(struct dm_exception_store *store); + +int dm_exception_store_init(void); +void dm_exception_store_exit(void); + +/* + * Two exception store implementations. + */ +int dm_persistent_snapshot_init(void); +void dm_persistent_snapshot_exit(void); + +int dm_transient_snapshot_init(void); +void dm_transient_snapshot_exit(void); + +#endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 000000000..3b34270ce --- /dev/null +++ b/drivers/md/dm-flakey.c @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "flakey" + +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_opf & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + +/* + * Flakey: Used for testing only, simulates intermittent, + * catastrophic device failure. + */ +struct flakey_c { + struct dm_dev *dev; + unsigned long start_time; + sector_t start; + unsigned int up_interval; + unsigned int down_interval; + unsigned long flags; + unsigned int corrupt_bio_byte; + unsigned int corrupt_bio_rw; + unsigned int corrupt_bio_value; + blk_opf_t corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES, + ERROR_WRITES +}; + +struct per_bio_data { + bool bio_submitted; +}; + +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned int argc; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + if (!arg_name) { + ti->error = "Insufficient feature arguments"; + return -EINVAL; + } + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } else if (test_bit(ERROR_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes conflicts with feature error_writes"; + return -EINVAL; + } + + continue; + } + + /* + * error_writes + */ + if (!strcasecmp(arg_name, "error_writes")) { + if (test_and_set_bit(ERROR_WRITES, &fc->flags)) { + ti->error = "Feature error_writes duplicated"; + return -EINVAL; + + } else if (test_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature error_writes conflicts with feature drop_writes"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) { + ti->error = "Feature corrupt_bio_byte requires parameters"; + return -EINVAL; + } + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (arg_name && !strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (arg_name && !strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + BUILD_BUG_ON(sizeof(fc->corrupt_bio_flags) != + sizeof(unsigned int)); + r = dm_read_arg(_args + 3, as, + (__force unsigned int *)&fc->corrupt_bio_flags, + &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + + } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + +/* + * Construct a flakey mapping: + * [<#feature args> []*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte ] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. + */ +static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + static const struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; + struct flakey_c *fc; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; + char dummy; + + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + fc = kzalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + fc->start_time = jiffies; + + devname = dm_shift_arg(&as); + + r = -EINVAL; + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { + ti->error = "Invalid device sector"; + goto bad; + } + fc->start = tmpll; + + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) + goto bad; + + if (!(fc->up_interval + fc->down_interval)) { + ti->error = "Total (up + down) interval is zero"; + r = -EINVAL; + goto bad; + } + + if (fc->up_interval + fc->down_interval < fc->up_interval) { + ti->error = "Interval overflow"; + r = -EINVAL; + goto bad; + } + + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + r = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_io_data_size = sizeof(struct per_bio_data); + ti->private = fc; + return 0; + +bad: + kfree(fc); + return r; +} + +static void flakey_dtr(struct dm_target *ti) +{ + struct flakey_c *fc = ti->private; + + dm_put_device(ti, fc->dev); + kfree(fc); +} + +static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct flakey_c *fc = ti->private; + + return fc->start + dm_target_offset(ti, bi_sector); +} + +static void flakey_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + + bio_set_dev(bio, fc->dev->bdev); + bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); +} + +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; + + struct bvec_iter iter; + struct bio_vec bvec; + + if (!bio_has_data(bio)) + return; + + /* + * Overwrite the Nth byte of the bio's data, on whichever page + * it falls. + */ + bio_for_each_segment(bvec, bio, iter) { + if (bio_iter_len(bio, iter) > corrupt_bio_byte) { + char *segment; + struct page *page = bio_iter_page(bio, iter); + if (unlikely(page == ZERO_PAGE(0))) + break; + segment = bvec_kmap_local(&bvec); + segment[corrupt_bio_byte] = fc->corrupt_bio_value; + kunmap_local(segment); + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + break; + } + corrupt_bio_byte -= bio_iter_len(bio, iter); + } +} + +static int flakey_map(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + unsigned int elapsed; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + pb->bio_submitted = false; + + if (op_is_zone_mgmt(bio_op(bio))) + goto map_bio; + + /* Are we alive ? */ + elapsed = (jiffies - fc->start_time) / HZ; + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + pb->bio_submitted = true; + + /* + * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. + * Otherwise, flakey_end_io() will decide if the reads should be modified. + */ + if (bio_data_dir(bio) == READ) { + if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) && + !test_bit(ERROR_WRITES, &fc->flags)) + return DM_MAPIO_KILL; + goto map_bio; + } + + /* + * Drop or error writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + else if (test_bit(ERROR_WRITES, &fc->flags)) { + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. + */ + if (fc->corrupt_bio_byte) { + if (fc->corrupt_bio_rw == WRITE) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + } + goto map_bio; + } + + /* + * By default, error all I/O. + */ + return DM_MAPIO_KILL; + } + +map_bio: + flakey_map_bio(ti, bio); + + return DM_MAPIO_REMAPPED; +} + +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) +{ + struct flakey_c *fc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + + if (op_is_zone_mgmt(bio_op(bio))) + return DM_ENDIO_DONE; + + if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (fc->corrupt_bio_byte) { + if ((fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) { + /* + * Corrupt successful matching READs while in down state. + */ + corrupt_bio_data(bio, fc); + } + } else if (!test_bit(DROP_WRITES, &fc->flags) && + !test_bit(ERROR_WRITES, &fc->flags)) { + /* + * Error read during the down_interval if drop_writes + * and error_writes were not configured. + */ + *error = BLK_STS_IOERR; + } + } + + return DM_ENDIO_DONE; +} + +static void flakey_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct flakey_c *fc = ti->private; + unsigned int drop_writes, error_writes; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + error_writes = test_bit(ERROR_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + else if (error_writes) + DMEMIT("error_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + + break; + + case STATUSTYPE_IMA: + result[0] = '\0'; + break; + } +} + +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct flakey_c *fc = ti->private; + + *bdev = fc->dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (fc->start || ti->len != bdev_nr_sectors((*bdev))) + return 1; + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int flakey_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct flakey_c *fc = ti->private; + + return dm_report_zones(fc->dev->bdev, fc->start, + flakey_map_sector(ti, args->next_sector), + args, nr_zones); +} +#else +#define flakey_report_zones NULL +#endif + +static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct flakey_c *fc = ti->private; + + return fn(ti, fc->dev, fc->start, ti->len, data); +} + +static struct target_type flakey_target = { + .name = "flakey", + .version = {1, 5, 0}, + .features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, + .report_zones = flakey_report_zones, + .module = THIS_MODULE, + .ctr = flakey_ctr, + .dtr = flakey_dtr, + .map = flakey_map, + .end_io = flakey_end_io, + .status = flakey_status, + .prepare_ioctl = flakey_prepare_ioctl, + .iterate_devices = flakey_iterate_devices, +}; + +static int __init dm_flakey_init(void) +{ + int r = dm_register_target(&flakey_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_flakey_exit(void) +{ + dm_unregister_target(&flakey_target); +} + +/* Module hooks */ +module_init(dm_flakey_init); +module_exit(dm_flakey_exit); + +MODULE_DESCRIPTION(DM_NAME " flakey target"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c new file mode 100644 index 000000000..a1bd7cd52 --- /dev/null +++ b/drivers/md/dm-ima.c @@ -0,0 +1,749 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Microsoft Corporation + * + * Author: Tushar Sugandhi + * + * File: dm-ima.c + * Enables IMA measurements for DM targets + */ + +#include "dm-core.h" +#include "dm-ima.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "ima" + +/* + * Internal function to prefix separator characters in input buffer with escape + * character, so that they don't interfere with the construction of key-value pairs, + * and clients can split the key1=val1,key2=val2,key3=val3; pairs properly. + */ +static void fix_separator_chars(char **buf) +{ + int l = strlen(*buf); + int i, j, sp = 0; + + for (i = 0; i < l; i++) + if ((*buf)[i] == '\\' || (*buf)[i] == ';' || (*buf)[i] == '=' || (*buf)[i] == ',') + sp++; + + if (!sp) + return; + + for (i = l-1, j = i+sp; i >= 0; i--) { + (*buf)[j--] = (*buf)[i]; + if ((*buf)[i] == '\\' || (*buf)[i] == ';' || (*buf)[i] == '=' || (*buf)[i] == ',') + (*buf)[j--] = '\\'; + } +} + +/* + * Internal function to allocate memory for IMA measurements. + */ +static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +{ + unsigned int noio_flag; + void *ptr; + + if (noio) + noio_flag = memalloc_noio_save(); + + ptr = kzalloc(len, flags); + + if (noio) + memalloc_noio_restore(noio_flag); + + return ptr; +} + +/* + * Internal function to allocate and copy name and uuid for IMA measurements. + */ +static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_name, + char **dev_uuid, bool noio) +{ + int r; + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + if (!(*dev_name)) { + r = -ENOMEM; + goto error; + } + + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + if (!(*dev_uuid)) { + r = -ENOMEM; + goto error; + } + + r = dm_copy_name_and_uuid(md, *dev_name, *dev_uuid); + if (r) + goto error; + + fix_separator_chars(dev_name); + fix_separator_chars(dev_uuid); + + return 0; +error: + kfree(*dev_name); + kfree(*dev_uuid); + *dev_name = NULL; + *dev_uuid = NULL; + return r; +} + +/* + * Internal function to allocate and copy device data for IMA measurements. + */ +static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **device_data, + unsigned int num_targets, bool noio) +{ + char *dev_name = NULL, *dev_uuid = NULL; + int r; + + r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); + if (r) + return r; + + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + if (!(*device_data)) { + r = -ENOMEM; + goto error; + } + + scnprintf(*device_data, DM_IMA_DEVICE_BUF_LEN, + "name=%s,uuid=%s,major=%d,minor=%d,minor_count=%d,num_targets=%u;", + dev_name, dev_uuid, md->disk->major, md->disk->first_minor, + md->disk->minors, num_targets); +error: + kfree(dev_name); + kfree(dev_uuid); + return r; +} + +/* + * Internal wrapper function to call IMA to measure DM data. + */ +static void dm_ima_measure_data(const char *event_name, const void *buf, size_t buf_len, + bool noio) +{ + unsigned int noio_flag; + + if (noio) + noio_flag = memalloc_noio_save(); + + ima_measure_critical_data(DM_NAME, event_name, buf, buf_len, + false, NULL, 0); + + if (noio) + memalloc_noio_restore(noio_flag); +} + +/* + * Internal function to allocate and copy current device capacity for IMA measurements. + */ +static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **capacity_str, + bool noio) +{ + sector_t capacity; + + capacity = get_capacity(md->disk); + + *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio); + if (!(*capacity_str)) + return -ENOMEM; + + scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", + capacity); + + return 0; +} + +/* + * Initialize/reset the dm ima related data structure variables. + */ +void dm_ima_reset_data(struct mapped_device *md) +{ + memset(&(md->ima), 0, sizeof(md->ima)); + md->ima.dm_version_str_len = strlen(DM_IMA_VERSION_STR); +} + +/* + * Build up the IMA data for each target, and finally measure. + */ +void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags) +{ + size_t device_data_buf_len, target_metadata_buf_len, target_data_buf_len, l = 0; + char *target_metadata_buf = NULL, *target_data_buf = NULL, *digest_buf = NULL; + char *ima_buf = NULL, *device_data_buf = NULL; + int digest_size, last_target_measured = -1, r; + status_type_t type = STATUSTYPE_IMA; + size_t cur_total_buf_len = 0; + unsigned int num_targets, i; + SHASH_DESC_ON_STACK(shash, NULL); + struct crypto_shash *tfm = NULL; + u8 *digest = NULL; + bool noio = false; + /* + * In below hash_alg_prefix_len assignment +1 is for the additional char (':'), + * when prefixing the hash value with the hash algorithm name. e.g. sha256:. + */ + const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; + char table_load_event_name[] = "dm_table_load"; + + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + if (!ima_buf) + return; + + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + if (!target_metadata_buf) + goto error; + + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + if (!target_data_buf) + goto error; + + num_targets = table->num_targets; + + if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) + goto error; + + tfm = crypto_alloc_shash(DM_IMA_TABLE_HASH_ALG, 0, 0); + if (IS_ERR(tfm)) + goto error; + + shash->tfm = tfm; + digest_size = crypto_shash_digestsize(tfm); + digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + if (!digest) + goto error; + + r = crypto_shash_init(shash); + if (r) + goto error; + + memcpy(ima_buf + l, DM_IMA_VERSION_STR, table->md->ima.dm_version_str_len); + l += table->md->ima.dm_version_str_len; + + device_data_buf_len = strlen(device_data_buf); + memcpy(ima_buf + l, device_data_buf, device_data_buf_len); + l += device_data_buf_len; + + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + last_target_measured = 0; + + /* + * First retrieve the target metadata. + */ + scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN, + "target_index=%d,target_begin=%llu,target_len=%llu,", + i, ti->begin, ti->len); + target_metadata_buf_len = strlen(target_metadata_buf); + + /* + * Then retrieve the actual target data. + */ + if (ti->type->status) + ti->type->status(ti, type, status_flags, target_data_buf, + DM_IMA_TARGET_DATA_BUF_LEN); + else + target_data_buf[0] = '\0'; + + target_data_buf_len = strlen(target_data_buf); + + /* + * Check if the total data can fit into the IMA buffer. + */ + cur_total_buf_len = l + target_metadata_buf_len + target_data_buf_len; + + /* + * IMA measurements for DM targets are best-effort. + * If the total data buffered so far, including the current target, + * is too large to fit into DM_IMA_MEASUREMENT_BUF_LEN, measure what + * we have in the current buffer, and continue measuring the remaining + * targets by prefixing the device metadata again. + */ + if (unlikely(cur_total_buf_len >= DM_IMA_MEASUREMENT_BUF_LEN)) { + dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); + r = crypto_shash_update(shash, (const u8 *)ima_buf, l); + if (r < 0) + goto error; + + memset(ima_buf, 0, DM_IMA_MEASUREMENT_BUF_LEN); + l = 0; + + /* + * Each new "dm_table_load" entry in IMA log should have device data + * prefix, so that multiple records from the same "dm_table_load" for + * a given device can be linked together. + */ + memcpy(ima_buf + l, DM_IMA_VERSION_STR, table->md->ima.dm_version_str_len); + l += table->md->ima.dm_version_str_len; + + memcpy(ima_buf + l, device_data_buf, device_data_buf_len); + l += device_data_buf_len; + + /* + * If this iteration of the for loop turns out to be the last target + * in the table, dm_ima_measure_data("dm_table_load", ...) doesn't need + * to be called again, just the hash needs to be finalized. + * "last_target_measured" tracks this state. + */ + last_target_measured = 1; + } + + /* + * Fill-in all the target metadata, so that multiple targets for the same + * device can be linked together. + */ + memcpy(ima_buf + l, target_metadata_buf, target_metadata_buf_len); + l += target_metadata_buf_len; + + memcpy(ima_buf + l, target_data_buf, target_data_buf_len); + l += target_data_buf_len; + } + + if (!last_target_measured) { + dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); + + r = crypto_shash_update(shash, (const u8 *)ima_buf, l); + if (r < 0) + goto error; + } + + /* + * Finalize the table hash, and store it in table->md->ima.inactive_table.hash, + * so that the table data can be verified against the future device state change + * events, e.g. resume, rename, remove, table-clear etc. + */ + r = crypto_shash_final(shash, digest); + if (r < 0) + goto error; + + digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio); + + if (!digest_buf) + goto error; + + snprintf(digest_buf, hash_alg_prefix_len + 1, "%s:", DM_IMA_TABLE_HASH_ALG); + + for (i = 0; i < digest_size; i++) + snprintf((digest_buf + hash_alg_prefix_len + (i*2)), 3, "%02x", digest[i]); + + if (table->md->ima.active_table.hash != table->md->ima.inactive_table.hash) + kfree(table->md->ima.inactive_table.hash); + + table->md->ima.inactive_table.hash = digest_buf; + table->md->ima.inactive_table.hash_len = strlen(digest_buf); + table->md->ima.inactive_table.num_targets = num_targets; + + if (table->md->ima.active_table.device_metadata != + table->md->ima.inactive_table.device_metadata) + kfree(table->md->ima.inactive_table.device_metadata); + + table->md->ima.inactive_table.device_metadata = device_data_buf; + table->md->ima.inactive_table.device_metadata_len = device_data_buf_len; + + goto exit; +error: + kfree(digest_buf); + kfree(device_data_buf); +exit: + kfree(digest); + if (tfm) + crypto_free_shash(tfm); + kfree(ima_buf); + kfree(target_metadata_buf); + kfree(target_data_buf); +} + +/* + * Measure IMA data on device resume. + */ +void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) +{ + char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; + char active[] = "active_table_hash="; + unsigned int active_len = strlen(active), capacity_len = 0; + unsigned int l = 0; + bool noio = true; + bool nodata = true; + int r; + + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + if (!device_table_data) + return; + + r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (r) + goto error; + + memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); + l += md->ima.dm_version_str_len; + + if (swap) { + if (md->ima.active_table.hash != md->ima.inactive_table.hash) + kfree(md->ima.active_table.hash); + + md->ima.active_table.hash = NULL; + md->ima.active_table.hash_len = 0; + + if (md->ima.active_table.device_metadata != + md->ima.inactive_table.device_metadata) + kfree(md->ima.active_table.device_metadata); + + md->ima.active_table.device_metadata = NULL; + md->ima.active_table.device_metadata_len = 0; + md->ima.active_table.num_targets = 0; + + if (md->ima.inactive_table.hash) { + md->ima.active_table.hash = md->ima.inactive_table.hash; + md->ima.active_table.hash_len = md->ima.inactive_table.hash_len; + md->ima.inactive_table.hash = NULL; + md->ima.inactive_table.hash_len = 0; + } + + if (md->ima.inactive_table.device_metadata) { + md->ima.active_table.device_metadata = + md->ima.inactive_table.device_metadata; + md->ima.active_table.device_metadata_len = + md->ima.inactive_table.device_metadata_len; + md->ima.active_table.num_targets = md->ima.inactive_table.num_targets; + md->ima.inactive_table.device_metadata = NULL; + md->ima.inactive_table.device_metadata_len = 0; + md->ima.inactive_table.num_targets = 0; + } + } + + if (md->ima.active_table.device_metadata) { + memcpy(device_table_data + l, md->ima.active_table.device_metadata, + md->ima.active_table.device_metadata_len); + l += md->ima.active_table.device_metadata_len; + + nodata = false; + } + + if (md->ima.active_table.hash) { + memcpy(device_table_data + l, active, active_len); + l += active_len; + + memcpy(device_table_data + l, md->ima.active_table.hash, + md->ima.active_table.hash_len); + l += md->ima.active_table.hash_len; + + memcpy(device_table_data + l, ";", 1); + l++; + + nodata = false; + } + + if (nodata) { + r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); + if (r) + goto error; + + scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_resume=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); + l = strlen(device_table_data); + + } + + capacity_len = strlen(capacity_str); + memcpy(device_table_data + l, capacity_str, capacity_len); + l += capacity_len; + + dm_ima_measure_data("dm_device_resume", device_table_data, l, noio); + + kfree(dev_name); + kfree(dev_uuid); +error: + kfree(capacity_str); + kfree(device_table_data); +} + +/* + * Measure IMA data on remove. + */ +void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) +{ + char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; + char active_table_str[] = "active_table_hash="; + char inactive_table_str[] = "inactive_table_hash="; + char device_active_str[] = "device_active_metadata="; + char device_inactive_str[] = "device_inactive_metadata="; + char remove_all_str[] = "remove_all="; + unsigned int active_table_len = strlen(active_table_str); + unsigned int inactive_table_len = strlen(inactive_table_str); + unsigned int device_active_len = strlen(device_active_str); + unsigned int device_inactive_len = strlen(device_inactive_str); + unsigned int remove_all_len = strlen(remove_all_str); + unsigned int capacity_len = 0; + unsigned int l = 0; + bool noio = true; + bool nodata = true; + int r; + + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio); + if (!device_table_data) + goto exit; + + r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (r) { + kfree(device_table_data); + goto exit; + } + + memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); + l += md->ima.dm_version_str_len; + + if (md->ima.active_table.device_metadata) { + memcpy(device_table_data + l, device_active_str, device_active_len); + l += device_active_len; + + memcpy(device_table_data + l, md->ima.active_table.device_metadata, + md->ima.active_table.device_metadata_len); + l += md->ima.active_table.device_metadata_len; + + nodata = false; + } + + if (md->ima.inactive_table.device_metadata) { + memcpy(device_table_data + l, device_inactive_str, device_inactive_len); + l += device_inactive_len; + + memcpy(device_table_data + l, md->ima.inactive_table.device_metadata, + md->ima.inactive_table.device_metadata_len); + l += md->ima.inactive_table.device_metadata_len; + + nodata = false; + } + + if (md->ima.active_table.hash) { + memcpy(device_table_data + l, active_table_str, active_table_len); + l += active_table_len; + + memcpy(device_table_data + l, md->ima.active_table.hash, + md->ima.active_table.hash_len); + l += md->ima.active_table.hash_len; + + memcpy(device_table_data + l, ",", 1); + l++; + + nodata = false; + } + + if (md->ima.inactive_table.hash) { + memcpy(device_table_data + l, inactive_table_str, inactive_table_len); + l += inactive_table_len; + + memcpy(device_table_data + l, md->ima.inactive_table.hash, + md->ima.inactive_table.hash_len); + l += md->ima.inactive_table.hash_len; + + memcpy(device_table_data + l, ",", 1); + l++; + + nodata = false; + } + /* + * In case both active and inactive tables, and corresponding + * device metadata is cleared/missing - record the name and uuid + * in IMA measurements. + */ + if (nodata) { + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) + goto error; + + scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_remove=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); + l = strlen(device_table_data); + } + + memcpy(device_table_data + l, remove_all_str, remove_all_len); + l += remove_all_len; + memcpy(device_table_data + l, remove_all ? "y;" : "n;", 2); + l += 2; + + capacity_len = strlen(capacity_str); + memcpy(device_table_data + l, capacity_str, capacity_len); + l += capacity_len; + + dm_ima_measure_data("dm_device_remove", device_table_data, l, noio); + +error: + kfree(device_table_data); + kfree(capacity_str); +exit: + kfree(md->ima.active_table.device_metadata); + + if (md->ima.active_table.device_metadata != + md->ima.inactive_table.device_metadata) + kfree(md->ima.inactive_table.device_metadata); + + kfree(md->ima.active_table.hash); + + if (md->ima.active_table.hash != md->ima.inactive_table.hash) + kfree(md->ima.inactive_table.hash); + + dm_ima_reset_data(md); + + kfree(dev_name); + kfree(dev_uuid); +} + +/* + * Measure ima data on table clear. + */ +void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) +{ + unsigned int l = 0, capacity_len = 0; + char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; + char inactive_str[] = "inactive_table_hash="; + unsigned int inactive_len = strlen(inactive_str); + bool noio = true; + bool nodata = true; + int r; + + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + if (!device_table_data) + return; + + r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (r) + goto error1; + + memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); + l += md->ima.dm_version_str_len; + + if (md->ima.inactive_table.device_metadata_len && + md->ima.inactive_table.hash_len) { + memcpy(device_table_data + l, md->ima.inactive_table.device_metadata, + md->ima.inactive_table.device_metadata_len); + l += md->ima.inactive_table.device_metadata_len; + + memcpy(device_table_data + l, inactive_str, inactive_len); + l += inactive_len; + + memcpy(device_table_data + l, md->ima.inactive_table.hash, + md->ima.inactive_table.hash_len); + + l += md->ima.inactive_table.hash_len; + + memcpy(device_table_data + l, ";", 1); + l++; + + nodata = false; + } + + if (nodata) { + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) + goto error2; + + scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;table_clear=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); + l = strlen(device_table_data); + } + + capacity_len = strlen(capacity_str); + memcpy(device_table_data + l, capacity_str, capacity_len); + l += capacity_len; + + dm_ima_measure_data("dm_table_clear", device_table_data, l, noio); + + if (new_map) { + if (md->ima.inactive_table.hash && + md->ima.inactive_table.hash != md->ima.active_table.hash) + kfree(md->ima.inactive_table.hash); + + md->ima.inactive_table.hash = NULL; + md->ima.inactive_table.hash_len = 0; + + if (md->ima.inactive_table.device_metadata && + md->ima.inactive_table.device_metadata != md->ima.active_table.device_metadata) + kfree(md->ima.inactive_table.device_metadata); + + md->ima.inactive_table.device_metadata = NULL; + md->ima.inactive_table.device_metadata_len = 0; + md->ima.inactive_table.num_targets = 0; + + if (md->ima.active_table.hash) { + md->ima.inactive_table.hash = md->ima.active_table.hash; + md->ima.inactive_table.hash_len = md->ima.active_table.hash_len; + } + + if (md->ima.active_table.device_metadata) { + md->ima.inactive_table.device_metadata = + md->ima.active_table.device_metadata; + md->ima.inactive_table.device_metadata_len = + md->ima.active_table.device_metadata_len; + md->ima.inactive_table.num_targets = + md->ima.active_table.num_targets; + } + } + + kfree(dev_name); + kfree(dev_uuid); +error2: + kfree(capacity_str); +error1: + kfree(device_table_data); +} + +/* + * Measure IMA data on device rename. + */ +void dm_ima_measure_on_device_rename(struct mapped_device *md) +{ + char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; + char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; + bool noio = true; + int r; + + if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, + md->ima.active_table.num_targets, noio)) + return; + + if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio)) + goto error; + + combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio); + if (!combined_device_data) + goto error; + + r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (r) + goto error; + + old_device_data = md->ima.active_table.device_metadata; + + md->ima.active_table.device_metadata = new_device_data; + md->ima.active_table.device_metadata_len = strlen(new_device_data); + + scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, + "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, + new_dev_name, new_dev_uuid, capacity_str); + + dm_ima_measure_data("dm_device_rename", combined_device_data, strlen(combined_device_data), + noio); + + goto exit; + +error: + kfree(new_device_data); +exit: + kfree(capacity_str); + kfree(combined_device_data); + kfree(old_device_data); + kfree(new_dev_name); + kfree(new_dev_uuid); +} diff --git a/drivers/md/dm-ima.h b/drivers/md/dm-ima.h new file mode 100644 index 000000000..b8c3b6146 --- /dev/null +++ b/drivers/md/dm-ima.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2021 Microsoft Corporation + * + * Author: Tushar Sugandhi + * + * File: dm-ima.h + * Header file for device mapper IMA measurements. + */ + +#ifndef DM_IMA_H +#define DM_IMA_H + +#define DM_IMA_MEASUREMENT_BUF_LEN 4096 +#define DM_IMA_DEVICE_BUF_LEN 1024 +#define DM_IMA_TARGET_METADATA_BUF_LEN 128 +#define DM_IMA_TARGET_DATA_BUF_LEN 2048 +#define DM_IMA_DEVICE_CAPACITY_BUF_LEN 128 +#define DM_IMA_TABLE_HASH_ALG "sha256" + +#define __dm_ima_stringify(s) #s +#define __dm_ima_str(s) __dm_ima_stringify(s) + +#define DM_IMA_VERSION_STR "dm_version=" \ + __dm_ima_str(DM_VERSION_MAJOR) "." \ + __dm_ima_str(DM_VERSION_MINOR) "." \ + __dm_ima_str(DM_VERSION_PATCHLEVEL) ";" + +#ifdef CONFIG_IMA + +struct dm_ima_device_table_metadata { + /* + * Contains data specific to the device which is common across + * all the targets in the table (e.g. name, uuid, major, minor, etc). + * The values are stored in comma separated list of key1=val1,key2=val2; + * pairs delimited by a semicolon at the end of the list. + */ + char *device_metadata; + unsigned int device_metadata_len; + unsigned int num_targets; + + /* + * Contains the sha256 hashes of the IMA measurements of the target + * attributes' key-value pairs from the active/inactive tables. + */ + char *hash; + unsigned int hash_len; +}; + +/* + * This structure contains device metadata, and table hash for + * active and inactive tables for ima measurements. + */ +struct dm_ima_measurements { + struct dm_ima_device_table_metadata active_table; + struct dm_ima_device_table_metadata inactive_table; + unsigned int dm_version_str_len; +}; + +void dm_ima_reset_data(struct mapped_device *md); +void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags); +void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap); +void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all); +void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map); +void dm_ima_measure_on_device_rename(struct mapped_device *md); + +#else + +static inline void dm_ima_reset_data(struct mapped_device *md) {} +static inline void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags) {} +static inline void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) {} +static inline void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) {} +static inline void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) {} +static inline void dm_ima_measure_on_device_rename(struct mapped_device *md) {} + +#endif /* CONFIG_IMA */ + +#endif /* DM_IMA_H */ diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c new file mode 100644 index 000000000..dc4381d68 --- /dev/null +++ b/drivers/md/dm-init.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * dm-init.c + * Copyright (C) 2017 The Chromium OS Authors + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "init" +#define DM_MAX_DEVICES 256 +#define DM_MAX_TARGETS 256 +#define DM_MAX_STR_SIZE 4096 +#define DM_MAX_WAITFOR 256 + +static char *create; + +static char *waitfor[DM_MAX_WAITFOR]; + +/* + * Format: dm-mod.create=,,,,[,
+][;,,,,
[,
+]+] + * Table format: + * Block devices to wait for to become available before setting up tables: + * dm-mod.waitfor=[,..,] + * + * See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." format + * details. + */ + +struct dm_device { + struct dm_ioctl dmi; + struct dm_target_spec *table[DM_MAX_TARGETS]; + char *target_args_array[DM_MAX_TARGETS]; + struct list_head list; +}; + +static const char * const dm_allowed_targets[] __initconst = { + "crypt", + "delay", + "linear", + "snapshot-origin", + "striped", + "verity", +}; + +static int __init dm_verify_target_type(const char *target) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dm_allowed_targets); i++) { + if (!strcmp(dm_allowed_targets[i], target)) + return 0; + } + return -EINVAL; +} + +static void __init dm_setup_cleanup(struct list_head *devices) +{ + struct dm_device *dev, *tmp; + unsigned int i; + + list_for_each_entry_safe(dev, tmp, devices, list) { + list_del(&dev->list); + for (i = 0; i < dev->dmi.target_count; i++) { + kfree(dev->table[i]); + kfree(dev->target_args_array[i]); + } + kfree(dev); + } +} + +/** + * str_field_delimit - delimit a string based on a separator char. + * @str: the pointer to the string to delimit. + * @separator: char that delimits the field + * + * Find a @separator and replace it by '\0'. + * Remove leading and trailing spaces. + * Return the remainder string after the @separator. + */ +static char __init *str_field_delimit(char **str, char separator) +{ + char *s; + + /* TODO: add support for escaped characters */ + *str = skip_spaces(*str); + s = strchr(*str, separator); + /* Delimit the field and remove trailing spaces */ + if (s) + *s = '\0'; + *str = strim(*str); + return s ? ++s : NULL; +} + +/** + * dm_parse_table_entry - parse a table entry + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + * [, ...] + * + * Return the remainder string after the table entry, i.e, after the comma which + * delimits the entry or NULL if reached the end of the string. + */ +static char __init *dm_parse_table_entry(struct dm_device *dev, char *str) +{ + const unsigned int n = dev->dmi.target_count - 1; + struct dm_target_spec *sp; + unsigned int i; + /* fields: */ + char *field[4]; + char *next; + + field[0] = str; + /* Delimit first 3 fields that are separated by space */ + for (i = 0; i < ARRAY_SIZE(field) - 1; i++) { + field[i + 1] = str_field_delimit(&field[i], ' '); + if (!field[i + 1]) + return ERR_PTR(-EINVAL); + } + /* Delimit last field that can be terminated by comma */ + next = str_field_delimit(&field[i], ','); + + sp = kzalloc(sizeof(*sp), GFP_KERNEL); + if (!sp) + return ERR_PTR(-ENOMEM); + dev->table[n] = sp; + + /* start_sector */ + if (kstrtoull(field[0], 0, &sp->sector_start)) + return ERR_PTR(-EINVAL); + /* num_sector */ + if (kstrtoull(field[1], 0, &sp->length)) + return ERR_PTR(-EINVAL); + /* target_type */ + strscpy(sp->target_type, field[2], sizeof(sp->target_type)); + if (dm_verify_target_type(sp->target_type)) { + DMERR("invalid type \"%s\"", sp->target_type); + return ERR_PTR(-EINVAL); + } + /* target_args */ + dev->target_args_array[n] = kstrndup(field[3], DM_MAX_STR_SIZE, + GFP_KERNEL); + if (!dev->target_args_array[n]) + return ERR_PTR(-ENOMEM); + + return next; +} + +/** + * dm_parse_table - parse "dm-mod.create=" table field + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + *
[,
+] + */ +static int __init dm_parse_table(struct dm_device *dev, char *str) +{ + char *table_entry = str; + + while (table_entry) { + DMDEBUG("parsing table \"%s\"", str); + if (++dev->dmi.target_count > DM_MAX_TARGETS) { + DMERR("too many targets %u > %d", + dev->dmi.target_count, DM_MAX_TARGETS); + return -EINVAL; + } + table_entry = dm_parse_table_entry(dev, table_entry); + if (IS_ERR(table_entry)) { + DMERR("couldn't parse table"); + return PTR_ERR(table_entry); + } + } + + return 0; +} + +/** + * dm_parse_device_entry - parse a device entry + * @dev: device to store the parsed information. + * @str: the pointer to a string with the format: + * name,uuid,minor,flags,table[; ...] + * + * Return the remainder string after the table entry, i.e, after the semi-colon + * which delimits the entry or NULL if reached the end of the string. + */ +static char __init *dm_parse_device_entry(struct dm_device *dev, char *str) +{ + /* There are 5 fields: name,uuid,minor,flags,table; */ + char *field[5]; + unsigned int i; + char *next; + + field[0] = str; + /* Delimit first 4 fields that are separated by comma */ + for (i = 0; i < ARRAY_SIZE(field) - 1; i++) { + field[i+1] = str_field_delimit(&field[i], ','); + if (!field[i+1]) + return ERR_PTR(-EINVAL); + } + /* Delimit last field that can be delimited by semi-colon */ + next = str_field_delimit(&field[i], ';'); + + /* name */ + strscpy(dev->dmi.name, field[0], sizeof(dev->dmi.name)); + /* uuid */ + strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid)); + /* minor */ + if (strlen(field[2])) { + if (kstrtoull(field[2], 0, &dev->dmi.dev)) + return ERR_PTR(-EINVAL); + dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG; + } + /* flags */ + if (!strcmp(field[3], "ro")) + dev->dmi.flags |= DM_READONLY_FLAG; + else if (strcmp(field[3], "rw")) + return ERR_PTR(-EINVAL); + /* table */ + if (dm_parse_table(dev, field[4])) + return ERR_PTR(-EINVAL); + + return next; +} + +/** + * dm_parse_devices - parse "dm-mod.create=" argument + * @devices: list of struct dm_device to store the parsed information. + * @str: the pointer to a string with the format: + * [;+] + */ +static int __init dm_parse_devices(struct list_head *devices, char *str) +{ + unsigned long ndev = 0; + struct dm_device *dev; + char *device = str; + + DMDEBUG("parsing \"%s\"", str); + while (device) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + list_add_tail(&dev->list, devices); + + if (++ndev > DM_MAX_DEVICES) { + DMERR("too many devices %lu > %d", + ndev, DM_MAX_DEVICES); + return -EINVAL; + } + + device = dm_parse_device_entry(dev, device); + if (IS_ERR(device)) { + DMERR("couldn't parse device"); + return PTR_ERR(device); + } + } + + return 0; +} + +/** + * dm_init_init - parse "dm-mod.create=" argument and configure drivers + */ +static int __init dm_init_init(void) +{ + struct dm_device *dev; + LIST_HEAD(devices); + char *str; + int i, r; + + if (!create) + return 0; + + if (strlen(create) >= DM_MAX_STR_SIZE) { + DMERR("Argument is too big. Limit is %d", DM_MAX_STR_SIZE); + return -EINVAL; + } + str = kstrndup(create, DM_MAX_STR_SIZE, GFP_KERNEL); + if (!str) + return -ENOMEM; + + r = dm_parse_devices(&devices, str); + if (r) + goto out; + + DMINFO("waiting for all devices to be available before creating mapped devices"); + wait_for_device_probe(); + + for (i = 0; i < ARRAY_SIZE(waitfor); i++) { + if (waitfor[i]) { + DMINFO("waiting for device %s ...", waitfor[i]); + while (!dm_get_dev_t(waitfor[i])) + msleep(5); + } + } + + if (waitfor[0]) + DMINFO("all devices available"); + + list_for_each_entry(dev, &devices, list) { + if (dm_early_create(&dev->dmi, dev->table, + dev->target_args_array)) + break; + } +out: + kfree(str); + dm_setup_cleanup(&devices); + return r; +} + +late_initcall(dm_init_init); + +module_param(create, charp, 0); +MODULE_PARM_DESC(create, "Create a mapped device in early boot"); + +module_param_array(waitfor, charp, NULL, 0); +MODULE_PARM_DESC(waitfor, "Devices to wait for before setting up tables"); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000..77fcff82c --- /dev/null +++ b/drivers/md/dm-integrity.c @@ -0,0 +1,4671 @@ +/* + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2016-2017 Milan Broz + * Copyright (C) 2016-2017 Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-bio-record.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-audit.h" + +#define DM_MSG_PREFIX "integrity" + +#define DEFAULT_INTERLEAVE_SECTORS 32768 +#define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768 +#define DEFAULT_BUFFER_SECTORS 128 +#define DEFAULT_JOURNAL_WATERMARK 50 +#define DEFAULT_SYNC_MSEC 10000 +#define DEFAULT_MAX_JOURNAL_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192) +#define MIN_LOG2_INTERLEAVE_SECTORS 3 +#define MAX_LOG2_INTERLEAVE_SECTORS 31 +#define METADATA_WORKQUEUE_MAX_ACTIVE 16 +#define RECALC_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048) +#define RECALC_WRITE_SUPER 16 +#define BITMAP_BLOCK_SIZE 4096 /* don't change it */ +#define BITMAP_FLUSH_INTERVAL (10 * HZ) +#define DISCARD_FILLER 0xf6 +#define SALT_SIZE 16 + +/* + * Warning - DEBUG_PRINT prints security-sensitive data to the log, + * so it should not be enabled in the official kernel + */ +//#define DEBUG_PRINT +//#define INTERNAL_VERIFY + +/* + * On disk structures + */ + +#define SB_MAGIC "integrt" +#define SB_VERSION_1 1 +#define SB_VERSION_2 2 +#define SB_VERSION_3 3 +#define SB_VERSION_4 4 +#define SB_VERSION_5 5 +#define SB_SECTORS 8 +#define MAX_SECTORS_PER_BLOCK 8 + +struct superblock { + __u8 magic[8]; + __u8 version; + __u8 log2_interleave_sectors; + __le16 integrity_tag_size; + __le32 journal_sections; + __le64 provided_data_sectors; /* userspace uses this value */ + __le32 flags; + __u8 log2_sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; + __u8 pad[2]; + __le64 recalc_sector; + __u8 pad2[8]; + __u8 salt[SALT_SIZE]; +}; + +#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 +#define SB_FLAG_RECALCULATING 0x2 +#define SB_FLAG_DIRTY_BITMAP 0x4 +#define SB_FLAG_FIXED_PADDING 0x8 +#define SB_FLAG_FIXED_HMAC 0x10 + +#define JOURNAL_ENTRY_ROUNDUP 8 + +typedef __le64 commit_id_t; +#define JOURNAL_MAC_PER_SECTOR 8 + +struct journal_entry { + union { + struct { + __le32 sector_lo; + __le32 sector_hi; + } s; + __le64 sector; + } u; + commit_id_t last_bytes[]; + /* __u8 tag[0]; */ +}; + +#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) + +#if BITS_PER_LONG == 64 +#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) +#else +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) +#endif +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) +#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) +#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) +#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) + +#define JOURNAL_BLOCK_SECTORS 8 +#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) +#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) + +struct journal_sector { + struct_group(sectors, + __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; + __u8 mac[JOURNAL_MAC_PER_SECTOR]; + ); + commit_id_t commit_id; +}; + +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) + +#define METADATA_PADDING_SECTORS 8 + +#define N_COMMIT_IDS 4 + +static unsigned char prev_commit_seq(unsigned char seq) +{ + return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; +} + +static unsigned char next_commit_seq(unsigned char seq) +{ + return (seq + 1) % N_COMMIT_IDS; +} + +/* + * In-memory structures + */ + +struct journal_node { + struct rb_node node; + sector_t sector; +}; + +struct alg_spec { + char *alg_string; + char *key_string; + __u8 *key; + unsigned int key_size; +}; + +struct dm_integrity_c { + struct dm_dev *dev; + struct dm_dev *meta_dev; + unsigned int tag_size; + __s8 log2_tag_size; + sector_t start; + mempool_t journal_io_mempool; + struct dm_io_client *io; + struct dm_bufio_client *bufio; + struct workqueue_struct *metadata_wq; + struct superblock *sb; + unsigned int journal_pages; + unsigned int n_bitmap_blocks; + + struct page_list *journal; + struct page_list *journal_io; + struct page_list *journal_xor; + struct page_list *recalc_bitmap; + struct page_list *may_write_bitmap; + struct bitmap_block_status *bbs; + unsigned int bitmap_flush_interval; + int synchronous_mode; + struct bio_list synchronous_bios; + struct delayed_work bitmap_flush_work; + + struct crypto_skcipher *journal_crypt; + struct scatterlist **journal_scatterlist; + struct scatterlist **journal_io_scatterlist; + struct skcipher_request **sk_requests; + + struct crypto_shash *journal_mac; + + struct journal_node *journal_tree; + struct rb_root journal_tree_root; + + sector_t provided_data_sectors; + + unsigned short journal_entry_size; + unsigned char journal_entries_per_sector; + unsigned char journal_section_entries; + unsigned short journal_section_sectors; + unsigned int journal_sections; + unsigned int journal_entries; + sector_t data_device_sectors; + sector_t meta_device_sectors; + unsigned int initial_sectors; + unsigned int metadata_run; + __s8 log2_metadata_run; + __u8 log2_buffer_sectors; + __u8 sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; + + unsigned char mode; + + int failed; + + struct crypto_shash *internal_hash; + + struct dm_target *ti; + + /* these variables are locked with endio_wait.lock */ + struct rb_root in_progress; + struct list_head wait_list; + wait_queue_head_t endio_wait; + struct workqueue_struct *wait_wq; + struct workqueue_struct *offload_wq; + + unsigned char commit_seq; + commit_id_t commit_ids[N_COMMIT_IDS]; + + unsigned int committed_section; + unsigned int n_committed_sections; + + unsigned int uncommitted_section; + unsigned int n_uncommitted_sections; + + unsigned int free_section; + unsigned char free_section_entry; + unsigned int free_sectors; + + unsigned int free_sectors_threshold; + + struct workqueue_struct *commit_wq; + struct work_struct commit_work; + + struct workqueue_struct *writer_wq; + struct work_struct writer_work; + + struct workqueue_struct *recalc_wq; + struct work_struct recalc_work; + u8 *recalc_buffer; + u8 *recalc_tags; + + struct bio_list flush_bio_list; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + unsigned int autocommit_msec; + + wait_queue_head_t copy_to_journal_wait; + + struct completion crypto_backoff; + + bool wrote_to_journal; + bool journal_uptodate; + bool just_formatted; + bool recalculate_flag; + bool reset_recalculate_flag; + bool discard; + bool fix_padding; + bool fix_hmac; + bool legacy_recalculate; + + struct alg_spec internal_hash_alg; + struct alg_spec journal_crypt_alg; + struct alg_spec journal_mac_alg; + + atomic64_t number_of_mismatches; + + struct notifier_block reboot_notifier; +}; + +struct dm_integrity_range { + sector_t logical_sector; + sector_t n_sectors; + bool waiting; + union { + struct rb_node node; + struct { + struct task_struct *task; + struct list_head wait_entry; + }; + }; +}; + +struct dm_integrity_io { + struct work_struct work; + + struct dm_integrity_c *ic; + enum req_op op; + bool fua; + + struct dm_integrity_range range; + + sector_t metadata_block; + unsigned int metadata_offset; + + atomic_t in_flight; + blk_status_t bi_status; + + struct completion *completion; + + struct dm_bio_details bio_details; +}; + +struct journal_completion { + struct dm_integrity_c *ic; + atomic_t in_flight; + struct completion comp; +}; + +struct journal_io { + struct dm_integrity_range range; + struct journal_completion *comp; +}; + +struct bitmap_block_status { + struct work_struct work; + struct dm_integrity_c *ic; + unsigned int idx; + unsigned long *bitmap; + struct bio_list bio_queue; + spinlock_t bio_queue_lock; + +}; + +static struct kmem_cache *journal_io_cache; + +#define JOURNAL_IO_MEMPOOL 32 + +#ifdef DEBUG_PRINT +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + vprintk(msg, args); + va_end(args); + if (len) + pr_cont(":"); + while (len) { + pr_cont(" %02x", *bytes); + bytes++; + len--; + } + pr_cont("\n"); +} +#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#else +#define DEBUG_print(x, ...) do { } while (0) +#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) +#endif + +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +/* + * DM Integrity profile, protection is performed layer above (dm-crypt) + */ +static const struct blk_integrity_profile dm_integrity_profile = { + .name = "DM-DIF-EXT-TAG", + .generate_fn = NULL, + .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, +}; + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static void integrity_bio_wait(struct work_struct *w); +static void dm_integrity_dtr(struct dm_target *ti); + +static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) +{ + if (err == -EILSEQ) + atomic64_inc(&ic->number_of_mismatches); + if (!cmpxchg(&ic->failed, 0, err)) + DMERR("Error on %s: %d", msg, err); +} + +static int dm_integrity_failed(struct dm_integrity_c *ic) +{ + return READ_ONCE(ic->failed); +} + +static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic) +{ + if (ic->legacy_recalculate) + return false; + if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ? + ic->internal_hash_alg.key || ic->journal_mac_alg.key : + ic->internal_hash_alg.key && !ic->journal_mac_alg.key) + return true; + return false; +} + +static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned int i, + unsigned int j, unsigned char seq) +{ + /* + * Xor the number with section and sector, so that if a piece of + * journal is written at wrong place, it is detected. + */ + return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); +} + +static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, + sector_t *area, sector_t *offset) +{ + if (!ic->meta_dev) { + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned int)data_sector & ((1U << log2_interleave_sectors) - 1); + } else { + *area = 0; + *offset = data_sector; + } +} + +#define sector_to_block(ic, n) \ +do { \ + BUG_ON((n) & (unsigned int)((ic)->sectors_per_block - 1)); \ + (n) >>= (ic)->sb->log2_sectors_per_block; \ +} while (0) + +static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, + sector_t offset, unsigned int *metadata_offset) +{ + __u64 ms; + unsigned int mo; + + ms = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + ms += area << ic->log2_metadata_run; + else + ms += area * ic->metadata_run; + ms >>= ic->log2_buffer_sectors; + + sector_to_block(ic, offset); + + if (likely(ic->log2_tag_size >= 0)) { + ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); + mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } else { + ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); + mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } + *metadata_offset = mo; + return ms; +} + +static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) +{ + sector_t result; + + if (ic->meta_dev) + return offset; + + result = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + result += (area + 1) << ic->log2_metadata_run; + else + result += (area + 1) * ic->metadata_run; + + result += (sector_t)ic->initial_sectors + offset; + result += ic->start; + + return result; +} + +static void wraparound_section(struct dm_integrity_c *ic, unsigned int *sec_ptr) +{ + if (unlikely(*sec_ptr >= ic->journal_sections)) + *sec_ptr -= ic->journal_sections; +} + +static void sb_set_version(struct dm_integrity_c *ic) +{ + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) + ic->sb->version = SB_VERSION_5; + else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + ic->sb->version = SB_VERSION_4; + else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + ic->sb->version = SB_VERSION_3; + else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + ic->sb->version = SB_VERSION_2; + else + ic->sb->version = SB_VERSION_1; +} + +static int sb_mac(struct dm_integrity_c *ic, bool wr) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned int size = crypto_shash_digestsize(ic->journal_mac); + + if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) { + dm_integrity_io_error(ic, "digest is too long", -EINVAL); + return -EINVAL; + } + + desc->tfm = ic->journal_mac; + + r = crypto_shash_init(desc); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + return r; + } + + r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + return r; + } + + if (likely(wr)) { + r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + } else { + __u8 result[HASH_MAX_DIGESTSIZE]; + r = crypto_shash_final(desc, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) { + dm_integrity_io_error(ic, "superblock mac", -EILSEQ); + dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); + return -EILSEQ; + } + } + + return 0; +} + +static int sync_rw_sb(struct dm_integrity_c *ic, blk_opf_t opf) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + const enum req_op op = opf & REQ_OP_MASK; + int r; + + io_req.bi_opf = opf; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = ic->sb; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; + io_loc.sector = ic->start; + io_loc.count = SB_SECTORS; + + if (op == REQ_OP_WRITE) { + sb_set_version(ic); + if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, true); + if (unlikely(r)) + return r; + } + } + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) + return r; + + if (op == REQ_OP_READ) { + if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, false); + if (unlikely(r)) + return r; + } + } + + return 0; +} + +#define BITMAP_OP_TEST_ALL_SET 0 +#define BITMAP_OP_TEST_ALL_CLEAR 1 +#define BITMAP_OP_SET 2 +#define BITMAP_OP_CLEAR 3 + +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, + sector_t sector, sector_t n_sectors, int mode) +{ + unsigned long bit, end_bit, this_end_bit, page, end_page; + unsigned long *data; + + if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", + sector, + n_sectors, + ic->sb->log2_sectors_per_block, + ic->log2_blocks_per_bitmap_bit, + mode); + BUG(); + } + + if (unlikely(!n_sectors)) + return true; + + bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + + page = bit / (PAGE_SIZE * 8); + bit %= PAGE_SIZE * 8; + + end_page = end_bit / (PAGE_SIZE * 8); + end_bit %= PAGE_SIZE * 8; + +repeat: + if (page < end_page) { + this_end_bit = PAGE_SIZE * 8 - 1; + } else { + this_end_bit = end_bit; + } + + data = lowmem_page_address(bitmap[page].page); + + if (mode == BITMAP_OP_TEST_ALL_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != -1) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (!test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != 0) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = -1; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __set_bit(bit, data); + bit++; + } + } else if (mode == BITMAP_OP_CLEAR) { + if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) + clear_page(data); + else while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = 0; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __clear_bit(bit, data); + bit++; + } + } else { + BUG(); + } + + if (unlikely(page < end_page)) { + bit = 0; + page++; + goto repeat; + } + + return true; +} + +static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src) +{ + unsigned int n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + unsigned int i; + + for (i = 0; i < n_bitmap_pages; i++) { + unsigned long *dst_data = lowmem_page_address(dst[i].page); + unsigned long *src_data = lowmem_page_address(src[i].page); + copy_page(dst_data, src_data); + } +} + +static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector) +{ + unsigned int bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + unsigned int bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); + + BUG_ON(bitmap_block >= ic->n_bitmap_blocks); + return &ic->bbs[bitmap_block]; +} + +static void access_journal_check(struct dm_integrity_c *ic, unsigned int section, unsigned int offset, + bool e, const char *function) +{ +#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) + unsigned int limit = e ? ic->journal_section_entries : ic->journal_section_sectors; + + if (unlikely(section >= ic->journal_sections) || + unlikely(offset >= limit)) { + DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)", + function, section, offset, ic->journal_sections, limit); + BUG(); + } +#endif +} + +static void page_list_location(struct dm_integrity_c *ic, unsigned int section, unsigned int offset, + unsigned int *pl_index, unsigned int *pl_offset) +{ + unsigned int sector; + + access_journal_check(ic, section, offset, false, "page_list_location"); + + sector = section * ic->journal_section_sectors + offset; + + *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); +} + +static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, + unsigned int section, unsigned int offset, unsigned int *n_sectors) +{ + unsigned int pl_index, pl_offset; + char *va; + + page_list_location(ic, section, offset, &pl_index, &pl_offset); + + if (n_sectors) + *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; + + va = lowmem_page_address(pl[pl_index].page); + + return (struct journal_sector *)(va + pl_offset); +} + +static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned int section, unsigned int offset) +{ + return access_page_list(ic, ic->journal, section, offset, NULL); +} + +static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned int section, unsigned int n) +{ + unsigned int rel_sector, offset; + struct journal_sector *js; + + access_journal_check(ic, section, n, true, "access_journal_entry"); + + rel_sector = n % JOURNAL_BLOCK_SECTORS; + offset = n / JOURNAL_BLOCK_SECTORS; + + js = access_journal(ic, section, rel_sector); + return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); +} + +static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned int section, unsigned int n) +{ + n <<= ic->sb->log2_sectors_per_block; + + n += JOURNAL_BLOCK_SECTORS; + + access_journal_check(ic, section, n, false, "access_journal_data"); + + return access_journal(ic, section, n); +} + +static void section_mac(struct dm_integrity_c *ic, unsigned int section, __u8 result[JOURNAL_MAC_SIZE]) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned int j, size; + + desc->tfm = ic->journal_mac; + + r = crypto_shash_init(desc); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto err; + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + __le64 section_le; + + r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + + section_le = cpu_to_le64(section); + r = crypto_shash_update(desc, (__u8 *)§ion_le, sizeof section_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, section, j); + r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + size = crypto_shash_digestsize(ic->journal_mac); + + if (likely(size <= JOURNAL_MAC_SIZE)) { + r = crypto_shash_final(desc, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memset(result + size, 0, JOURNAL_MAC_SIZE - size); + } else { + __u8 digest[HASH_MAX_DIGESTSIZE]; + + if (WARN_ON(size > sizeof(digest))) { + dm_integrity_io_error(ic, "digest_size", -EINVAL); + goto err; + } + r = crypto_shash_final(desc, digest); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memcpy(result, digest, JOURNAL_MAC_SIZE); + } + + return; +err: + memset(result, 0, JOURNAL_MAC_SIZE); +} + +static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool wr) +{ + __u8 result[JOURNAL_MAC_SIZE]; + unsigned int j; + + if (!ic->journal_mac) + return; + + section_mac(ic, section, result); + + for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { + struct journal_sector *js = access_journal(ic, section, j); + + if (likely(wr)) + memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); + else { + if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + dm_integrity_io_error(ic, "journal mac", -EILSEQ); + dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); + } + } + } +} + +static void complete_journal_op(void *context) +{ + struct journal_completion *comp = context; + BUG_ON(!atomic_read(&comp->in_flight)); + if (likely(atomic_dec_and_test(&comp->in_flight))) + complete(&comp->comp); +} + +static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section, + unsigned int n_sections, struct journal_completion *comp) +{ + struct async_submit_ctl submit; + size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; + unsigned int pl_index, pl_offset, section_index; + struct page_list *source_pl, *target_pl; + + if (likely(encrypt)) { + source_pl = ic->journal; + target_pl = ic->journal_io; + } else { + source_pl = ic->journal_io; + target_pl = ic->journal; + } + + page_list_location(ic, section, 0, &pl_index, &pl_offset); + + atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); + + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); + + section_index = pl_index; + + do { + size_t this_step; + struct page *src_pages[2]; + struct page *dst_page; + + while (unlikely(pl_index == section_index)) { + unsigned int dummy; + if (likely(encrypt)) + rw_section_mac(ic, section, true); + section++; + n_sections--; + if (!n_sections) + break; + page_list_location(ic, section, 0, §ion_index, &dummy); + } + + this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); + dst_page = target_pl[pl_index].page; + src_pages[0] = source_pl[pl_index].page; + src_pages[1] = ic->journal_xor[pl_index].page; + + async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); + + pl_index++; + pl_offset = 0; + n_bytes -= this_step; + } while (n_bytes); + + BUG_ON(n_sections); + + async_tx_issue_pending_all(); +} + +static void complete_journal_encrypt(struct crypto_async_request *req, int err) +{ + struct journal_completion *comp = req->data; + if (unlikely(err)) { + if (likely(err == -EINPROGRESS)) { + complete(&comp->ic->crypto_backoff); + return; + } + dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); + } + complete_journal_op(comp); +} + +static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) +{ + int r; + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + complete_journal_encrypt, comp); + if (likely(encrypt)) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + if (likely(!r)) + return false; + if (likely(r == -EINPROGRESS)) + return true; + if (likely(r == -EBUSY)) { + wait_for_completion(&comp->ic->crypto_backoff); + reinit_completion(&comp->ic->crypto_backoff); + return true; + } + dm_integrity_io_error(comp->ic, "encrypt", r); + return false; +} + +static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section, + unsigned int n_sections, struct journal_completion *comp) +{ + struct scatterlist **source_sg; + struct scatterlist **target_sg; + + atomic_add(2, &comp->in_flight); + + if (likely(encrypt)) { + source_sg = ic->journal_scatterlist; + target_sg = ic->journal_io_scatterlist; + } else { + source_sg = ic->journal_io_scatterlist; + target_sg = ic->journal_scatterlist; + } + + do { + struct skcipher_request *req; + unsigned int ivsize; + char *iv; + + if (likely(encrypt)) + rw_section_mac(ic, section, true); + + req = ic->sk_requests[section]; + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + iv = req->iv; + + memcpy(iv, iv + ivsize, ivsize); + + req->src = source_sg[section]; + req->dst = target_sg[section]; + + if (unlikely(do_crypt(encrypt, req, comp))) + atomic_inc(&comp->in_flight); + + section++; + n_sections--; + } while (n_sections); + + atomic_dec(&comp->in_flight); + complete_journal_op(comp); +} + +static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section, + unsigned int n_sections, struct journal_completion *comp) +{ + if (ic->journal_xor) + return xor_journal(ic, encrypt, section, n_sections, comp); + else + return crypt_journal(ic, encrypt, section, n_sections, comp); +} + +static void complete_journal_io(unsigned long error, void *context) +{ + struct journal_completion *comp = context; + if (unlikely(error != 0)) + dm_integrity_io_error(comp->ic, "writing journal", -EIO); + complete_journal_op(comp); +} + +static void rw_journal_sectors(struct dm_integrity_c *ic, blk_opf_t opf, + unsigned int sector, unsigned int n_sectors, + struct journal_completion *comp) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + unsigned int pl_index, pl_offset; + int r; + + if (unlikely(dm_integrity_failed(ic))) { + if (comp) + complete_journal_io(-1UL, comp); + return; + } + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_opf = opf; + io_req.mem.type = DM_IO_PAGE_LIST; + if (ic->journal_io) + io_req.mem.ptr.pl = &ic->journal_io[pl_index]; + else + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + if (likely(comp != NULL)) { + io_req.notify.fn = complete_journal_io; + io_req.notify.context = comp; + } else { + io_req.notify.fn = NULL; + } + io_req.client = ic->io; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; + io_loc.sector = ic->start + SB_SECTORS + sector; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, (opf & REQ_OP_MASK) == REQ_OP_READ ? + "reading journal" : "writing journal", r); + if (comp) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + complete_journal_io(-1UL, comp); + } + } +} + +static void rw_journal(struct dm_integrity_c *ic, blk_opf_t opf, + unsigned int section, unsigned int n_sections, + struct journal_completion *comp) +{ + unsigned int sector, n_sectors; + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + rw_journal_sectors(ic, opf, sector, n_sectors, comp); +} + +static void write_journal(struct dm_integrity_c *ic, unsigned int commit_start, unsigned int commit_sections) +{ + struct journal_completion io_comp; + struct journal_completion crypt_comp_1; + struct journal_completion crypt_comp_2; + unsigned int i; + + io_comp.ic = ic; + init_completion(&io_comp.comp); + + if (commit_start + commit_sections <= ic->journal_sections) { + io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (ic->journal_io) { + crypt_comp_1.ic = ic; + init_completion(&crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + for (i = 0; i < commit_sections; i++) + rw_section_mac(ic, commit_start + i, true); + } + rw_journal(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, commit_start, + commit_sections, &io_comp); + } else { + unsigned int to_end; + io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); + to_end = ic->journal_sections - commit_start; + if (ic->journal_io) { + crypt_comp_1.ic = ic; + init_completion(&crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); + if (try_wait_for_completion(&crypt_comp_1.comp)) { + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, + commit_start, to_end, &io_comp); + reinit_completion(&crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + crypt_comp_2.ic = ic; + init_completion(&crypt_comp_2.comp); + crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); + wait_for_completion_io(&crypt_comp_1.comp); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp); + wait_for_completion_io(&crypt_comp_2.comp); + } + } else { + for (i = 0; i < to_end; i++) + rw_section_mac(ic, commit_start + i, true); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp); + for (i = 0; i < commit_sections - to_end; i++) + rw_section_mac(ic, i, true); + } + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, 0, commit_sections - to_end, &io_comp); + } + + wait_for_completion_io(&io_comp.comp); +} + +static void copy_from_journal(struct dm_integrity_c *ic, unsigned int section, unsigned int offset, + unsigned int n_sectors, sector_t target, io_notify_fn fn, void *data) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + int r; + unsigned int sector, pl_index, pl_offset; + + BUG_ON((target | n_sectors | offset) & (unsigned int)(ic->sectors_per_block - 1)); + + if (unlikely(dm_integrity_failed(ic))) { + fn(-1UL, data); + return; + } + + sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_opf = REQ_OP_WRITE; + io_req.mem.type = DM_IO_PAGE_LIST; + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + io_req.notify.fn = fn; + io_req.notify.context = data; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = target; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + fn(-1UL, data); + } +} + +static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2) +{ + return range1->logical_sector < range2->logical_sector + range2->n_sectors && + range1->logical_sector + range1->n_sectors > range2->logical_sector; +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting) +{ + struct rb_node **n = &ic->in_progress.rb_node; + struct rb_node *parent; + + BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned int)(ic->sectors_per_block - 1)); + + if (likely(check_waiting)) { + struct dm_integrity_range *range; + list_for_each_entry(range, &ic->wait_list, wait_entry) { + if (unlikely(ranges_overlap(range, new_range))) + return false; + } + } + + parent = NULL; + + while (*n) { + struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); + + parent = *n; + if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { + n = &range->node.rb_left; + } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { + n = &range->node.rb_right; + } else { + return false; + } + } + + rb_link_node(&new_range->node, parent, n); + rb_insert_color(&new_range->node, &ic->in_progress); + + return true; +} + +static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + rb_erase(&range->node, &ic->in_progress); + while (unlikely(!list_empty(&ic->wait_list))) { + struct dm_integrity_range *last_range = + list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry); + struct task_struct *last_range_task; + last_range_task = last_range->task; + list_del(&last_range->wait_entry); + if (!add_new_range(ic, last_range, false)) { + last_range->task = last_range_task; + list_add(&last_range->wait_entry, &ic->wait_list); + break; + } + last_range->waiting = false; + wake_up_process(last_range_task); + } +} + +static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + remove_range_unlocked(ic, range); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); +} + +static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + new_range->waiting = true; + list_add_tail(&new_range->wait_entry, &ic->wait_list); + new_range->task = current; + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + } while (unlikely(new_range->waiting)); +} + +static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + if (unlikely(!add_new_range(ic, new_range, true))) + wait_and_add_new_range(ic, new_range); +} + +static void init_journal_node(struct journal_node *node) +{ + RB_CLEAR_NODE(&node->node); + node->sector = (sector_t)-1; +} + +static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) +{ + struct rb_node **link; + struct rb_node *parent; + + node->sector = sector; + BUG_ON(!RB_EMPTY_NODE(&node->node)); + + link = &ic->journal_tree_root.rb_node; + parent = NULL; + + while (*link) { + struct journal_node *j; + parent = *link; + j = container_of(parent, struct journal_node, node); + if (sector < j->sector) + link = &j->node.rb_left; + else + link = &j->node.rb_right; + } + + rb_link_node(&node->node, parent, link); + rb_insert_color(&node->node, &ic->journal_tree_root); +} + +static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + BUG_ON(RB_EMPTY_NODE(&node->node)); + rb_erase(&node->node, &ic->journal_tree_root); + init_journal_node(node); +} + +#define NOT_FOUND (-1U) + +static unsigned int find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) +{ + struct rb_node *n = ic->journal_tree_root.rb_node; + unsigned int found = NOT_FOUND; + *next_sector = (sector_t)-1; + while (n) { + struct journal_node *j = container_of(n, struct journal_node, node); + if (sector == j->sector) { + found = j - ic->journal_tree; + } + if (sector < j->sector) { + *next_sector = j->sector; + n = j->node.rb_left; + } else { + n = j->node.rb_right; + } + } + + return found; +} + +static bool test_journal_node(struct dm_integrity_c *ic, unsigned int pos, sector_t sector) +{ + struct journal_node *node, *next_node; + struct rb_node *next; + + if (unlikely(pos >= ic->journal_entries)) + return false; + node = &ic->journal_tree[pos]; + if (unlikely(RB_EMPTY_NODE(&node->node))) + return false; + if (unlikely(node->sector != sector)) + return false; + + next = rb_next(&node->node); + if (unlikely(!next)) + return true; + + next_node = container_of(next, struct journal_node, node); + return next_node->sector != sector; +} + +static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + struct rb_node *next; + struct journal_node *next_node; + unsigned int next_section; + + BUG_ON(RB_EMPTY_NODE(&node->node)); + + next = rb_next(&node->node); + if (unlikely(!next)) + return false; + + next_node = container_of(next, struct journal_node, node); + + if (next_node->sector != node->sector) + return false; + + next_section = (unsigned int)(next_node - ic->journal_tree) / ic->journal_section_entries; + if (next_section >= ic->committed_section && + next_section < ic->committed_section + ic->n_committed_sections) + return true; + if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) + return true; + + return false; +} + +#define TAG_READ 0 +#define TAG_WRITE 1 +#define TAG_CMP 2 + +static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, + unsigned int *metadata_offset, unsigned int total_size, int op) +{ +#define MAY_BE_FILLER 1 +#define MAY_BE_HASH 2 + unsigned int hash_offset = 0; + unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + + do { + unsigned char *data, *dp; + struct dm_buffer *b; + unsigned int to_copy; + int r; + + r = dm_integrity_failed(ic); + if (unlikely(r)) + return r; + + data = dm_bufio_read(ic->bufio, *metadata_block, &b); + if (IS_ERR(data)) + return PTR_ERR(data); + + to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); + dp = data + *metadata_offset; + if (op == TAG_READ) { + memcpy(tag, dp, to_copy); + } else if (op == TAG_WRITE) { + if (memcmp(dp, tag, to_copy)) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); + } + } else { + /* e.g.: op == TAG_CMP */ + + if (likely(is_power_of_2(ic->tag_size))) { + if (unlikely(memcmp(dp, tag, to_copy))) + if (unlikely(!ic->discard) || + unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { + goto thorough_test; + } + } else { + unsigned int i, ts; +thorough_test: + ts = total_size; + + for (i = 0; i < to_copy; i++, ts--) { + if (unlikely(dp[i] != tag[i])) + may_be &= ~MAY_BE_HASH; + if (likely(dp[i] != DISCARD_FILLER)) + may_be &= ~MAY_BE_FILLER; + hash_offset++; + if (unlikely(hash_offset == ic->tag_size)) { + if (unlikely(!may_be)) { + dm_bufio_release(b); + return ts; + } + hash_offset = 0; + may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + } + } + } + } + dm_bufio_release(b); + + tag += to_copy; + *metadata_offset += to_copy; + if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { + (*metadata_block)++; + *metadata_offset = 0; + } + + if (unlikely(!is_power_of_2(ic->tag_size))) { + hash_offset = (hash_offset + to_copy) % ic->tag_size; + } + + total_size -= to_copy; + } while (unlikely(total_size)); + + return 0; +#undef MAY_BE_FILLER +#undef MAY_BE_HASH +} + +struct flush_request { + struct dm_io_request io_req; + struct dm_io_region io_reg; + struct dm_integrity_c *ic; + struct completion comp; +}; + +static void flush_notify(unsigned long error, void *fr_) +{ + struct flush_request *fr = fr_; + if (unlikely(error != 0)) + dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO); + complete(&fr->comp); +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data) +{ + int r; + + struct flush_request fr; + + if (!ic->meta_dev) + flush_data = false; + if (flush_data) { + fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, + fr.io_req.mem.type = DM_IO_KMEM, + fr.io_req.mem.ptr.addr = NULL, + fr.io_req.notify.fn = flush_notify, + fr.io_req.notify.context = &fr; + fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio), + fr.io_reg.bdev = ic->dev->bdev, + fr.io_reg.sector = 0, + fr.io_reg.count = 0, + fr.ic = ic; + init_completion(&fr.comp); + r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL); + BUG_ON(r); + } + + r = dm_bufio_write_dirty_buffers(ic->bufio); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing tags", r); + + if (flush_data) + wait_for_completion(&fr.comp); +} + +static void sleep_on_endio_wait(struct dm_integrity_c *ic) +{ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&ic->endio_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + __remove_wait_queue(&ic->endio_wait, &wait); +} + +static void autocommit_fn(struct timer_list *t) +{ + struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer); + + if (likely(!dm_integrity_failed(ic))) + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void schedule_autocommit(struct dm_integrity_c *ic) +{ + if (!timer_pending(&ic->autocommit_timer)) + mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); +} + +static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio; + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + bio_list_add(&ic->flush_bio_list, bio); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void do_endio(struct dm_integrity_c *ic, struct bio *bio) +{ + int r = dm_integrity_failed(ic); + if (unlikely(r) && !bio->bi_status) + bio->bi_status = errno_to_blk_status(r); + if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) { + unsigned long flags; + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio_list_add(&ic->synchronous_bios, bio); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + return; + } + bio_endio(bio); +} + +static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic))) + submit_flush_bio(ic, dio); + else + do_endio(ic, bio); +} + +static void dec_in_flight(struct dm_integrity_io *dio) +{ + if (atomic_dec_and_test(&dio->in_flight)) { + struct dm_integrity_c *ic = dio->ic; + struct bio *bio; + + remove_range(ic, &dio->range); + + if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD)) + schedule_autocommit(ic); + + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->bi_status) && !bio->bi_status) + bio->bi_status = dio->bi_status; + if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + dio->range.logical_sector += dio->range.n_sectors; + bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->offload_wq, &dio->work); + return; + } + do_endio_flush(ic, dio); + } +} + +static void integrity_end_io(struct bio *bio) +{ + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + dm_bio_restore(&dio->bio_details, bio); + if (bio->bi_integrity) + bio->bi_opf |= REQ_INTEGRITY; + + if (dio->completion) + complete(dio->completion); + + dec_in_flight(dio); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, + const char *data, char *result) +{ + __le64 sector_le = cpu_to_le64(sector); + SHASH_DESC_ON_STACK(req, ic->internal_hash); + int r; + unsigned int digest_size; + + req->tfm = ic->internal_hash; + + r = crypto_shash_init(req); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto failed; + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + } + + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_final(req, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto failed; + } + + digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_metadata(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + + int r; + + if (ic->internal_hash) { + struct bvec_iter iter; + struct bio_vec bv; + unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + char *checksums; + unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + sector_t sector; + unsigned int sectors_to_process; + + if (unlikely(ic->mode == 'R')) + goto skip_io; + + if (likely(dio->op != REQ_OP_DISCARD)) + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + else + checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (!checksums) { + checksums = checksums_onstack; + if (WARN_ON(extra_space && + digest_size > sizeof(checksums_onstack))) { + r = -EINVAL; + goto error; + } + } + + if (unlikely(dio->op == REQ_OP_DISCARD)) { + unsigned int bi_size = dio->bio_details.bi_iter.bi_size; + unsigned int max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE; + unsigned int max_blocks = max_size / ic->tag_size; + memset(checksums, DISCARD_FILLER, max_size); + + while (bi_size) { + unsigned int this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + this_step_blocks = min(this_step_blocks, max_blocks); + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + this_step_blocks * ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto skip_io; + } + + sector = dio->range.logical_sector; + sectors_to_process = dio->range.n_sectors; + + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { + struct bio_vec bv_copy = bv; + unsigned int pos; + char *mem, *checksums_ptr; + +again: + mem = bvec_kmap_local(&bv_copy); + pos = 0; + checksums_ptr = checksums; + do { + integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + checksums_ptr += ic->tag_size; + sectors_to_process -= ic->sectors_per_block; + pos += ic->sectors_per_block << SECTOR_SHIFT; + sector += ic->sectors_per_block; + } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); + kunmap_local(mem); + + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); + if (unlikely(r)) { + if (r > 0) { + sector_t s; + + s = sector - ((r + ic->tag_size - 1) / ic->tag_size); + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + bio->bi_bdev, s); + r = -EILSEQ; + atomic64_inc(&ic->number_of_mismatches); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", + bio, s, 0); + } + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + if (!sectors_to_process) + break; + + if (unlikely(pos < bv_copy.bv_len)) { + bv_copy.bv_offset += pos; + bv_copy.bv_len -= pos; + goto again; + } + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + } else { + struct bio_integrity_payload *bip = dio->bio_details.bi_integrity; + + if (bip) { + struct bio_vec biv; + struct bvec_iter iter; + unsigned int data_to_process = dio->range.n_sectors; + sector_to_block(ic, data_to_process); + data_to_process *= ic->tag_size; + + bip_for_each_vec(biv, bip, iter) { + unsigned char *tag; + unsigned int this_len; + + BUG_ON(PageHighMem(biv.bv_page)); + tag = bvec_virt(&biv); + this_len = min(biv.bv_len, data_to_process); + r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, + this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE); + if (unlikely(r)) + goto error; + data_to_process -= this_len; + if (!data_to_process) + break; + } + } + } +skip_io: + dec_in_flight(dio); + return; +error: + dio->bi_status = errno_to_blk_status(r); + dec_in_flight(dio); +} + +static int dm_integrity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + struct bio_integrity_payload *bip; + + sector_t area, offset; + + dio->ic = ic; + dio->bi_status = 0; + dio->op = bio_op(bio); + + if (unlikely(dio->op == REQ_OP_DISCARD)) { + if (ti->max_io_len) { + sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector); + unsigned int log2_max_io_len = __fls(ti->max_io_len); + sector_t start_boundary = sec >> log2_max_io_len; + sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len; + if (start_boundary < end_boundary) { + sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1)); + dm_accept_partial_bio(bio, len); + } + } + } + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + submit_flush_bio(ic, dio); + return DM_MAPIO_SUBMITTED; + } + + dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA; + if (unlikely(dio->fua)) { + /* + * Don't pass down the FUA flag because we have to flush + * disk cache anyway. + */ + bio->bi_opf &= ~REQ_FUA; + } + if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + dio->range.logical_sector, bio_sectors(bio), + ic->provided_data_sectors); + return DM_MAPIO_KILL; + } + if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) { + DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", + ic->sectors_per_block, + dio->range.logical_sector, bio_sectors(bio)); + return DM_MAPIO_KILL; + } + + if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) { + struct bvec_iter iter; + struct bio_vec bv; + bio_for_each_segment(bv, bio, iter) { + if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", + bv.bv_offset, bv.bv_len, ic->sectors_per_block); + return DM_MAPIO_KILL; + } + } + } + + bip = bio_integrity(bio); + if (!ic->internal_hash) { + if (bip) { + unsigned int wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block; + if (ic->log2_tag_size >= 0) + wanted_tag_size <<= ic->log2_tag_size; + else + wanted_tag_size *= ic->tag_size; + if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { + DMERR("Invalid integrity data size %u, expected %u", + bip->bip_iter.bi_size, wanted_tag_size); + return DM_MAPIO_KILL; + } + } + } else { + if (unlikely(bip != NULL)) { + DMERR("Unexpected integrity data when using internal hash"); + return DM_MAPIO_KILL; + } + } + + if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ)) + return DM_MAPIO_KILL; + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); + + dm_integrity_map_continue(dio, true); + return DM_MAPIO_SUBMITTED; +} + +static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, + unsigned int journal_section, unsigned int journal_entry) +{ + struct dm_integrity_c *ic = dio->ic; + sector_t logical_sector; + unsigned int n_sectors; + + logical_sector = dio->range.logical_sector; + n_sectors = dio->range.n_sectors; + do { + struct bio_vec bv = bio_iovec(bio); + char *mem; + + if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors)) + bv.bv_len = n_sectors << SECTOR_SHIFT; + n_sectors -= bv.bv_len >> SECTOR_SHIFT; + bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); +retry_kmap: + mem = kmap_local_page(bv.bv_page); + if (likely(dio->op == REQ_OP_WRITE)) + flush_dcache_page(bv.bv_page); + + do { + struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); + + if (unlikely(dio->op == REQ_OP_READ)) { + struct journal_sector *js; + char *mem_ptr; + unsigned int s; + + if (unlikely(journal_entry_is_inprogress(je))) { + flush_dcache_page(bv.bv_page); + kunmap_local(mem); + + __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + goto retry_kmap; + } + smp_rmb(); + BUG_ON(journal_entry_get_sector(je) != logical_sector); + js = access_journal_data(ic, journal_section, journal_entry); + mem_ptr = mem + bv.bv_offset; + s = 0; + do { + memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA); + *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s]; + js++; + mem_ptr += 1 << SECTOR_SHIFT; + } while (++s < ic->sectors_per_block); +#ifdef INTERNAL_VERIFY + if (ic->internal_hash) { + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + + integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); + if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", + logical_sector); + dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", + bio, logical_sector, 0); + } + } +#endif + } + + if (!ic->internal_hash) { + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned int tag_todo = ic->tag_size; + char *tag_ptr = journal_entry_tag(ic, je); + + if (bip) do { + struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + unsigned int tag_now = min(biv.bv_len, tag_todo); + char *tag_addr; + BUG_ON(PageHighMem(biv.bv_page)); + tag_addr = bvec_virt(&biv); + if (likely(dio->op == REQ_OP_WRITE)) + memcpy(tag_ptr, tag_addr, tag_now); + else + memcpy(tag_addr, tag_ptr, tag_now); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now); + tag_ptr += tag_now; + tag_todo -= tag_now; + } while (unlikely(tag_todo)); else { + if (likely(dio->op == REQ_OP_WRITE)) + memset(tag_ptr, 0, tag_todo); + } + } + + if (likely(dio->op == REQ_OP_WRITE)) { + struct journal_sector *js; + unsigned int s; + + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT); + + s = 0; + do { + je->last_bytes[s] = js[s].commit_id; + } while (++s < ic->sectors_per_block); + + if (ic->internal_hash) { + unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size > ic->tag_size)) { + char checksums_onstack[HASH_MAX_DIGESTSIZE]; + integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); + } else + integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + } + + journal_entry_set_sector(je, logical_sector); + } + logical_sector += ic->sectors_per_block; + + journal_entry++; + if (unlikely(journal_entry == ic->journal_section_entries)) { + journal_entry = 0; + journal_section++; + wraparound_section(ic, &journal_section); + } + + bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; + } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); + + if (unlikely(dio->op == REQ_OP_READ)) + flush_dcache_page(bv.bv_page); + kunmap_local(mem); + } while (n_sectors); + + if (likely(dio->op == REQ_OP_WRITE)) { + smp_mb(); + if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) + wake_up(&ic->copy_to_journal_wait); + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + queue_work(ic->commit_wq, &ic->commit_work); + } else { + schedule_autocommit(ic); + } + } else { + remove_range(ic, &dio->range); + } + + if (unlikely(bio->bi_iter.bi_size)) { + sector_t area, offset; + + dio->range.logical_sector = logical_sector; + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + return true; + } + + return false; +} + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned int journal_section, journal_entry; + unsigned int journal_read_pos; + struct completion read_comp; + bool discard_retried = false; + bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ; + if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D') + need_sync_io = true; + + if (need_sync_io && from_map) { + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->offload_wq, &dio->work); + return; + } + +lock_retry: + spin_lock_irq(&ic->endio_wait.lock); +retry: + if (unlikely(dm_integrity_failed(ic))) { + spin_unlock_irq(&ic->endio_wait.lock); + do_endio(ic, bio); + return; + } + dio->range.n_sectors = bio_sectors(bio); + journal_read_pos = NOT_FOUND; + if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) { + if (dio->op == REQ_OP_WRITE) { + unsigned int next_entry, i, pos; + unsigned int ws, we, range_sectors; + + dio->range.n_sectors = min(dio->range.n_sectors, + (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block); + if (unlikely(!dio->range.n_sectors)) { + if (from_map) + goto offload_to_thread; + sleep_on_endio_wait(ic); + goto retry; + } + range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; + ic->free_sectors -= range_sectors; + journal_section = ic->free_section; + journal_entry = ic->free_section_entry; + + next_entry = ic->free_section_entry + range_sectors; + ic->free_section_entry = next_entry % ic->journal_section_entries; + ic->free_section += next_entry / ic->journal_section_entries; + ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; + wraparound_section(ic, &ic->free_section); + + pos = journal_section * ic->journal_section_entries + journal_entry; + ws = journal_section; + we = journal_entry; + i = 0; + do { + struct journal_entry *je; + + add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); + pos++; + if (unlikely(pos >= ic->journal_entries)) + pos = 0; + + je = access_journal_entry(ic, ws, we); + BUG_ON(!journal_entry_is_unused(je)); + journal_entry_set_inprogress(je); + we++; + if (unlikely(we == ic->journal_section_entries)) { + we = 0; + ws++; + wraparound_section(ic, &ws); + } + } while ((i += ic->sectors_per_block) < dio->range.n_sectors); + + spin_unlock_irq(&ic->endio_wait.lock); + goto journal_read_write; + } else { + sector_t next_sector; + journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (likely(journal_read_pos == NOT_FOUND)) { + if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector)) + dio->range.n_sectors = next_sector - dio->range.logical_sector; + } else { + unsigned int i; + unsigned int jp = journal_read_pos + 1; + for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) { + if (!test_journal_node(ic, jp, dio->range.logical_sector + i)) + break; + } + dio->range.n_sectors = i; + } + } + } + if (unlikely(!add_new_range(ic, &dio->range, true))) { + /* + * We must not sleep in the request routine because it could + * stall bios on current->bio_list. + * So, we offload the bio to a workqueue if we have to sleep. + */ + if (from_map) { +offload_to_thread: + spin_unlock_irq(&ic->endio_wait.lock); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } + if (journal_read_pos != NOT_FOUND) + dio->range.n_sectors = ic->sectors_per_block; + wait_and_add_new_range(ic, &dio->range); + /* + * wait_and_add_new_range drops the spinlock, so the journal + * may have been changed arbitrarily. We need to recheck. + * To simplify the code, we restrict I/O size to just one block. + */ + if (journal_read_pos != NOT_FOUND) { + sector_t next_sector; + unsigned int new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != journal_read_pos)) { + remove_range_unlocked(ic, &dio->range); + goto retry; + } + } + } + if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) { + sector_t next_sector; + unsigned int new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != NOT_FOUND) || + unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) { + remove_range_unlocked(ic, &dio->range); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); + flush_workqueue(ic->commit_wq); + queue_work(ic->writer_wq, &ic->writer_work); + flush_workqueue(ic->writer_wq); + discard_retried = true; + goto lock_retry; + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(journal_read_pos != NOT_FOUND)) { + journal_section = journal_read_pos / ic->journal_section_entries; + journal_entry = journal_read_pos % ic->journal_section_entries; + goto journal_read_write; + } + + if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) { + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs; + + bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + spin_lock(&bbs->bio_queue_lock); + bio_list_add(&bbs->bio_queue, bio); + spin_unlock(&bbs->bio_queue_lock); + queue_work(ic->writer_wq, &bbs->work); + return; + } + } + + dio->in_flight = (atomic_t)ATOMIC_INIT(2); + + if (need_sync_io) { + init_completion(&read_comp); + dio->completion = &read_comp; + } else + dio->completion = NULL; + + dm_bio_record(&dio->bio_details, bio); + bio_set_dev(bio, ic->dev->bdev); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; + bio->bi_end_io = integrity_end_io; + bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + + if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) { + integrity_metadata(&dio->work); + dm_integrity_flush_buffers(ic, false); + + dio->in_flight = (atomic_t)ATOMIC_INIT(1); + dio->completion = NULL; + + submit_bio_noacct(bio); + + return; + } + + submit_bio_noacct(bio); + + if (need_sync_io) { + wait_for_completion_io(&read_comp); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) + goto skip_check; + if (ic->mode == 'B') { + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + goto skip_check; + } + + if (likely(!bio->bi_status)) + integrity_metadata(&dio->work); + else +skip_check: + dec_in_flight(dio); + + } else { + INIT_WORK(&dio->work, integrity_metadata); + queue_work(ic->metadata_wq, &dio->work); + } + + return; + +journal_read_write: + if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry))) + goto lock_retry; + + do_endio_flush(ic, dio); +} + + +static void integrity_bio_wait(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + + dm_integrity_map_continue(dio, false); +} + +static void pad_uncommitted(struct dm_integrity_c *ic) +{ + if (ic->free_section_entry) { + ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry; + ic->free_section_entry = 0; + ic->free_section++; + wraparound_section(ic, &ic->free_section); + ic->n_uncommitted_sections++; + } + if (WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * + ic->journal_section_entries + ic->free_sectors)) { + DMCRIT("journal_sections %u, journal_section_entries %u, " + "n_uncommitted_sections %u, n_committed_sections %u, " + "journal_section_entries %u, free_sectors %u", + ic->journal_sections, ic->journal_section_entries, + ic->n_uncommitted_sections, ic->n_committed_sections, + ic->journal_section_entries, ic->free_sectors); + } +} + +static void integrity_commit(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work); + unsigned int commit_start, commit_sections; + unsigned int i, j, n; + struct bio *flushes; + + del_timer(&ic->autocommit_timer); + + spin_lock_irq(&ic->endio_wait.lock); + flushes = bio_list_get(&ic->flush_bio_list); + if (unlikely(ic->mode != 'J')) { + spin_unlock_irq(&ic->endio_wait.lock); + dm_integrity_flush_buffers(ic, true); + goto release_flush_bios; + } + + pad_uncommitted(ic); + commit_start = ic->uncommitted_section; + commit_sections = ic->n_uncommitted_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!commit_sections) + goto release_flush_bios; + + ic->wrote_to_journal = true; + + i = commit_start; + for (n = 0; n < commit_sections; n++) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je; + je = access_journal_entry(ic, i, j); + io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + } + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js; + js = access_journal(ic, i, j); + js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq); + } + i++; + if (unlikely(i >= ic->journal_sections)) + ic->commit_seq = next_commit_seq(ic->commit_seq); + wraparound_section(ic, &i); + } + smp_rmb(); + + write_journal(ic, commit_start, commit_sections); + + spin_lock_irq(&ic->endio_wait.lock); + ic->uncommitted_section += commit_sections; + wraparound_section(ic, &ic->uncommitted_section); + ic->n_uncommitted_sections -= commit_sections; + ic->n_committed_sections += commit_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + queue_work(ic->writer_wq, &ic->writer_work); + +release_flush_bios: + while (flushes) { + struct bio *next = flushes->bi_next; + flushes->bi_next = NULL; + do_endio(ic, flushes); + flushes = next; + } +} + +static void complete_copy_from_journal(unsigned long error, void *context) +{ + struct journal_io *io = context; + struct journal_completion *comp = io->comp; + struct dm_integrity_c *ic = comp->ic; + remove_range(ic, &io->range); + mempool_free(io, &ic->journal_io_mempool); + if (unlikely(error != 0)) + dm_integrity_io_error(ic, "copying from journal", -EIO); + complete_journal_op(comp); +} + +static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js, + struct journal_entry *je) +{ + unsigned int s = 0; + do { + js->commit_id = je->last_bytes[s]; + js++; + } while (++s < ic->sectors_per_block); +} + +static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start, + unsigned int write_sections, bool from_replay) +{ + unsigned int i, j, n; + struct journal_completion comp; + struct blk_plug plug; + + blk_start_plug(&plug); + + comp.ic = ic; + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + init_completion(&comp.comp); + + i = write_start; + for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { +#ifndef INTERNAL_VERIFY + if (unlikely(from_replay)) +#endif + rw_section_mac(ic, i, false); + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + sector_t sec, area, offset; + unsigned int k, l, next_loop; + sector_t metadata_block; + unsigned int metadata_offset; + struct journal_io *io; + + if (journal_entry_is_unused(je)) + continue; + BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); + sec = journal_entry_get_sector(je); + if (unlikely(from_replay)) { + if (unlikely(sec & (unsigned int)(ic->sectors_per_block - 1))) { + dm_integrity_io_error(ic, "invalid sector in journal", -EIO); + sec &= ~(sector_t)(ic->sectors_per_block - 1); + } + if (unlikely(sec >= ic->provided_data_sectors)) { + journal_entry_set_unused(je); + continue; + } + } + get_area_and_offset(ic, sec, &area, &offset); + restore_last_bytes(ic, access_journal_data(ic, i, j), je); + for (k = j + 1; k < ic->journal_section_entries; k++) { + struct journal_entry *je2 = access_journal_entry(ic, i, k); + sector_t sec2, area2, offset2; + if (journal_entry_is_unused(je2)) + break; + BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); + sec2 = journal_entry_get_sector(je2); + if (unlikely(sec2 >= ic->provided_data_sectors)) + break; + get_area_and_offset(ic, sec2, &area2, &offset2); + if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) + break; + restore_last_bytes(ic, access_journal_data(ic, i, k), je2); + } + next_loop = k - 1; + + io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO); + io->comp = ∁ + io->range.logical_sector = sec; + io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &io->range); + + if (likely(!from_replay)) { + struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; + + /* don't write if there is newer committed sector */ + while (j < k && find_newer_committed_node(ic, §ion_node[j])) { + struct journal_entry *je2 = access_journal_entry(ic, i, j); + + journal_entry_set_unused(je2); + remove_journal_node(ic, §ion_node[j]); + j++; + sec += ic->sectors_per_block; + offset += ic->sectors_per_block; + } + while (j < k && find_newer_committed_node(ic, §ion_node[k - 1])) { + struct journal_entry *je2 = access_journal_entry(ic, i, k - 1); + + journal_entry_set_unused(je2); + remove_journal_node(ic, §ion_node[k - 1]); + k--; + } + if (j == k) { + remove_range_unlocked(ic, &io->range); + spin_unlock_irq(&ic->endio_wait.lock); + mempool_free(io, &ic->journal_io_mempool); + goto skip_io; + } + for (l = j; l < k; l++) { + remove_journal_node(ic, §ion_node[l]); + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + for (l = j; l < k; l++) { + int r; + struct journal_entry *je2 = access_journal_entry(ic, i, l); + + if ( +#ifndef INTERNAL_VERIFY + unlikely(from_replay) && +#endif + ic->internal_hash) { + char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + + integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), + (char *)access_journal_data(ic, i, l), test_tag); + if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); + dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); + } + } + + journal_entry_set_unused(je2); + r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset, + ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading tags", r); + } + } + + atomic_inc(&comp.in_flight); + copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block, + (k - j) << ic->sb->log2_sectors_per_block, + get_data_sector(ic, area, offset), + complete_copy_from_journal, io); +skip_io: + j = next_loop; + } + } + + dm_bufio_write_dirty_buffers_async(ic->bufio); + + blk_finish_plug(&plug); + + complete_journal_op(&comp); + wait_for_completion_io(&comp.comp); + + dm_integrity_flush_buffers(ic, true); +} + +static void integrity_writer(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work); + unsigned int write_start, write_sections; + + unsigned int prev_free_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + write_start = ic->committed_section; + write_sections = ic->n_committed_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!write_sections) + return; + + do_journal_write(ic, write_start, write_sections, false); + + spin_lock_irq(&ic->endio_wait.lock); + + ic->committed_section += write_sections; + wraparound_section(ic, &ic->committed_section); + ic->n_committed_sections -= write_sections; + + prev_free_sectors = ic->free_sectors; + ic->free_sectors += write_sections * ic->journal_section_entries; + if (unlikely(!prev_free_sectors)) + wake_up_locked(&ic->endio_wait); + + spin_unlock_irq(&ic->endio_wait.lock); +} + +static void recalc_write_super(struct dm_integrity_c *ic) +{ + int r; + + dm_integrity_flush_buffers(ic, false); + if (dm_integrity_failed(ic)) + return; + + r = sync_rw_sb(ic, REQ_OP_WRITE); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +} + +static void integrity_recalc(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work); + struct dm_integrity_range range; + struct dm_io_request io_req; + struct dm_io_region io_loc; + sector_t area, offset; + sector_t metadata_block; + unsigned int metadata_offset; + sector_t logical_sector, n_sectors; + __u8 *t; + unsigned int i; + int r; + unsigned int super_counter = 0; + + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); + + spin_lock_irq(&ic->endio_wait.lock); + +next_chunk: + + if (unlikely(dm_post_suspending(ic->ti))) + goto unlock_ret; + + range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) { + if (ic->mode == 'B') { + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + DEBUG_print("queue_delayed_work: bitmap_flush_work\n"); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + } + goto unlock_ret; + } + + get_area_and_offset(ic, range.logical_sector, &area, &offset); + range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); + if (!ic->meta_dev) + range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset); + + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + logical_sector = range.logical_sector; + n_sectors = range.n_sectors; + + if (ic->mode == 'B') { + if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + goto advance_and_next; + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + logical_sector += ic->sectors_per_block; + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + get_area_and_offset(ic, logical_sector, &area, &offset); + } + + DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors); + + if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { + recalc_write_super(ic); + if (ic->mode == 'B') { + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); + } + super_counter = 0; + } + + if (unlikely(dm_integrity_failed(ic))) + goto err; + + io_req.bi_opf = REQ_OP_READ; + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = get_data_sector(ic, area, offset); + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } + + t = ic->recalc_tags; + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tag_size; + } + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + + r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "writing tags", r); + goto err; + } + + if (ic->mode == 'B') { + sector_t start, end; + start = (range.logical_sector >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) << + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end = ((range.logical_sector + range.n_sectors) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) << + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + block_bitmap_op(ic, ic->recalc_bitmap, start, end - start, BITMAP_OP_CLEAR); + } + +advance_and_next: + cond_resched(); + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); + goto next_chunk; + +err: + remove_range(ic, &range); + return; + +unlock_ret: + spin_unlock_irq(&ic->endio_wait.lock); + + recalc_write_super(ic); +} + +static void bitmap_block_work(struct work_struct *w) +{ + struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work); + struct dm_integrity_c *ic = bbs->ic; + struct bio *bio; + struct bio_list bio_queue; + struct bio_list waiting; + + bio_list_init(&waiting); + + spin_lock(&bbs->bio_queue_lock); + bio_queue = bbs->bio_queue; + bio_list_init(&bbs->bio_queue); + spin_unlock(&bbs->bio_queue_lock); + + while ((bio = bio_list_pop(&bio_queue))) { + struct dm_integrity_io *dio; + + dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->offload_wq, &dio->work); + } else { + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + bio_list_add(&waiting, bio); + } + } + + if (bio_list_empty(&waiting)) + return; + + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, + bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), + BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + + while ((bio = bio_list_pop(&waiting))) { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->offload_wq, &dio->work); + } + + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); +} + +static void bitmap_flush_work(struct work_struct *work) +{ + struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); + struct dm_integrity_range range; + unsigned long limit; + struct bio *bio; + + dm_integrity_flush_buffers(ic, false); + + range.logical_sector = 0; + range.n_sectors = ic->provided_data_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + dm_integrity_flush_buffers(ic, true); + + limit = ic->provided_data_sectors; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + limit = le64_to_cpu(ic->sb->recalc_sector) + >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) + << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + } + /*DEBUG_print("zeroing journal\n");*/ + block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); + + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { + bio_endio(bio); + spin_unlock_irq(&ic->endio_wait.lock); + spin_lock_irq(&ic->endio_wait.lock); + } + spin_unlock_irq(&ic->endio_wait.lock); +} + + +static void init_journal(struct dm_integrity_c *ic, unsigned int start_section, + unsigned int n_sections, unsigned char commit_seq) +{ + unsigned int i, j, n; + + if (!n_sections) + return; + + for (n = 0; n < n_sections; n++) { + i = start_section + n; + wraparound_section(ic, &i); + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + BUILD_BUG_ON(sizeof(js->sectors) != JOURNAL_SECTOR_DATA); + memset(&js->sectors, 0, sizeof(js->sectors)); + js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq); + } + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + journal_entry_set_unused(je); + } + } + + write_journal(ic, start_section, n_sections); +} + +static int find_commit_seq(struct dm_integrity_c *ic, unsigned int i, unsigned int j, commit_id_t id) +{ + unsigned char k; + for (k = 0; k < N_COMMIT_IDS; k++) { + if (dm_integrity_commit_id(ic, i, j, k) == id) + return k; + } + dm_integrity_io_error(ic, "journal commit id", -EIO); + return -EIO; +} + +static void replay_journal(struct dm_integrity_c *ic) +{ + unsigned int i, j; + bool used_commit_ids[N_COMMIT_IDS]; + unsigned int max_commit_id_sections[N_COMMIT_IDS]; + unsigned int write_start, write_sections; + unsigned int continue_section; + bool journal_empty; + unsigned char unused, last_used, want_commit_seq; + + if (ic->mode == 'R') + return; + + if (ic->journal_uptodate) + return; + + last_used = 0; + write_start = 0; + + if (!ic->just_formatted) { + DEBUG_print("reading journal\n"); + rw_journal(ic, REQ_OP_READ, 0, ic->journal_sections, NULL); + if (ic->journal_io) + DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); + if (ic->journal_io) { + struct journal_completion crypt_comp; + crypt_comp.ic = ic; + init_completion(&crypt_comp.comp); + crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); + wait_for_completion(&crypt_comp.comp); + } + DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal"); + } + + if (dm_integrity_failed(ic)) + goto clear_journal; + + journal_empty = true; + memset(used_commit_ids, 0, sizeof used_commit_ids); + memset(max_commit_id_sections, 0, sizeof max_commit_id_sections); + for (i = 0; i < ic->journal_sections; i++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + int k; + struct journal_sector *js = access_journal(ic, i, j); + k = find_commit_seq(ic, i, j, js->commit_id); + if (k < 0) + goto clear_journal; + used_commit_ids[k] = true; + max_commit_id_sections[k] = i; + } + if (journal_empty) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + if (!journal_entry_is_unused(je)) { + journal_empty = false; + break; + } + } + } + } + + if (!used_commit_ids[N_COMMIT_IDS - 1]) { + unused = N_COMMIT_IDS - 1; + while (unused && !used_commit_ids[unused - 1]) + unused--; + } else { + for (unused = 0; unused < N_COMMIT_IDS; unused++) + if (!used_commit_ids[unused]) + break; + if (unused == N_COMMIT_IDS) { + dm_integrity_io_error(ic, "journal commit ids", -EIO); + goto clear_journal; + } + } + DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n", + unused, used_commit_ids[0], used_commit_ids[1], + used_commit_ids[2], used_commit_ids[3]); + + last_used = prev_commit_seq(unused); + want_commit_seq = prev_commit_seq(last_used); + + if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)]) + journal_empty = true; + + write_start = max_commit_id_sections[last_used] + 1; + if (unlikely(write_start >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &write_start); + + i = write_start; + for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + + if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) { + /* + * This could be caused by crash during writing. + * We won't replay the inconsistent part of the + * journal. + */ + DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n", + i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq); + goto brk; + } + } + i++; + if (unlikely(i >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &i); + } +brk: + + if (!journal_empty) { + DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n", + write_sections, write_start, want_commit_seq); + do_journal_write(ic, write_start, write_sections, true); + } + + if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) { + continue_section = write_start; + ic->commit_seq = want_commit_seq; + DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq); + } else { + unsigned int s; + unsigned char erase_seq; +clear_journal: + DEBUG_print("clearing journal\n"); + + erase_seq = prev_commit_seq(prev_commit_seq(last_used)); + s = write_start; + init_journal(ic, s, 1, erase_seq); + s++; + wraparound_section(ic, &s); + if (ic->journal_sections >= 2) { + init_journal(ic, s, ic->journal_sections - 2, erase_seq); + s += ic->journal_sections - 2; + wraparound_section(ic, &s); + init_journal(ic, s, 1, erase_seq); + } + + continue_section = 0; + ic->commit_seq = next_commit_seq(erase_seq); + } + + ic->committed_section = continue_section; + ic->n_committed_sections = 0; + + ic->uncommitted_section = continue_section; + ic->n_uncommitted_sections = 0; + + ic->free_section = continue_section; + ic->free_section_entry = 0; + ic->free_sectors = ic->journal_entries; + + ic->journal_tree_root = RB_ROOT; + for (i = 0; i < ic->journal_entries; i++) + init_journal_node(&ic->journal_tree[i]); +} + +static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) +{ + DEBUG_print("dm_integrity_enter_synchronous_mode\n"); + + if (ic->mode == 'B') { + ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; + ic->synchronous_mode = 1; + + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + flush_workqueue(ic->commit_wq); + } +} + +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + DEBUG_print("dm_integrity_reboot\n"); + + dm_integrity_enter_synchronous_mode(ic); + + return NOTIFY_DONE; +} + +static void dm_integrity_postsuspend(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + + WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); + + del_timer_sync(&ic->autocommit_timer); + + if (ic->recalc_wq) + drain_workqueue(ic->recalc_wq); + + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + + queue_work(ic->commit_wq, &ic->commit_work); + drain_workqueue(ic->commit_wq); + + if (ic->mode == 'J') { + queue_work(ic->writer_wq, &ic->writer_work); + drain_workqueue(ic->writer_wq); + dm_integrity_flush_buffers(ic, true); + if (ic->wrote_to_journal) { + init_journal(ic, ic->free_section, + ic->journal_sections - ic->free_section, ic->commit_seq); + if (ic->free_section) { + init_journal(ic, 0, ic->free_section, + next_commit_seq(ic->commit_seq)); + } + } + } + + if (ic->mode == 'B') { + dm_integrity_flush_buffers(ic, true); +#if 1 + /* set to 0 to test bitmap replay code */ + init_journal(ic, 0, ic->journal_sections, 0); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +#endif + } + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + ic->journal_uptodate = true; +} + +static void dm_integrity_resume(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); + int r; + + DEBUG_print("resume\n"); + + ic->wrote_to_journal = false; + + if (ic->provided_data_sectors != old_provided_data_sectors) { + if (ic->provided_data_sectors > old_provided_data_sectors && + ic->mode == 'B' && + ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + rw_journal_sectors(ic, REQ_OP_READ, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + block_bitmap_op(ic, ic->journal, old_provided_data_sectors, + ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { + DEBUG_print("resume dirty_bitmap\n"); + rw_journal_sectors(ic, REQ_OP_READ, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + if (ic->mode == 'B') { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + !ic->reset_recalculate_flag) { + block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); + block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, + BITMAP_OP_TEST_ALL_CLEAR)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", + ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) || + ic->reset_recalculate_flag) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + init_journal(ic, 0, ic->journal_sections, 0); + replay_journal(ic); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + } + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } else { + replay_journal(ic); + if (ic->reset_recalculate_flag) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + if (ic->mode == 'B') { + ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) { + block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + } + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + } + + DEBUG_print("testing recalc: %x\n", ic->sb->flags); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors); + if (recalc_pos < ic->provided_data_sectors) { + queue_work(ic->recalc_wq, &ic->recalc_work); + } else if (recalc_pos > ic->provided_data_sectors) { + ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors); + recalc_write_super(ic); + } + } + + ic->reboot_notifier.notifier_call = dm_integrity_reboot; + ic->reboot_notifier.next = NULL; + ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ + WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); + +#if 0 + /* set to 1 to stress test synchronous mode */ + dm_integrity_enter_synchronous_mode(ic); +#endif +} + +static void dm_integrity_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + unsigned int arg_count; + size_t sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu", + (unsigned long long)atomic64_read(&ic->number_of_mismatches), + ic->provided_data_sectors); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector)); + else + DMEMIT(" -"); + break; + + case STATUSTYPE_TABLE: { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + arg_count = 3; + arg_count += !!ic->meta_dev; + arg_count += ic->sectors_per_block != 1; + arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->reset_recalculate_flag; + arg_count += ic->discard; + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'B'; + arg_count += ic->mode == 'B'; + arg_count += !!ic->internal_hash_alg.alg_string; + arg_count += !!ic->journal_crypt_alg.alg_string; + arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0; + arg_count += ic->legacy_recalculate; + DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start, + ic->tag_size, ic->mode, arg_count); + if (ic->meta_dev) + DMEMIT(" meta_device:%s", ic->meta_dev->name); + if (ic->sectors_per_block != 1) + DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" recalculate"); + if (ic->reset_recalculate_flag) + DMEMIT(" reset_recalculate"); + if (ic->discard) + DMEMIT(" allow_discards"); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); + if (ic->mode == 'J') { + DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + } + if (ic->mode == 'B') { + DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); + } + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) + DMEMIT(" fix_padding"); + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) + DMEMIT(" fix_hmac"); + if (ic->legacy_recalculate) + DMEMIT(" legacy_recalculate"); + +#define EMIT_ALG(a, n) \ + do { \ + if (ic->a.alg_string) { \ + DMEMIT(" %s:%s", n, ic->a.alg_string); \ + if (ic->a.key_string) \ + DMEMIT(":%s", ic->a.key_string);\ + } \ + } while (0) + EMIT_ALG(internal_hash_alg, "internal_hash"); + EMIT_ALG(journal_crypt_alg, "journal_crypt"); + EMIT_ALG(journal_mac_alg, "journal_mac"); + break; + } + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",dev_name=%s,start=%llu,tag_size=%u,mode=%c", + ic->dev->name, ic->start, ic->tag_size, ic->mode); + + if (ic->meta_dev) + DMEMIT(",meta_device=%s", ic->meta_dev->name); + if (ic->sectors_per_block != 1) + DMEMIT(",block_size=%u", ic->sectors_per_block << SECTOR_SHIFT); + + DMEMIT(",recalculate=%c", (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ? + 'y' : 'n'); + DMEMIT(",allow_discards=%c", ic->discard ? 'y' : 'n'); + DMEMIT(",fix_padding=%c", + ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) ? 'y' : 'n'); + DMEMIT(",fix_hmac=%c", + ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) ? 'y' : 'n'); + DMEMIT(",legacy_recalculate=%c", ic->legacy_recalculate ? 'y' : 'n'); + + DMEMIT(",journal_sectors=%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(",interleave_sectors=%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(",buffer_sectors=%u", 1U << ic->log2_buffer_sectors); + DMEMIT(";"); + break; + } +} + +static int dm_integrity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_integrity_c *ic = ti->private; + + if (!ic->meta_dev) + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); + else + return fn(ti, ic->dev, 0, ti->len, data); +} + +static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_integrity_c *ic = ti->private; + + if (ic->sectors_per_block > 1) { + limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); + limits->dma_alignment = limits->logical_block_size - 1; + } +} + +static void calculate_journal_section_size(struct dm_integrity_c *ic) +{ + unsigned int sector_space = JOURNAL_SECTOR_DATA; + + ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); + ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size, + JOURNAL_ENTRY_ROUNDUP); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) + sector_space -= JOURNAL_MAC_PER_SECTOR; + ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; + ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; + ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS; + ic->journal_entries = ic->journal_section_entries * ic->journal_sections; +} + +static int calculate_device_limits(struct dm_integrity_c *ic) +{ + __u64 initial_sectors; + + calculate_journal_section_size(ic); + initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX) + return -EINVAL; + ic->initial_sectors = initial_sectors; + + if (!ic->meta_dev) { + sector_t last_sector, last_area, last_offset; + + /* we have to maintain excessive padding for compatibility with existing volumes */ + __u64 metadata_run_padding = + ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ? + (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) : + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS); + + ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + metadata_run_padding) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; + + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) + return -EINVAL; + } else { + __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; + meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) + >> (ic->log2_buffer_sectors + SECTOR_SHIFT); + meta_size <<= ic->log2_buffer_sectors; + if (ic->initial_sectors + meta_size < ic->initial_sectors || + ic->initial_sectors + meta_size > ic->meta_device_sectors) + return -EINVAL; + ic->metadata_run = 1; + ic->log2_metadata_run = 0; + } + + return 0; +} + +static void get_provided_data_sectors(struct dm_integrity_c *ic) +{ + if (!ic->meta_dev) { + int test_bit; + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + } else { + ic->provided_data_sectors = ic->data_device_sectors; + ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + } +} + +static int initialize_superblock(struct dm_integrity_c *ic, + unsigned int journal_sectors, unsigned int interleave_sectors) +{ + unsigned int journal_sections; + int test_bit; + + memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); + memcpy(ic->sb->magic, SB_MAGIC, 8); + ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); + ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); + if (ic->journal_mac_alg.alg_string) + ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); + + calculate_journal_section_size(ic); + journal_sections = journal_sectors / ic->journal_section_sectors; + if (!journal_sections) + journal_sections = 1; + + if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC); + get_random_bytes(ic->sb->salt, SALT_SIZE); + } + + if (!ic->meta_dev) { + if (ic->fix_padding) + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); + ic->sb->journal_sections = cpu_to_le32(journal_sections); + if (!interleave_sectors) + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) + return -EINVAL; + } else { + ic->sb->log2_interleave_sectors = 0; + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) + return -EINVAL; + +try_smaller_buffer: + ic->sb->journal_sections = cpu_to_le32(0); + for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) { + __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections); + __u32 test_journal_sections = prev_journal_sections | (1U << test_bit); + if (test_journal_sections > journal_sections) + continue; + ic->sb->journal_sections = cpu_to_le32(test_journal_sections); + if (calculate_device_limits(ic)) + ic->sb->journal_sections = cpu_to_le32(prev_journal_sections); + + } + if (!le32_to_cpu(ic->sb->journal_sections)) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + return -EINVAL; + } + } + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + + sb_set_version(ic); + + return 0; +} + +static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) +{ + struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); + struct blk_integrity bi; + + memset(&bi, 0, sizeof(bi)); + bi.profile = &dm_integrity_profile; + bi.tuple_size = ic->tag_size; + bi.tag_size = bi.tuple_size; + bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, UINT_MAX); +} + +static void dm_integrity_free_page_list(struct page_list *pl) +{ + unsigned int i; + + if (!pl) + return; + for (i = 0; pl[i].page; i++) + __free_page(pl[i].page); + kvfree(pl); +} + +static struct page_list *dm_integrity_alloc_page_list(unsigned int n_pages) +{ + struct page_list *pl; + unsigned int i; + + pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO); + if (!pl) + return NULL; + + for (i = 0; i < n_pages; i++) { + pl[i].page = alloc_page(GFP_KERNEL); + if (!pl[i].page) { + dm_integrity_free_page_list(pl); + return NULL; + } + if (i) + pl[i - 1].next = &pl[i]; + } + pl[i].page = NULL; + pl[i].next = NULL; + + return pl; +} + +static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl) +{ + unsigned int i; + for (i = 0; i < ic->journal_sections; i++) + kvfree(sl[i]); + kvfree(sl); +} + +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, + struct page_list *pl) +{ + struct scatterlist **sl; + unsigned int i; + + sl = kvmalloc_array(ic->journal_sections, + sizeof(struct scatterlist *), + GFP_KERNEL | __GFP_ZERO); + if (!sl) + return NULL; + + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist *s; + unsigned int start_index, start_offset; + unsigned int end_index, end_offset; + unsigned int n_pages; + unsigned int idx; + + page_list_location(ic, i, 0, &start_index, &start_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, + &end_index, &end_offset); + + n_pages = (end_index - start_index + 1); + + s = kvmalloc_array(n_pages, sizeof(struct scatterlist), + GFP_KERNEL); + if (!s) { + dm_integrity_free_journal_scatterlist(ic, sl); + return NULL; + } + + sg_init_table(s, n_pages); + for (idx = start_index; idx <= end_index; idx++) { + char *va = lowmem_page_address(pl[idx].page); + unsigned int start = 0, end = PAGE_SIZE; + if (idx == start_index) + start = start_offset; + if (idx == end_index) + end = end_offset + (1 << SECTOR_SHIFT); + sg_set_buf(&s[idx - start_index], va + start, end - start); + } + + sl[i] = s; + } + + return sl; +} + +static void free_alg(struct alg_spec *a) +{ + kfree_sensitive(a->alg_string); + kfree_sensitive(a->key); + memset(a, 0, sizeof *a); +} + +static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval) +{ + char *k; + + free_alg(a); + + a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL); + if (!a->alg_string) + goto nomem; + + k = strchr(a->alg_string, ':'); + if (k) { + *k = 0; + a->key_string = k + 1; + if (strlen(a->key_string) & 1) + goto inval; + + a->key_size = strlen(a->key_string) / 2; + a->key = kmalloc(a->key_size, GFP_KERNEL); + if (!a->key) + goto nomem; + if (hex2bin(a->key, a->key_string, a->key_size)) + goto inval; + } + + return 0; +inval: + *error = error_inval; + return -EINVAL; +nomem: + *error = "Out of memory for an argument"; + return -ENOMEM; +} + +static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, + char *error_alg, char *error_key) +{ + int r; + + if (a->alg_string) { + *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*hash)) { + *error = error_alg; + r = PTR_ERR(*hash); + *hash = NULL; + return r; + } + + if (a->key) { + r = crypto_shash_setkey(*hash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; + } + } + + return 0; +} + +static int create_journal(struct dm_integrity_c *ic, char **error) +{ + int r = 0; + unsigned int i; + __u64 journal_pages, journal_desc_size, journal_tree_size; + unsigned char *crypt_data = NULL, *crypt_iv = NULL; + struct skcipher_request *req = NULL; + + ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); + ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); + ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); + ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); + + journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, + PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); + journal_desc_size = journal_pages * sizeof(struct page_list); + if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_pages = journal_pages; + + ic->journal = dm_integrity_alloc_page_list(ic->journal_pages); + if (!ic->journal) { + *error = "Could not allocate memory for journal"; + r = -ENOMEM; + goto bad; + } + if (ic->journal_crypt_alg.alg_string) { + unsigned int ivsize, blocksize; + struct journal_completion comp; + + comp.ic = ic; + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(ic->journal_crypt)) { + *error = "Invalid journal cipher"; + r = PTR_ERR(ic->journal_crypt); + ic->journal_crypt = NULL; + goto bad; + } + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + blocksize = crypto_skcipher_blocksize(ic->journal_crypt); + + if (ic->journal_crypt_alg.key) { + r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, + ic->journal_crypt_alg.key_size); + if (r) { + *error = "Error setting encryption key"; + goto bad; + } + } + DEBUG_print("cipher %s, block size %u iv size %u\n", + ic->journal_crypt_alg.alg_string, blocksize, ivsize); + + ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages); + if (!ic->journal_io) { + *error = "Could not allocate memory for journal io"; + r = -ENOMEM; + goto bad; + } + + if (blocksize == 1) { + struct scatterlist *sg; + + req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!req) { + *error = "Could not allocate crypt request"; + r = -ENOMEM; + goto bad; + } + + crypt_iv = kzalloc(ivsize, GFP_KERNEL); + if (!crypt_iv) { + *error = "Could not allocate iv"; + r = -ENOMEM; + goto bad; + } + + ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages); + if (!ic->journal_xor) { + *error = "Could not allocate memory for journal xor"; + r = -ENOMEM; + goto bad; + } + + sg = kvmalloc_array(ic->journal_pages + 1, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!sg) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + sg_init_table(sg, ic->journal_pages + 1); + for (i = 0; i < ic->journal_pages; i++) { + char *va = lowmem_page_address(ic->journal_xor[i].page); + clear_page(va); + sg_set_buf(&sg[i], va, PAGE_SIZE); + } + sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); + + skcipher_request_set_crypt(req, sg, sg, + PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); + init_completion(&comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + kvfree(sg); + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to encrypt journal"; + goto bad; + } + DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); + + crypto_free_skcipher(ic->journal_crypt); + ic->journal_crypt = NULL; + } else { + unsigned int crypt_len = roundup(ivsize, blocksize); + + req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!req) { + *error = "Could not allocate crypt request"; + r = -ENOMEM; + goto bad; + } + + crypt_iv = kmalloc(ivsize, GFP_KERNEL); + if (!crypt_iv) { + *error = "Could not allocate iv"; + r = -ENOMEM; + goto bad; + } + + crypt_data = kmalloc(crypt_len, GFP_KERNEL); + if (!crypt_data) { + *error = "Unable to allocate crypt data"; + r = -ENOMEM; + goto bad; + } + + ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); + if (!ic->journal_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); + if (!ic->journal_io_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->sk_requests = kvmalloc_array(ic->journal_sections, + sizeof(struct skcipher_request *), + GFP_KERNEL | __GFP_ZERO); + if (!ic->sk_requests) { + *error = "Unable to allocate sk requests"; + r = -ENOMEM; + goto bad; + } + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist sg; + struct skcipher_request *section_req; + __le32 section_le = cpu_to_le32(i); + + memset(crypt_iv, 0x00, ivsize); + memset(crypt_data, 0x00, crypt_len); + memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); + + sg_init_one(&sg, crypt_data, crypt_len); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv); + init_completion(&comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to generate iv"; + goto bad; + } + + section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!section_req) { + *error = "Unable to allocate crypt request"; + r = -ENOMEM; + goto bad; + } + section_req->iv = kmalloc_array(ivsize, 2, + GFP_KERNEL); + if (!section_req->iv) { + skcipher_request_free(section_req); + *error = "Unable to allocate iv"; + r = -ENOMEM; + goto bad; + } + memcpy(section_req->iv + ivsize, crypt_data, ivsize); + section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; + ic->sk_requests[i] = section_req; + DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); + } + } + } + + for (i = 0; i < N_COMMIT_IDS; i++) { + unsigned int j; +retest_commit_id: + for (j = 0; j < i; j++) { + if (ic->commit_ids[j] == ic->commit_ids[i]) { + ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); + goto retest_commit_id; + } + } + DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); + } + + journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); + if (journal_tree_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL); + if (!ic->journal_tree) { + *error = "Could not allocate memory for journal tree"; + r = -ENOMEM; + } +bad: + kfree(crypt_data); + kfree(crypt_iv); + skcipher_request_free(req); + + return r; +} + +/* + * Construct a integrity mapping + * + * Arguments: + * device + * offset from the start of the device + * tag size + * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode + * number of optional arguments + * optional arguments: + * journal_sectors + * interleave_sectors + * buffer_sectors + * journal_watermark + * commit_time + * meta_device + * block_size + * sectors_per_bit + * bitmap_flush_interval + * internal_hash + * journal_crypt + * journal_mac + * recalculate + */ +static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_integrity_c *ic; + char dummy; + int r; + unsigned int extra_args; + struct dm_arg_set as; + static const struct dm_arg _args[] = { + {0, 18, "Invalid number of feature args"}, + }; + unsigned int journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool should_write_sb; + __u64 threshold; + unsigned long long start; + __s8 log2_sectors_per_bitmap_bit = -1; + __s8 log2_blocks_per_bitmap_bit; + __u64 bits_in_journal; + __u64 n_bitmap_bits; + +#define DIRECT_ARGUMENTS 4 + + if (argc <= DIRECT_ARGUMENTS) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL); + if (!ic) { + ti->error = "Cannot allocate integrity context"; + return -ENOMEM; + } + ti->private = ic; + ti->per_io_data_size = sizeof(struct dm_integrity_io); + ic->ti = ti; + + ic->in_progress = RB_ROOT; + INIT_LIST_HEAD(&ic->wait_list); + init_waitqueue_head(&ic->endio_wait); + bio_list_init(&ic->flush_bio_list); + init_waitqueue_head(&ic->copy_to_journal_wait); + init_completion(&ic->crypto_backoff); + atomic64_set(&ic->number_of_mismatches, 0); + ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL; + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { + ti->error = "Invalid starting offset"; + r = -EINVAL; + goto bad; + } + ic->start = start; + + if (strcmp(argv[2], "-")) { + if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) { + ti->error = "Invalid tag size"; + r = -EINVAL; + goto bad; + } + } + + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || + !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { + ic->mode = argv[3][0]; + } else { + ti->error = "Invalid mode (expecting J, B, D, R)"; + r = -EINVAL; + goto bad; + } + + journal_sectors = 0; + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + buffer_sectors = DEFAULT_BUFFER_SECTORS; + journal_watermark = DEFAULT_JOURNAL_WATERMARK; + sync_msec = DEFAULT_SYNC_MSEC; + ic->sectors_per_block = 1; + + as.argc = argc - DIRECT_ARGUMENTS; + as.argv = argv + DIRECT_ARGUMENTS; + r = dm_read_arg_group(_args, &as, &extra_args, &ti->error); + if (r) + goto bad; + + while (extra_args--) { + const char *opt_string; + unsigned int val; + unsigned long long llval; + opt_string = dm_shift_arg(&as); + if (!opt_string) { + r = -EINVAL; + ti->error = "Not enough feature arguments"; + goto bad; + } + if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) + journal_sectors = val ? val : 1; + else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) + interleave_sectors = val; + else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) + buffer_sectors = val; + else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100) + journal_watermark = val; + else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) + sync_msec = val; + else if (!strncmp(opt_string, "meta_device:", strlen("meta_device:"))) { + if (ic->meta_dev) { + dm_put_device(ti, ic->meta_dev); + ic->meta_dev = NULL; + } + r = dm_get_device(ti, strchr(opt_string, ':') + 1, + dm_table_get_mode(ti->table), &ic->meta_dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { + if (val < 1 << SECTOR_SHIFT || + val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || + (val & (val -1))) { + r = -EINVAL; + ti->error = "Invalid block_size argument"; + goto bad; + } + ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { + log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); + } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { + if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + r = -EINVAL; + ti->error = "Invalid bitmap_flush_interval argument"; + goto bad; + } + ic->bitmap_flush_interval = msecs_to_jiffies(val); + } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { + r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, + "Invalid internal_hash argument"); + if (r) + goto bad; + } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) { + r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, + "Invalid journal_crypt argument"); + if (r) + goto bad; + } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + "Invalid journal_mac argument"); + if (r) + goto bad; + } else if (!strcmp(opt_string, "recalculate")) { + ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "reset_recalculate")) { + ic->recalculate_flag = true; + ic->reset_recalculate_flag = true; + } else if (!strcmp(opt_string, "allow_discards")) { + ic->discard = true; + } else if (!strcmp(opt_string, "fix_padding")) { + ic->fix_padding = true; + } else if (!strcmp(opt_string, "fix_hmac")) { + ic->fix_hmac = true; + } else if (!strcmp(opt_string, "legacy_recalculate")) { + ic->legacy_recalculate = true; + } else { + r = -EINVAL; + ti->error = "Invalid argument"; + goto bad; + } + } + + ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev); + if (!ic->meta_dev) + ic->meta_device_sectors = ic->data_device_sectors; + else + ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev); + + if (!journal_sectors) { + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); + + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + "Invalid internal hash", "Error setting internal hash key"); + if (r) + goto bad; + + r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + "Invalid journal mac", "Error setting journal mac key"); + if (r) + goto bad; + + if (!ic->tag_size) { + if (!ic->internal_hash) { + ti->error = "Unknown tag size"; + r = -EINVAL; + goto bad; + } + ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + } + if (ic->tag_size > MAX_TAG_SIZE) { + ti->error = "Too big tag size"; + r = -EINVAL; + goto bad; + } + if (!(ic->tag_size & (ic->tag_size - 1))) + ic->log2_tag_size = __ffs(ic->tag_size); + else + ic->log2_tag_size = -1; + + if (ic->mode == 'B' && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Bitmap mode can be only used with internal hash"; + goto bad; + } + + if (ic->discard && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Discard can be only used with internal hash"; + goto bad; + } + + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); + ic->autocommit_msec = sync_msec; + timer_setup(&ic->autocommit_timer, autocommit_fn, 0); + + ic->io = dm_io_client_create(); + if (IS_ERR(ic->io)) { + r = PTR_ERR(ic->io); + ic->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + + r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", + WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->metadata_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + /* + * If this workqueue were percpu, it would cause bio reordering + * and reduced performance. + */ + ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!ic->wait_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM, + METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->offload_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); + if (!ic->commit_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->commit_work, integrity_commit); + + if (ic->mode == 'J' || ic->mode == 'B') { + ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); + if (!ic->writer_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->writer_work, integrity_writer); + } + + ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL); + if (!ic->sb) { + r = -ENOMEM; + ti->error = "Cannot allocate superblock area"; + goto bad; + } + + r = sync_rw_sb(ic, REQ_OP_READ); + if (r) { + ti->error = "Error reading superblock"; + goto bad; + } + should_write_sb = false; + if (memcmp(ic->sb->magic, SB_MAGIC, 8)) { + if (ic->mode != 'R') { + if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; + } + } + + r = initialize_superblock(ic, journal_sectors, interleave_sectors); + if (r) { + ti->error = "Could not initialize superblock"; + goto bad; + } + if (ic->mode != 'R') + should_write_sb = true; + } + + if (!ic->sb->version || ic->sb->version > SB_VERSION_5) { + r = -EINVAL; + ti->error = "Unknown version"; + goto bad; + } + if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { + r = -EINVAL; + ti->error = "Tag size doesn't match the information in superblock"; + goto bad; + } + if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) { + r = -EINVAL; + ti->error = "Block size doesn't match the information in superblock"; + goto bad; + } + if (!le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; + ti->error = "Corrupted superblock, journal_sections is 0"; + goto bad; + } + /* make sure that ti->max_io_len doesn't overflow */ + if (!ic->meta_dev) { + if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + } else { + if (ic->sb->log2_interleave_sectors) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal mac mismatch"; + goto bad; + } + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) { + r = -EINVAL; + ti->error = "The device is too small"; + goto bad; + } + +try_smaller_buffer: + r = calculate_device_limits(ic); + if (r) { + if (ic->meta_dev) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + } + ti->error = "The device is too small"; + goto bad; + } + + if (log2_sectors_per_bitmap_bit < 0) + log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT); + if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block) + log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block; + + bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); + if (bits_in_journal > UINT_MAX) + bits_in_journal = UINT_MAX; + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; + + log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; + ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + if (should_write_sb) { + ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + } + n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) + + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit; + ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8); + + if (!ic->meta_dev) + ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); + + if (ti->len > ic->provided_data_sectors) { + r = -EINVAL; + ti->error = "Not enough provided sectors for requested mapping size"; + goto bad; + } + + + threshold = (__u64)ic->journal_entries * (100 - journal_watermark); + threshold += 50; + do_div(threshold, 100); + ic->free_sectors_threshold = threshold; + + DEBUG_print("initialized:\n"); + DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size)); + DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size); + DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector); + DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries); + DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors); + DEBUG_print(" journal_sections %u\n", (unsigned int)le32_to_cpu(ic->sb->journal_sections)); + DEBUG_print(" journal_entries %u\n", ic->journal_entries); + DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); + DEBUG_print(" data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev)); + DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); + DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); + DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors); + DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + DEBUG_print(" bits_in_journal %llu\n", bits_in_journal); + + if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + + if (ic->internal_hash) { + size_t recalc_tags_size; + ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); + if (!ic->recalc_wq ) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->recalc_work, integrity_recalc); + ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); + if (!ic->recalc_buffer) { + ti->error = "Cannot allocate buffer for recalculating"; + r = -ENOMEM; + goto bad; + } + recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; + if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) + recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL); + if (!ic->recalc_tags) { + ti->error = "Cannot allocate tags for recalculating"; + r = -ENOMEM; + goto bad; + } + } else { + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + ti->error = "Recalculate can only be specified with internal_hash"; + r = -EINVAL; + goto bad; + } + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors && + dm_integrity_disable_recalculate(ic)) { + ti->error = "Recalculating with HMAC is disabled for security reasons - if you really need it, use the argument \"legacy_recalculate\""; + r = -EOPNOTSUPP; + goto bad; + } + + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); + + if (ic->mode != 'R') { + r = create_journal(ic, &ti->error); + if (r) + goto bad; + + } + + if (ic->mode == 'B') { + unsigned int i; + unsigned int n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + + ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->recalc_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->may_write_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); + if (!ic->bbs) { + r = -ENOMEM; + goto bad; + } + INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work); + for (i = 0; i < ic->n_bitmap_blocks; i++) { + struct bitmap_block_status *bbs = &ic->bbs[i]; + unsigned int sector, pl_index, pl_offset; + + INIT_WORK(&bbs->work, bitmap_block_work); + bbs->ic = ic; + bbs->idx = i; + bio_list_init(&bbs->bio_queue); + spin_lock_init(&bbs->bio_queue_lock); + + sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT); + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset; + } + } + + if (should_write_sb) { + init_journal(ic, 0, ic->journal_sections, 0); + r = dm_integrity_failed(ic); + if (unlikely(r)) { + ti->error = "Error initializing journal"; + goto bad; + } + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (r) { + ti->error = "Error initializing superblock"; + goto bad; + } + ic->just_formatted = true; + } + + if (!ic->meta_dev) { + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + } + if (ic->mode == 'B') { + unsigned int max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8); + if (!max_io_len) + max_io_len = 1U << 31; + DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len); + if (!ti->max_io_len || ti->max_io_len > max_io_len) { + r = dm_set_target_max_io_len(ti, max_io_len); + if (r) + goto bad; + } + } + + if (!ic->internal_hash) + dm_integrity_set(ti, ic); + + ti->num_flush_bios = 1; + ti->flush_supported = true; + if (ic->discard) + ti->num_discard_bios = 1; + + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); + return 0; + +bad: + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 0); + dm_integrity_dtr(ti); + return r; +} + +static void dm_integrity_dtr(struct dm_target *ti) +{ + struct dm_integrity_c *ic = ti->private; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + BUG_ON(!list_empty(&ic->wait_list)); + + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + if (ic->metadata_wq) + destroy_workqueue(ic->metadata_wq); + if (ic->wait_wq) + destroy_workqueue(ic->wait_wq); + if (ic->offload_wq) + destroy_workqueue(ic->offload_wq); + if (ic->commit_wq) + destroy_workqueue(ic->commit_wq); + if (ic->writer_wq) + destroy_workqueue(ic->writer_wq); + if (ic->recalc_wq) + destroy_workqueue(ic->recalc_wq); + vfree(ic->recalc_buffer); + kvfree(ic->recalc_tags); + kvfree(ic->bbs); + if (ic->bufio) + dm_bufio_client_destroy(ic->bufio); + mempool_exit(&ic->journal_io_mempool); + if (ic->io) + dm_io_client_destroy(ic->io); + if (ic->dev) + dm_put_device(ti, ic->dev); + if (ic->meta_dev) + dm_put_device(ti, ic->meta_dev); + dm_integrity_free_page_list(ic->journal); + dm_integrity_free_page_list(ic->journal_io); + dm_integrity_free_page_list(ic->journal_xor); + dm_integrity_free_page_list(ic->recalc_bitmap); + dm_integrity_free_page_list(ic->may_write_bitmap); + if (ic->journal_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); + if (ic->journal_io_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist); + if (ic->sk_requests) { + unsigned int i; + + for (i = 0; i < ic->journal_sections; i++) { + struct skcipher_request *req = ic->sk_requests[i]; + if (req) { + kfree_sensitive(req->iv); + skcipher_request_free(req); + } + } + kvfree(ic->sk_requests); + } + kvfree(ic->journal_tree); + if (ic->sb) + free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); + + if (ic->internal_hash) + crypto_free_shash(ic->internal_hash); + free_alg(&ic->internal_hash_alg); + + if (ic->journal_crypt) + crypto_free_skcipher(ic->journal_crypt); + free_alg(&ic->journal_crypt_alg); + + if (ic->journal_mac) + crypto_free_shash(ic->journal_mac); + free_alg(&ic->journal_mac_alg); + + kfree(ic); + dm_audit_log_dtr(DM_MSG_PREFIX, ti, 1); +} + +static struct target_type integrity_target = { + .name = "integrity", + .version = {1, 10, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, + .ctr = dm_integrity_ctr, + .dtr = dm_integrity_dtr, + .map = dm_integrity_map, + .postsuspend = dm_integrity_postsuspend, + .resume = dm_integrity_resume, + .status = dm_integrity_status, + .iterate_devices = dm_integrity_iterate_devices, + .io_hints = dm_integrity_io_hints, +}; + +static int __init dm_integrity_init(void) +{ + int r; + + journal_io_cache = kmem_cache_create("integrity_journal_io", + sizeof(struct journal_io), 0, 0, NULL); + if (!journal_io_cache) { + DMERR("can't allocate journal io cache"); + return -ENOMEM; + } + + r = dm_register_target(&integrity_target); + if (r < 0) { + DMERR("register failed %d", r); + kmem_cache_destroy(journal_io_cache); + return r; + } + + return 0; +} + +static void __exit dm_integrity_exit(void) +{ + dm_unregister_target(&integrity_target); + kmem_cache_destroy(journal_io_cache); +} + +module_init(dm_integrity_init); +module_exit(dm_integrity_exit); + +MODULE_AUTHOR("Milan Broz"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-io-rewind.c b/drivers/md/dm-io-rewind.c new file mode 100644 index 000000000..773c4cff8 --- /dev/null +++ b/drivers/md/dm-io-rewind.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022 Red Hat, Inc. + */ + +#include +#include +#include + +#include "dm-core.h" + +static inline bool dm_bvec_iter_rewind(const struct bio_vec *bv, + struct bvec_iter *iter, + unsigned int bytes) +{ + int idx; + + iter->bi_size += bytes; + if (bytes <= iter->bi_bvec_done) { + iter->bi_bvec_done -= bytes; + return true; + } + + bytes -= iter->bi_bvec_done; + idx = iter->bi_idx - 1; + + while (idx >= 0 && bytes && bytes > bv[idx].bv_len) { + bytes -= bv[idx].bv_len; + idx--; + } + + if (WARN_ONCE(idx < 0 && bytes, + "Attempted to rewind iter beyond bvec's boundaries\n")) { + iter->bi_size -= bytes; + iter->bi_bvec_done = 0; + iter->bi_idx = 0; + return false; + } + + iter->bi_idx = idx; + iter->bi_bvec_done = bv[idx].bv_len - bytes; + return true; +} + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +/** + * dm_bio_integrity_rewind - Rewind integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes to rewind + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and rewind the + * integrity vector accordingly. + */ +static void dm_bio_integrity_rewind(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bip->bip_iter.bi_sector -= bio_integrity_intervals(bi, bytes_done >> 9); + dm_bvec_iter_rewind(bip->bip_vec, &bip->bip_iter, bytes); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline void dm_bio_integrity_rewind(struct bio *bio, + unsigned int bytes_done) +{ + return; +} + +#endif + +#if defined(CONFIG_BLK_INLINE_ENCRYPTION) + +/* Decrements @dun by @dec, treating @dun as a multi-limb integer. */ +static void dm_bio_crypt_dun_decrement(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int dec) +{ + int i; + + for (i = 0; dec && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + u64 prev = dun[i]; + + dun[i] -= dec; + if (dun[i] > prev) + dec = 1; + else + dec = 0; + } +} + +static void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + dm_bio_crypt_dun_decrement(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + return; +} + +#endif + +static inline void dm_bio_rewind_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) +{ + iter->bi_sector -= bytes >> 9; + + /* No advance means no rewind */ + if (bio_no_advance_iter(bio)) + iter->bi_size += bytes; + else + dm_bvec_iter_rewind(bio->bi_io_vec, iter, bytes); +} + +/** + * dm_bio_rewind - update ->bi_iter of @bio by rewinding @bytes. + * @bio: bio to rewind + * @bytes: how many bytes to rewind + * + * WARNING: + * Caller must ensure that @bio has a fixed end sector, to allow + * rewinding from end of bio and restoring its original position. + * Caller is also responsibile for restoring bio's size. + */ +static void dm_bio_rewind(struct bio *bio, unsigned int bytes) +{ + if (bio_integrity(bio)) + dm_bio_integrity_rewind(bio, bytes); + + if (bio_has_crypt_ctx(bio)) + dm_bio_crypt_rewind(bio, bytes); + + dm_bio_rewind_iter(bio, &bio->bi_iter, bytes); +} + +void dm_io_rewind(struct dm_io *io, struct bio_set *bs) +{ + struct bio *orig = io->orig_bio; + struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig, + GFP_NOIO, bs); + /* + * dm_bio_rewind can restore to previous position since the + * end sector is fixed for original bio, but we still need + * to restore bio's size manually (using io->sectors). + */ + dm_bio_rewind(new_orig, ((io->sector_offset << 9) - + orig->bi_iter.bi_size)); + bio_trim(new_orig, 0, io->sectors); + + bio_chain(new_orig, orig); + /* + * __bi_remaining was increased (by dm_split_and_process_bio), + * so must drop the one added in bio_chain. + */ + atomic_dec(&orig->__bi_remaining); + io->orig_bio = new_orig; +} diff --git a/drivers/md/dm-io-tracker.h b/drivers/md/dm-io-tracker.h new file mode 100644 index 000000000..bdcc6273e --- /dev/null +++ b/drivers/md/dm-io-tracker.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2021 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_IO_TRACKER_H +#define DM_IO_TRACKER_H + +#include + +struct dm_io_tracker { + spinlock_t lock; + + /* + * Sectors of in-flight IO. + */ + sector_t in_flight; + + /* + * The time, in jiffies, when this device became idle + * (if it is indeed idle). + */ + unsigned long idle_time; + unsigned long last_update_time; +}; + +static inline void dm_iot_init(struct dm_io_tracker *iot) +{ + spin_lock_init(&iot->lock); + iot->in_flight = 0ul; + iot->idle_time = 0ul; + iot->last_update_time = jiffies; +} + +static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j) +{ + bool r = false; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = time_after(jiffies, iot->idle_time + j); + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot) +{ + unsigned long r = 0; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = jiffies - iot->idle_time; + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len) +{ + spin_lock_irq(&iot->lock); + iot->in_flight += len; + spin_unlock_irq(&iot->lock); +} + +static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len) +{ + unsigned long flags; + + if (!len) + return; + + spin_lock_irqsave(&iot->lock, flags); + iot->in_flight -= len; + if (!iot->in_flight) + iot->idle_time = jiffies; + spin_unlock_irqrestore(&iot->lock, flags); +} + +#endif diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c new file mode 100644 index 000000000..e488b05e3 --- /dev/null +++ b/drivers/md/dm-io.c @@ -0,0 +1,544 @@ +/* + * Copyright (C) 2003 Sistina Software + * Copyright (C) 2006 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-core.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "io" + +#define DM_IO_MAX_REGIONS BITS_PER_LONG + +struct dm_io_client { + mempool_t pool; + struct bio_set bios; +}; + +/* + * Aligning 'struct io' reduces the number of bits required to store + * its address. Refer to store_io_and_region_in_bio() below. + */ +struct io { + unsigned long error_bits; + atomic_t count; + struct dm_io_client *client; + io_notify_fn callback; + void *context; + void *vma_invalidate_address; + unsigned long vma_invalidate_size; +} __attribute__((aligned(DM_IO_MAX_REGIONS))); + +static struct kmem_cache *_dm_io_cache; + +/* + * Create a client with mempool and bioset. + */ +struct dm_io_client *dm_io_client_create(void) +{ + struct dm_io_client *client; + unsigned int min_ios = dm_get_reserved_bio_based_ios(); + int ret; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return ERR_PTR(-ENOMEM); + + ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache); + if (ret) + goto bad; + + ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS); + if (ret) + goto bad; + + return client; + + bad: + mempool_exit(&client->pool); + kfree(client); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(dm_io_client_create); + +void dm_io_client_destroy(struct dm_io_client *client) +{ + mempool_exit(&client->pool); + bioset_exit(&client->bios); + kfree(client); +} +EXPORT_SYMBOL(dm_io_client_destroy); + +/*----------------------------------------------------------------- + * We need to keep track of which region a bio is doing io for. + * To avoid a memory allocation to store just 5 or 6 bits, we + * ensure the 'struct io' pointer is aligned so enough low bits are + * always zero and then combine it with the region number directly in + * bi_private. + *---------------------------------------------------------------*/ +static void store_io_and_region_in_bio(struct bio *bio, struct io *io, + unsigned int region) +{ + if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { + DMCRIT("Unaligned struct io pointer %p", io); + BUG(); + } + + bio->bi_private = (void *)((unsigned long)io | region); +} + +static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, + unsigned int *region) +{ + unsigned long val = (unsigned long)bio->bi_private; + + *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); + *region = val & (DM_IO_MAX_REGIONS - 1); +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bios that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void complete_io(struct io *io) +{ + unsigned long error_bits = io->error_bits; + io_notify_fn fn = io->callback; + void *context = io->context; + + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + + mempool_free(io, &io->client->pool); + fn(error_bits, context); +} + +static void dec_count(struct io *io, unsigned int region, blk_status_t error) +{ + if (error) + set_bit(region, &io->error_bits); + + if (atomic_dec_and_test(&io->count)) + complete_io(io); +} + +static void endio(struct bio *bio) +{ + struct io *io; + unsigned int region; + blk_status_t error; + + if (bio->bi_status && bio_data_dir(bio) == READ) + zero_fill_bio(bio); + + /* + * The bio destructor in bio_put() may use the io object. + */ + retrieve_io_and_region_from_bio(bio, &io, ®ion); + + error = bio->bi_status; + bio_put(bio); + + dec_count(io, region, error); +} + +/*----------------------------------------------------------------- + * These little objects provide an abstraction for getting a new + * destination page for io. + *---------------------------------------------------------------*/ +struct dpages { + void (*get_page)(struct dpages *dp, + struct page **p, unsigned long *len, unsigned int *offset); + void (*next_page)(struct dpages *dp); + + union { + unsigned int context_u; + struct bvec_iter context_bi; + }; + void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; +}; + +/* + * Functions for getting the pages from a list. + */ +static void list_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned int *offset) +{ + unsigned int o = dp->context_u; + struct page_list *pl = (struct page_list *) dp->context_ptr; + + *p = pl->page; + *len = PAGE_SIZE - o; + *offset = o; +} + +static void list_next_page(struct dpages *dp) +{ + struct page_list *pl = (struct page_list *) dp->context_ptr; + dp->context_ptr = pl->next; + dp->context_u = 0; +} + +static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned int offset) +{ + dp->get_page = list_get_page; + dp->next_page = list_next_page; + dp->context_u = offset; + dp->context_ptr = pl; +} + +/* + * Functions for getting the pages from a bvec. + */ +static void bio_get_page(struct dpages *dp, struct page **p, + unsigned long *len, unsigned int *offset) +{ + struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr, + dp->context_bi); + + *p = bvec.bv_page; + *len = bvec.bv_len; + *offset = bvec.bv_offset; + + /* avoid figuring it out again in bio_next_page() */ + dp->context_bi.bi_sector = (sector_t)bvec.bv_len; +} + +static void bio_next_page(struct dpages *dp) +{ + unsigned int len = (unsigned int)dp->context_bi.bi_sector; + + bvec_iter_advance((struct bio_vec *)dp->context_ptr, + &dp->context_bi, len); +} + +static void bio_dp_init(struct dpages *dp, struct bio *bio) +{ + dp->get_page = bio_get_page; + dp->next_page = bio_next_page; + + /* + * We just use bvec iterator to retrieve pages, so it is ok to + * access the bvec table directly here + */ + dp->context_ptr = bio->bi_io_vec; + dp->context_bi = bio->bi_iter; +} + +/* + * Functions for getting the pages from a VMA. + */ +static void vm_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned int *offset) +{ + *p = vmalloc_to_page(dp->context_ptr); + *offset = dp->context_u; + *len = PAGE_SIZE - dp->context_u; +} + +static void vm_next_page(struct dpages *dp) +{ + dp->context_ptr += PAGE_SIZE - dp->context_u; + dp->context_u = 0; +} + +static void vm_dp_init(struct dpages *dp, void *data) +{ + dp->get_page = vm_get_page; + dp->next_page = vm_next_page; + dp->context_u = offset_in_page(data); + dp->context_ptr = data; +} + +/* + * Functions for getting the pages from kernel memory. + */ +static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len, + unsigned int *offset) +{ + *p = virt_to_page(dp->context_ptr); + *offset = dp->context_u; + *len = PAGE_SIZE - dp->context_u; +} + +static void km_next_page(struct dpages *dp) +{ + dp->context_ptr += PAGE_SIZE - dp->context_u; + dp->context_u = 0; +} + +static void km_dp_init(struct dpages *dp, void *data) +{ + dp->get_page = km_get_page; + dp->next_page = km_next_page; + dp->context_u = offset_in_page(data); + dp->context_ptr = data; +} + +/*----------------------------------------------------------------- + * IO routines that accept a list of pages. + *---------------------------------------------------------------*/ +static void do_region(const blk_opf_t opf, unsigned int region, + struct dm_io_region *where, struct dpages *dp, + struct io *io) +{ + struct bio *bio; + struct page *page; + unsigned long len; + unsigned int offset; + unsigned int num_bvecs; + sector_t remaining = where->count; + struct request_queue *q = bdev_get_queue(where->bdev); + sector_t num_sectors; + unsigned int special_cmd_max_sectors; + const enum req_op op = opf & REQ_OP_MASK; + + /* + * Reject unsupported discard and write same requests. + */ + if (op == REQ_OP_DISCARD) + special_cmd_max_sectors = bdev_max_discard_sectors(where->bdev); + else if (op == REQ_OP_WRITE_ZEROES) + special_cmd_max_sectors = q->limits.max_write_zeroes_sectors; + if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) && + special_cmd_max_sectors == 0) { + atomic_inc(&io->count); + dec_count(io, region, BLK_STS_NOTSUPP); + return; + } + + /* + * where->count may be zero if op holds a flush and we need to + * send a zero-sized flush. + */ + do { + /* + * Allocate a suitably sized-bio. + */ + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + num_bvecs = 0; + break; + default: + num_bvecs = bio_max_segs(dm_sector_div_up(remaining, + (PAGE_SIZE >> SECTOR_SHIFT))); + } + + bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO, + &io->client->bios); + bio->bi_iter.bi_sector = where->sector + (where->count - remaining); + bio->bi_end_io = endio; + store_io_and_region_in_bio(bio, io, region); + + if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) { + num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); + bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; + remaining -= num_sectors; + } else while (remaining) { + /* + * Try and add as many pages as possible. + */ + dp->get_page(dp, &page, &len, &offset); + len = min(len, to_bytes(remaining)); + if (!bio_add_page(bio, page, len, offset)) + break; + + offset = 0; + remaining -= to_sector(len); + dp->next_page(dp); + } + + atomic_inc(&io->count); + submit_bio(bio); + } while (remaining); +} + +static void dispatch_io(blk_opf_t opf, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, + struct io *io, int sync) +{ + int i; + struct dpages old_pages = *dp; + + BUG_ON(num_regions > DM_IO_MAX_REGIONS); + + if (sync) + opf |= REQ_SYNC; + + /* + * For multiple regions we need to be careful to rewind + * the dp object for each call to do_region. + */ + for (i = 0; i < num_regions; i++) { + *dp = old_pages; + if (where[i].count || (opf & REQ_PREFLUSH)) + do_region(opf, i, where + i, dp, io); + } + + /* + * Drop the extra reference that we were holding to avoid + * the io being completed too early. + */ + dec_count(io, 0, 0); +} + +struct sync_io { + unsigned long error_bits; + struct completion wait; +}; + +static void sync_io_complete(unsigned long error, void *context) +{ + struct sync_io *sio = context; + + sio->error_bits = error; + complete(&sio->wait); +} + +static int sync_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, + unsigned long *error_bits) +{ + struct io *io; + struct sync_io sio; + + if (num_regions > 1 && !op_is_write(opf)) { + WARN_ON(1); + return -EIO; + } + + init_completion(&sio.wait); + + io = mempool_alloc(&client->pool, GFP_NOIO); + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->client = client; + io->callback = sync_io_complete; + io->context = &sio; + + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + + dispatch_io(opf, num_regions, where, dp, io, 1); + + wait_for_completion_io(&sio.wait); + + if (error_bits) + *error_bits = sio.error_bits; + + return sio.error_bits ? -EIO : 0; +} + +static int async_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, blk_opf_t opf, + struct dpages *dp, io_notify_fn fn, void *context) +{ + struct io *io; + + if (num_regions > 1 && !op_is_write(opf)) { + WARN_ON(1); + fn(1, context); + return -EIO; + } + + io = mempool_alloc(&client->pool, GFP_NOIO); + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->client = client; + io->callback = fn; + io->context = context; + + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + + dispatch_io(opf, num_regions, where, dp, io, 0); + return 0; +} + +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) +{ + /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + + switch (io_req->mem.type) { + case DM_IO_PAGE_LIST: + list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); + break; + + case DM_IO_BIO: + bio_dp_init(dp, io_req->mem.ptr.bio); + break; + + case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_opf & REQ_OP_MASK) == REQ_OP_READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } + vm_dp_init(dp, io_req->mem.ptr.vma); + break; + + case DM_IO_KMEM: + km_dp_init(dp, io_req->mem.ptr.addr); + break; + + default: + return -EINVAL; + } + + return 0; +} + +int dm_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, unsigned long *sync_error_bits) +{ + int r; + struct dpages dp; + + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); + if (r) + return r; + + if (!io_req->notify.fn) + return sync_io(io_req->client, num_regions, where, + io_req->bi_opf, &dp, sync_error_bits); + + return async_io(io_req->client, num_regions, where, + io_req->bi_opf, &dp, io_req->notify.fn, + io_req->notify.context); +} +EXPORT_SYMBOL(dm_io); + +int __init dm_io_init(void) +{ + _dm_io_cache = KMEM_CACHE(io, 0); + if (!_dm_io_cache) + return -ENOMEM; + + return 0; +} + +void dm_io_exit(void) +{ + kmem_cache_destroy(_dm_io_cache); + _dm_io_cache = NULL; +} diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c new file mode 100644 index 000000000..206e6ce55 --- /dev/null +++ b/drivers/md/dm-ioctl.c @@ -0,0 +1,2271 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-core.h" +#include "dm-ima.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DM_MSG_PREFIX "ioctl" +#define DM_DRIVER_EMAIL "dm-devel@redhat.com" + +struct dm_file { + /* + * poll will wait until the global event number is greater than + * this value. + */ + volatile unsigned int global_event_nr; +}; + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. + *---------------------------------------------------------------*/ +struct hash_cell { + struct rb_node name_node; + struct rb_node uuid_node; + bool name_set; + bool uuid_set; + + char *name; + char *uuid; + struct mapped_device *md; + struct dm_table *new_map; +}; + +struct vers_iter { + size_t param_size; + struct dm_target_versions *vers, *old_vers; + char *end; + uint32_t flags; +}; + + +static struct rb_root name_rb_tree = RB_ROOT; +static struct rb_root uuid_rb_tree = RB_ROOT; + +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); + +/* + * Guards access to both hash tables. + */ +static DECLARE_RWSEM(_hash_lock); + +/* + * Protects use of mdptr to obtain hash cell name and uuid from mapped device. + */ +static DEFINE_MUTEX(dm_hash_cells_mutex); + +static void dm_hash_exit(void) +{ + dm_hash_remove_all(false, false, false); +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct rb_node *n = name_rb_tree.rb_node; + + while (n) { + struct hash_cell *hc = container_of(n, struct hash_cell, name_node); + int c = strcmp(hc->name, str); + if (!c) { + dm_get(hc->md); + return hc; + } + n = c >= 0 ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct rb_node *n = uuid_rb_tree.rb_node; + + while (n) { + struct hash_cell *hc = container_of(n, struct hash_cell, uuid_node); + int c = strcmp(hc->uuid, str); + if (!c) { + dm_get(hc->md); + return hc; + } + n = c >= 0 ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static void __unlink_name(struct hash_cell *hc) +{ + if (hc->name_set) { + hc->name_set = false; + rb_erase(&hc->name_node, &name_rb_tree); + } +} + +static void __unlink_uuid(struct hash_cell *hc) +{ + if (hc->uuid_set) { + hc->uuid_set = false; + rb_erase(&hc->uuid_node, &uuid_rb_tree); + } +} + +static void __link_name(struct hash_cell *new_hc) +{ + struct rb_node **n, *parent; + + __unlink_name(new_hc); + + new_hc->name_set = true; + + n = &name_rb_tree.rb_node; + parent = NULL; + + while (*n) { + struct hash_cell *hc = container_of(*n, struct hash_cell, name_node); + int c = strcmp(hc->name, new_hc->name); + BUG_ON(!c); + parent = *n; + n = c >= 0 ? &hc->name_node.rb_left : &hc->name_node.rb_right; + } + + rb_link_node(&new_hc->name_node, parent, n); + rb_insert_color(&new_hc->name_node, &name_rb_tree); +} + +static void __link_uuid(struct hash_cell *new_hc) +{ + struct rb_node **n, *parent; + + __unlink_uuid(new_hc); + + new_hc->uuid_set = true; + + n = &uuid_rb_tree.rb_node; + parent = NULL; + + while (*n) { + struct hash_cell *hc = container_of(*n, struct hash_cell, uuid_node); + int c = strcmp(hc->uuid, new_hc->uuid); + BUG_ON(!c); + parent = *n; + n = c > 0 ? &hc->uuid_node.rb_left : &hc->uuid_node.rb_right; + } + + rb_link_node(&new_hc->uuid_node, parent, n); + rb_insert_color(&new_hc->uuid_node, &uuid_rb_tree); +} + +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. + *---------------------------------------------------------------*/ +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name, GFP_KERNEL); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid, GFP_KERNEL); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + hc->name_set = hc->uuid_set = false; + hc->md = md; + hc->new_map = NULL; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell, *hc; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into both hash tables. + */ + down_write(&_hash_lock); + hc = __get_name_cell(name); + if (hc) { + dm_put(hc->md); + goto bad; + } + + __link_name(cell); + + if (uuid) { + hc = __get_uuid_cell(uuid); + if (hc) { + __unlink_name(cell); + dm_put(hc->md); + goto bad; + } + __link_uuid(cell); + } + dm_get(md); + mutex_lock(&dm_hash_cells_mutex); + dm_set_mdptr(md, cell); + mutex_unlock(&dm_hash_cells_mutex); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +static struct dm_table *__hash_remove(struct hash_cell *hc) +{ + struct dm_table *table; + int srcu_idx; + + /* remove from the dev trees */ + __unlink_name(hc); + __unlink_uuid(hc); + mutex_lock(&dm_hash_cells_mutex); + dm_set_mdptr(hc->md, NULL); + mutex_unlock(&dm_hash_cells_mutex); + + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) + dm_table_event(table); + dm_put_live_table(hc->md, srcu_idx); + + table = NULL; + if (hc->new_map) + table = hc->new_map; + dm_put(hc->md); + free_cell(hc); + + return table; +} + +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) +{ + int dev_skipped; + struct rb_node *n; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *t; + +retry: + dev_skipped = 0; + + down_write(&_hash_lock); + + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + hc = container_of(n, struct hash_cell, name_node); + md = hc->md; + dm_get(md); + + if (keep_open_devices && + dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + dm_put(md); + dev_skipped++; + continue; + } + + t = __hash_remove(hc); + + up_write(&_hash_lock); + + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + dm_ima_measure_on_device_remove(md, true); + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; + } + + up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); +} + +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + __link_uuid(hc); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + __unlink_name(hc); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + __link_name(hc); + + return old_name; +} + +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) +{ + char *new_data, *old_name = NULL; + struct hash_cell *hc; + struct dm_table *table; + struct mapped_device *md; + unsigned int change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + int srcu_idx; + + /* + * duplicate new. + */ + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) + return ERR_PTR(-ENOMEM); + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + + if (hc) { + DMERR("Unable to change %s on mapped device %s to one that already exists: %s", + change_uuid ? "uuid" : "name", + param->name, new); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EBUSY); + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(param->name); + if (!hc) { + DMERR("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-ENXIO); + } + + /* + * Does this device already have a uuid? + */ + if (change_uuid && hc->uuid) { + DMERR("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); + + /* + * Wake up any dm event waiters. + */ + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) + dm_table_event(table); + dm_put_live_table(hc->md, srcu_idx); + + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr, false)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + + md = hc->md; + + dm_ima_measure_on_device_rename(md); + + up_write(&_hash_lock); + kfree(old_name); + + return md; +} + +void dm_deferred_remove(void) +{ + dm_hash_remove_all(true, false, true); +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); + param->data_size = 0; + return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. + */ +#define ALIGN_MASK 7 +static inline size_t align_val(size_t val) +{ + return (val + ALIGN_MASK) & ~ALIGN_MASK; +} +static inline void *align_ptr(void *ptr) +{ + return (void *)align_val((size_t)ptr); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. + */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, + size_t *len) +{ + param->data_start = align_ptr(param + 1) - (void *) param; + + if (param->data_start < param_size) + *len = param_size - param->data_start; + else + *len = 0; + + return ((void *) param) + param->data_start; +} + +static bool filter_device(struct hash_cell *hc, const char *pfx_name, const char *pfx_uuid) +{ + const char *val; + size_t val_len, pfx_len; + + val = hc->name; + val_len = strlen(val); + pfx_len = strnlen(pfx_name, DM_NAME_LEN); + if (pfx_len > val_len) + return false; + if (memcmp(val, pfx_name, pfx_len)) + return false; + + val = hc->uuid ? hc->uuid : ""; + val_len = strlen(val); + pfx_len = strnlen(pfx_uuid, DM_UUID_LEN); + if (pfx_len > val_len) + return false; + if (memcmp(val, pfx_uuid, pfx_len)) + return false; + + return true; +} + +static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct rb_node *n; + struct hash_cell *hc; + size_t len, needed = 0; + struct gendisk *disk; + struct dm_name_list *orig_nl, *nl, *old_nl = NULL; + uint32_t *event_nr; + + down_write(&_hash_lock); + + /* + * Loop through all the devices working out how much + * space we need. + */ + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + hc = container_of(n, struct hash_cell, name_node); + if (!filter_device(hc, param->name, param->uuid)) + continue; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t) * 2); + if (param->flags & DM_UUID_FLAG && hc->uuid) + needed += align_val(strlen(hc->uuid) + 1); + } + + /* + * Grab our output buffer. + */ + nl = orig_nl = get_result_buffer(param, param_size, &len); + if (len < needed || len < sizeof(nl->dev)) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + nl->dev = 0; /* Flags no data */ + + /* + * Now loop through filling out the names. + */ + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + void *uuid_ptr; + hc = container_of(n, struct hash_cell, name_node); + if (!filter_device(hc, param->name, param->uuid)) + continue; + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + disk = dm_disk(hc->md); + nl->dev = huge_encode_dev(disk_devt(disk)); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); + event_nr[0] = dm_get_event_nr(hc->md); + event_nr[1] = 0; + uuid_ptr = align_ptr(event_nr + 2); + if (param->flags & DM_UUID_FLAG) { + if (hc->uuid) { + event_nr[1] |= DM_NAME_LIST_FLAG_HAS_UUID; + strcpy(uuid_ptr, hc->uuid); + uuid_ptr = align_ptr(uuid_ptr + strlen(hc->uuid) + 1); + } else { + event_nr[1] |= DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID; + } + } + nl = uuid_ptr; + } + /* + * If mismatch happens, security may be compromised due to buffer + * overflow, so it's better to crash. + */ + BUG_ON((char *)nl - (char *)orig_nl != needed); + + out: + up_write(&_hash_lock); + return 0; +} + +static void list_version_get_needed(struct target_type *tt, void *needed_param) +{ + size_t *needed = needed_param; + + *needed += sizeof(struct dm_target_versions); + *needed += strlen(tt->name) + 1; + *needed += ALIGN_MASK; +} + +static void list_version_get_info(struct target_type *tt, void *param) +{ + struct vers_iter *info = param; + + /* Check space - it might have changed since the first iteration */ + if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > + info->end) { + + info->flags = DM_BUFFER_FULL_FLAG; + return; + } + + if (info->old_vers) + info->old_vers->next = (uint32_t) ((void *)info->vers - + (void *)info->old_vers); + info->vers->version[0] = tt->version[0]; + info->vers->version[1] = tt->version[1]; + info->vers->version[2] = tt->version[2]; + info->vers->next = 0; + strcpy(info->vers->name, tt->name); + + info->old_vers = info->vers; + info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); +} + +static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name) +{ + size_t len, needed = 0; + struct dm_target_versions *vers; + struct vers_iter iter_info; + struct target_type *tt = NULL; + + if (name) { + tt = dm_get_target_type(name); + if (!tt) + return -EINVAL; + } + + /* + * Loop through all the devices working out how much + * space we need. + */ + if (!tt) + dm_target_iterate(list_version_get_needed, &needed); + else + list_version_get_needed(tt, &needed); + + /* + * Grab our output buffer. + */ + vers = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + iter_info.param_size = param_size; + iter_info.old_vers = NULL; + iter_info.vers = vers; + iter_info.flags = 0; + iter_info.end = (char *)vers + needed; + + /* + * Now loop through filling out the names & versions. + */ + if (!tt) + dm_target_iterate(list_version_get_info, &iter_info); + else + list_version_get_info(tt, &iter_info); + param->flags |= iter_info.flags; + + out: + if (tt) + dm_put_target_type(tt); + return 0; +} + +static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + return __list_versions(param, param_size, NULL); +} + +static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + return __list_versions(param, param_size, param->name); +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMERR("invalid device name"); + return -EINVAL; + } + + return 0; +} + +/* + * On successful return, the caller must not attempt to acquire + * _hash_lock without first calling dm_put_live_table, because dm_table_destroy + * waits for this dm_put_live_table and could be called under this lock. + */ +static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) +{ + struct hash_cell *hc; + struct dm_table *table = NULL; + + /* increment rcu count, we don't care about the table pointer */ + dm_get_live_table(md, srcu_idx); + + down_read(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMERR("device has been removed from the dev hash table."); + goto out; + } + + table = hc->new_map; + +out: + up_read(&_hash_lock); + + return table; +} + +static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, + struct dm_ioctl *param, + int *srcu_idx) +{ + return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? + dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx); +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ + struct gendisk *disk = dm_disk(md); + struct dm_table *table; + int srcu_idx; + + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | + DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG); + + if (dm_suspended_md(md)) + param->flags |= DM_SUSPEND_FLAG; + + if (dm_suspended_internally_md(md)) + param->flags |= DM_INTERNAL_SUSPEND_FLAG; + + if (dm_test_deferred_remove_flag(md)) + param->flags |= DM_DEFERRED_REMOVE; + + param->dev = huge_encode_dev(disk_devt(disk)); + + /* + * Yes, this will be out of date by the time it gets back + * to userland, but it is still very useful for + * debugging. + */ + param->open_count = dm_open_count(md); + + param->event_nr = dm_get_event_nr(md); + param->target_count = 0; + + table = dm_get_live_table(md, &srcu_idx); + if (table) { + if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { + if (get_disk_ro(disk)) + param->flags |= DM_READONLY_FLAG; + param->target_count = table->num_targets; + } + + param->flags |= DM_ACTIVE_PRESENT_FLAG; + } + dm_put_live_table(md, srcu_idx); + + if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { + int srcu_idx; + table = dm_get_inactive_table(md, &srcu_idx); + if (table) { + if (!(dm_table_get_mode(table) & FMODE_WRITE)) + param->flags |= DM_READONLY_FLAG; + param->target_count = table->num_targets; + } + dm_put_live_table(md, srcu_idx); + } +} + +static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r, m = DM_ANY_MINOR; + struct mapped_device *md; + + r = check_name(param->name); + if (r) + return r; + + if (param->flags & DM_PERSISTENT_DEV_FLAG) + m = MINOR(huge_decode_dev(param->dev)); + + r = dm_create(m, &md); + if (r) + return r; + + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + if (r) { + dm_put(md); + dm_destroy(md); + return r; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + __dev_status(md, param); + + dm_put(md); + + return 0; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ + struct hash_cell *hc = NULL; + + if (*param->uuid) { + if (*param->name || param->dev) { + DMERR("Invalid ioctl structure: uuid %s, name %s, dev %llx", + param->uuid, param->name, (unsigned long long)param->dev); + return NULL; + } + + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) { + DMERR("Invalid ioctl structure: name %s, dev %llx", + param->name, (unsigned long long)param->dev); + return NULL; + } + + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; + + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; +} + +static struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = __find_device_hash_cell(param); + if (hc) + md = hc->md; + up_read(&_hash_lock); + + return md; +} + +static int dev_remove(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + struct mapped_device *md; + int r; + struct dm_table *t; + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + + /* + * Ensure the device is not open and nothing further can open it. + */ + r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false); + if (r) { + if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) { + up_write(&_hash_lock); + dm_put(md); + return 0; + } + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); + up_write(&_hash_lock); + dm_put(md); + return r; + } + + t = __hash_remove(hc); + up_write(&_hash_lock); + + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + + param->flags &= ~DM_DEFERRED_REMOVE; + + dm_ima_measure_on_device_remove(md, false); + + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr, false)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + + dm_put(md); + dm_destroy(md); + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str < end) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r; + char *new_data = (char *) param + param->data_start; + struct mapped_device *md; + unsigned int change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || !*new_data || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMERR("Invalid new mapped device name or uuid string supplied."); + return -EINVAL; + } + + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } + + md = dm_hash_rename(param, new_data); + if (IS_ERR(md)) + return PTR_ERR(md); + + __dev_status(md, param); + dm_put(md); + + return 0; +} + +static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r = -EINVAL, x; + struct mapped_device *md; + struct hd_geometry geometry; + unsigned long indata[4]; + char *geostr = (char *) param + param->data_start; + char dummy; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (geostr < param->data || + invalid_str(geostr, (void *) param + param_size)) { + DMERR("Invalid geometry supplied."); + goto out; + } + + x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, + indata + 1, indata + 2, indata + 3, &dummy); + + if (x != 4) { + DMERR("Unable to interpret geometry settings."); + goto out; + } + + if (indata[0] > 65535 || indata[1] > 255 || + indata[2] > 255 || indata[3] > ULONG_MAX) { + DMERR("Geometry exceeds range limits."); + goto out; + } + + geometry.cylinders = indata[0]; + geometry.heads = indata[1]; + geometry.sectors = indata[2]; + geometry.start = indata[3]; + + r = dm_set_geometry(md, &geometry); + + param->data_size = 0; + +out: + dm_put(md); + return r; +} + +static int do_suspend(struct dm_ioctl *param) +{ + int r = 0; + unsigned int suspend_flags = DM_SUSPEND_LOCKFS_FLAG; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (param->flags & DM_SKIP_LOCKFS_FLAG) + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; + + if (!dm_suspended_md(md)) { + r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } + + __dev_status(md, param); + +out: + dm_put(md); + + return r; +} + +static int do_resume(struct dm_ioctl *param) +{ + int r = 0; + unsigned int suspend_flags = DM_SUSPEND_LOCKFS_FLAG; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *new_map, *old_map = NULL; + bool need_resize_uevent = false; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + + new_map = hc->new_map; + hc->new_map = NULL; + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + up_write(&_hash_lock); + + /* Do we need to load a new map ? */ + if (new_map) { + sector_t old_size, new_size; + + /* Suspend if it isn't already suspended */ + if (param->flags & DM_SKIP_LOCKFS_FLAG) + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; + if (!dm_suspended_md(md)) + dm_suspend(md, suspend_flags); + + old_size = dm_get_size(md); + old_map = dm_swap_table(md, new_map); + if (IS_ERR(old_map)) { + dm_sync_table(md); + dm_table_destroy(new_map); + dm_put(md); + return PTR_ERR(old_map); + } + new_size = dm_get_size(md); + if (old_size && new_size && old_size != new_size) + need_resize_uevent = true; + + if (dm_table_get_mode(new_map) & FMODE_WRITE) + set_disk_ro(dm_disk(md), 0); + else + set_disk_ro(dm_disk(md), 1); + } + + if (dm_suspended_md(md)) { + r = dm_resume(md); + if (!r) { + dm_ima_measure_on_device_resume(md, new_map ? true : false); + + if (!dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr, need_resize_uevent)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + } + } + + /* + * Since dm_swap_table synchronizes RCU, nobody should be in + * read-side critical section already. + */ + if (old_map) + dm_table_destroy(old_map); + + if (!r) + __dev_status(md, param); + + dm_put(md); + return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. + */ +static int dev_suspend(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + if (param->flags & DM_SUSPEND_FLAG) + return do_suspend(param); + + return do_resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. + */ +static int dev_status(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + dm_put(md); + + return 0; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + unsigned int status_flags = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else if (param->flags & DM_IMA_MEASUREMENT_FLAG) + type = STATUSTYPE_IMA; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = table->num_targets; + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + size_t l; + + remaining = len - (outptr - outbuf); + if (remaining <= sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type) - 1); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + if (remaining <= 0) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (param->flags & DM_NOFLUSH_FLAG) + status_flags |= DM_STATUS_NOFLUSH_FLAG; + ti->type->status(ti, type, status_flags, outptr, remaining); + } else + outptr[0] = '\0'; + + l = strlen(outptr) + 1; + if (l == remaining) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + outptr += l; + used = param->data_start + (outptr - outbuf); + + outptr = align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r = 0; + struct mapped_device *md; + struct dm_table *table; + int srcu_idx; + + md = find_device(param); + if (!md) + return -ENXIO; + + /* + * Wait for a notification event + */ + if (dm_wait_event(md, param->event_nr)) { + r = -ERESTARTSYS; + goto out; + } + + /* + * The userland program is going to want to know what + * changed to trigger the event, so we may as well tell + * him and save an ioctl. + */ + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) + retrieve_status(table, param, param_size); + dm_put_live_table(md, srcu_idx); + +out: + dm_put(md); + + return r; +} + +/* + * Remember the global event number and make it possible to poll + * for further events. + */ +static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct dm_file *priv = filp->private_data; + + priv->global_event_nr = atomic_read(&dm_global_event_nr); + + return 0; +} + +static inline fmode_t get_mode(struct dm_ioctl *param) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, + struct dm_target_spec **spec, char **target_params) +{ + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); + *target_params = (char *) (*spec + 1); + + if (*spec < (last + 1)) + return -EINVAL; + + return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + int r; + unsigned int i = 0; + struct dm_target_spec *spec = (struct dm_target_spec *) param; + uint32_t next = param->data_start; + void *end = (void *) param + param_size; + char *target_params; + + if (!param->target_count) { + DMERR("populate_table: no targets specified"); + return -EINVAL; + } + + for (i = 0; i < param->target_count; i++) { + + r = next_target(spec, next, end, &spec, &target_params); + if (r) { + DMERR("unable to find target"); + return r; + } + + r = dm_table_add_target(table, spec->target_type, + (sector_t) spec->sector_start, + (sector_t) spec->length, + target_params); + if (r) { + DMERR("error adding target to table"); + return r; + } + + next = spec->next; + } + + return dm_table_complete(table); +} + +static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) +{ + if (cur == new || + (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) + return true; + + return false; +} + +static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + struct dm_table *t, *old_map = NULL; + struct mapped_device *md; + struct target_type *immutable_target_type; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = dm_table_create(&t, get_mode(param), param->target_count, md); + if (r) + goto err; + + /* Protect md->type and md->queue against concurrent table loads. */ + dm_lock_md_type(md); + r = populate_table(t, param, param_size); + if (r) + goto err_unlock_md_type; + + dm_ima_measure_on_table_load(t, STATUSTYPE_IMA); + + immutable_target_type = dm_get_immutable_target_type(md); + if (immutable_target_type && + (immutable_target_type != dm_table_get_immutable_target_type(t)) && + !dm_table_get_wildcard_target(t)) { + DMERR("can't replace immutable target type %s", + immutable_target_type->name); + r = -EINVAL; + goto err_unlock_md_type; + } + + if (dm_get_md_type(md) == DM_TYPE_NONE) { + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md, t); + if (r) { + DMERR("unable to set up device queue for new table."); + goto err_unlock_md_type; + } + } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) { + DMERR("can't change device type (old=%u vs new=%u) after initial table load.", + dm_get_md_type(md), dm_table_get_type(t)); + r = -EINVAL; + goto err_unlock_md_type; + } + + dm_unlock_md_type(md); + + /* stage inactive table */ + down_write(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMERR("device has been removed from the dev hash table."); + up_write(&_hash_lock); + r = -ENXIO; + goto err_destroy_table; + } + + if (hc->new_map) + old_map = hc->new_map; + hc->new_map = t; + up_write(&_hash_lock); + + param->flags |= DM_INACTIVE_PRESENT_FLAG; + __dev_status(md, param); + + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } + + dm_put(md); + + return 0; + +err_unlock_md_type: + dm_unlock_md_type(md); +err_destroy_table: + dm_table_destroy(t); +err: + dm_put(md); + + return r; +} + +static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *old_map = NULL; + bool has_new_map = false; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) { + old_map = hc->new_map; + hc->new_map = NULL; + has_new_map = true; + } + + md = hc->md; + up_write(&_hash_lock); + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + __dev_status(md, param); + + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } + dm_ima_measure_on_table_clear(md, has_new_map); + dm_put(md); + + return 0; +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static void retrieve_deps(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + unsigned int count = 0; + struct list_head *tmp; + size_t len, needed; + struct dm_dev_internal *dd; + struct dm_target_deps *deps; + + down_read(&table->devices_lock); + + deps = get_result_buffer(param, param_size, &len); + + /* + * Count the devices. + */ + list_for_each(tmp, dm_table_get_devices(table)) + count++; + + /* + * Check we have enough space. + */ + needed = struct_size(deps, dev, count); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + + /* + * Fill in the devices. + */ + deps->count = count; + count = 0; + list_for_each_entry(dd, dm_table_get_devices(table), list) + deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev); + + param->data_size = param->data_start + needed; + +out: + up_read(&table->devices_lock); +} + +static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + struct dm_table *table; + int srcu_idx; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) + retrieve_deps(table, param, param_size); + dm_put_live_table(md, srcu_idx); + + dm_put(md); + + return 0; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + struct dm_table *table; + int srcu_idx; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) + retrieve_status(table, param, param_size); + dm_put_live_table(md, srcu_idx); + + dm_put(md); + + return 0; +} + +/* + * Process device-mapper dependent messages. Messages prefixed with '@' + * are processed by the DM core. All others are delivered to the target. + * Returns a number <= 1 if message was processed by device mapper. + * Returns 2 if message should be delivered to the target. + */ +static int message_for_md(struct mapped_device *md, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r; + + if (**argv != '@') + return 2; /* no '@' prefix, deliver to target */ + + if (!strcasecmp(argv[0], "@cancel_deferred_remove")) { + if (argc != 1) { + DMERR("Invalid arguments for @cancel_deferred_remove"); + return -EINVAL; + } + return dm_cancel_deferred_remove(md); + } + + r = dm_stats_message(md, argc, argv, result, maxlen); + if (r < 2) + return r; + + DMERR("Unsupported message sent to DM core: %s", argv[0]); + return -EINVAL; +} + +/* + * Pass a message to the target that's at the supplied device offset. + */ +static int target_message(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int r, argc; + char **argv; + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + struct dm_target_msg *tmsg = (void *) param + param->data_start; + size_t maxlen; + char *result = get_result_buffer(param, param_size, &maxlen); + int srcu_idx; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (tmsg < (struct dm_target_msg *) param->data || + invalid_str(tmsg->message, (void *) param + param_size)) { + DMERR("Invalid target message parameters."); + r = -EINVAL; + goto out; + } + + r = dm_split_args(&argc, &argv, tmsg->message); + if (r) { + DMERR("Failed to split target message parameters"); + goto out; + } + + if (!argc) { + DMERR("Empty message received."); + r = -EINVAL; + goto out_argv; + } + + r = message_for_md(md, argc, argv, result, maxlen); + if (r <= 1) + goto out_argv; + + table = dm_get_live_table(md, &srcu_idx); + if (!table) + goto out_table; + + if (dm_deleting_md(md)) { + r = -ENXIO; + goto out_table; + } + + ti = dm_table_find_target(table, tmsg->sector); + if (!ti) { + DMERR("Target message sector outside device."); + r = -EINVAL; + } else if (ti->type->message) + r = ti->type->message(ti, argc, argv, result, maxlen); + else { + DMERR("Target type does not support messages"); + r = -EINVAL; + } + + out_table: + dm_put_live_table(md, srcu_idx); + out_argv: + kfree(argv); + out: + if (r >= 0) + __dev_status(md, param); + + if (r == 1) { + param->flags |= DM_DATA_OUT_FLAG; + if (dm_message_test_buffer_overflow(result, maxlen)) + param->flags |= DM_BUFFER_FULL_FLAG; + else + param->data_size = param->data_start + strlen(result) + 1; + r = 0; + } + + dm_put(md); + return r; +} + +/* + * The ioctl parameter block consists of two parts, a dm_ioctl struct + * followed by a data buffer. This flag is set if the second part, + * which has a variable size, is not used by the function processing + * the ioctl. + */ +#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT 2 + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. + *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) +{ + static const struct { + int cmd; + int flags; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all}, + {DM_LIST_DEVICES_CMD, 0, list_devices}, + + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove}, + {DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename}, + {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, + {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, + {DM_DEV_WAIT_CMD, 0, dev_wait}, + + {DM_TABLE_LOAD_CMD, 0, table_load}, + {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear}, + {DM_TABLE_DEPS_CMD, 0, table_deps}, + {DM_TABLE_STATUS_CMD, 0, table_status}, + + {DM_LIST_VERSIONS_CMD, 0, list_versions}, + + {DM_TARGET_MSG_CMD, 0, target_message}, + {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, + {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, + {DM_GET_TARGET_VERSION, 0, get_target_version}, + }; + + if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) + return NULL; + + cmd = array_index_nospec(cmd, ARRAY_SIZE(_ioctls)); + *ioctl_flags = _ioctls[cmd].flags; + return _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(unsigned int cmd, struct dm_ioctl __user *user, + struct dm_ioctl *kernel_params) +{ + int r = 0; + + /* Make certain version is first member of dm_ioctl struct */ + BUILD_BUG_ON(offsetof(struct dm_ioctl, version) != 0); + + if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version))) + return -EFAULT; + + if ((kernel_params->version[0] != DM_VERSION_MAJOR) || + (kernel_params->version[1] > DM_VERSION_MINOR)) { + DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + kernel_params->version[0], + kernel_params->version[1], + kernel_params->version[2], + cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. + */ + kernel_params->version[0] = DM_VERSION_MAJOR; + kernel_params->version[1] = DM_VERSION_MINOR; + kernel_params->version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version))) + return -EFAULT; + + return r; +} + +#define DM_PARAMS_MALLOC 0x0001 /* Params allocated with kvmalloc() */ +#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ + +static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) +{ + if (param_flags & DM_WIPE_BUFFER) + memset(param, 0, param_size); + + if (param_flags & DM_PARAMS_MALLOC) + kvfree(param); +} + +static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, + int ioctl_flags, struct dm_ioctl **param, int *param_flags) +{ + struct dm_ioctl *dmi; + int secure_data; + const size_t minimum_data_size = offsetof(struct dm_ioctl, data); + unsigned int noio_flag; + + /* check_version() already copied version from userspace, avoid TOCTOU */ + if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version), + (char __user *)user + sizeof(param_kernel->version), + minimum_data_size - sizeof(param_kernel->version))) + return -EFAULT; + + if (param_kernel->data_size < minimum_data_size) { + DMERR("Invalid data size in the ioctl structure: %u", + param_kernel->data_size); + return -EINVAL; + } + + secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG; + + *param_flags = secure_data ? DM_WIPE_BUFFER : 0; + + if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) { + dmi = param_kernel; + dmi->data_size = minimum_data_size; + goto data_copied; + } + + /* + * Use __GFP_HIGH to avoid low memory issues when a device is + * suspended and the ioctl is needed to resume it. + * Use kmalloc() rather than vmalloc() when we can. + */ + dmi = NULL; + noio_flag = memalloc_noio_save(); + dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH); + memalloc_noio_restore(noio_flag); + + if (!dmi) { + if (secure_data && clear_user(user, param_kernel->data_size)) + return -EFAULT; + return -ENOMEM; + } + + *param_flags |= DM_PARAMS_MALLOC; + + /* Copy from param_kernel (which was already copied from user) */ + memcpy(dmi, param_kernel, minimum_data_size); + + if (copy_from_user(&dmi->data, (char __user *)user + minimum_data_size, + param_kernel->data_size - minimum_data_size)) + goto bad; +data_copied: + /* Wipe the user buffer so we do not return it to userspace */ + if (secure_data && clear_user(user, param_kernel->data_size)) + goto bad; + + *param = dmi; + return 0; + +bad: + free_params(dmi, param_kernel->data_size, *param_flags); + + return -EFAULT; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Always clear this flag */ + param->flags &= ~DM_BUFFER_FULL_FLAG; + param->flags &= ~DM_UEVENT_GENERATED_FLAG; + param->flags &= ~DM_SECURE_DATA_FLAG; + param->flags &= ~DM_DATA_OUT_FLAG; + + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD || + cmd == DM_LIST_DEVICES_CMD || + cmd == DM_LIST_VERSIONS_CMD) + return 0; + + if (cmd == DM_DEV_CREATE_CMD) { + if (!*param->name) { + DMERR("name not supplied when creating device"); + return -EINVAL; + } + } else if (*param->uuid && *param->name) { + DMERR("only supply one of name or uuid, cmd(%u)", cmd); + return -EINVAL; + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *user) +{ + int r = 0; + int ioctl_flags; + int param_flags; + unsigned int cmd; + struct dm_ioctl *param; + ioctl_fn fn = NULL; + size_t input_param_size; + struct dm_ioctl param_kernel; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user, ¶m_kernel); + if (r) + return r; + + /* + * Nothing more to do for the version command. + */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd, &ioctl_flags); + if (!fn) { + DMERR("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * Copy the parameters into kernel space. + */ + r = copy_params(user, ¶m_kernel, ioctl_flags, ¶m, ¶m_flags); + + if (r) + return r; + + input_param_size = param->data_size; + r = validate_params(cmd, param); + if (r) + goto out; + + param->data_size = offsetof(struct dm_ioctl, data); + r = fn(file, param, input_param_size); + + if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && + unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) + DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + + if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT) + dm_issue_global_event(); + + /* + * Copy the results back to userland. + */ + if (!r && copy_to_user(user, param, param->data_size)) + r = -EFAULT; + +out: + free_params(param, input_param_size, param_flags); + return r; +} + +static long dm_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)ctl_ioctl(file, command, (struct dm_ioctl __user *)u); +} + +#ifdef CONFIG_COMPAT +static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define dm_compat_ctl_ioctl NULL +#endif + +static int dm_open(struct inode *inode, struct file *filp) +{ + int r; + struct dm_file *priv; + + r = nonseekable_open(inode, filp); + if (unlikely(r)) + return r; + + priv = filp->private_data = kmalloc(sizeof(struct dm_file), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->global_event_nr = atomic_read(&dm_global_event_nr); + + return 0; +} + +static int dm_release(struct inode *inode, struct file *filp) +{ + kfree(filp->private_data); + return 0; +} + +static __poll_t dm_poll(struct file *filp, poll_table *wait) +{ + struct dm_file *priv = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &dm_global_eventq, wait); + + if ((int)(atomic_read(&dm_global_event_nr) - priv->global_event_nr) > 0) + mask |= EPOLLIN; + + return mask; +} + +static const struct file_operations _ctl_fops = { + .open = dm_open, + .release = dm_release, + .poll = dm_poll, + .unlocked_ioctl = dm_ctl_ioctl, + .compat_ioctl = dm_compat_ctl_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _dm_misc = { + .minor = MAPPER_CTRL_MINOR, + .name = DM_NAME, + .nodename = DM_DIR "/" DM_CONTROL_NODE, + .fops = &_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ + int r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + return r; + } + + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; +} + +void dm_interface_exit(void) +{ + misc_deregister(&_dm_misc); + dm_hash_exit(); +} + +/** + * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or empty string if uuid not defined + */ +int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) + return -ENXIO; + + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + r = -ENXIO; + goto out; + } + + if (name) + strcpy(name, hc->name); + if (uuid) + strcpy(uuid, hc->uuid ? : ""); + +out: + mutex_unlock(&dm_hash_cells_mutex); + + return r; +} +EXPORT_SYMBOL_GPL(dm_copy_name_and_uuid); + +/** + * dm_early_create - create a mapped device in early boot. + * + * @dmi: Contains main information of the device mapping to be created. + * @spec_array: array of pointers to struct dm_target_spec. Describes the + * mapping table of the device. + * @target_params_array: array of strings with the parameters to a specific + * target. + * + * Instead of having the struct dm_target_spec and the parameters for every + * target embedded at the end of struct dm_ioctl (as performed in a normal + * ioctl), pass them as arguments, so the caller doesn't need to serialize them. + * The size of the spec_array and target_params_array is given by + * @dmi->target_count. + * This function is supposed to be called in early boot, so locking mechanisms + * to protect against concurrent loads are not required. + */ +int __init dm_early_create(struct dm_ioctl *dmi, + struct dm_target_spec **spec_array, + char **target_params_array) +{ + int r, m = DM_ANY_MINOR; + struct dm_table *t, *old_map; + struct mapped_device *md; + unsigned int i; + + if (!dmi->target_count) + return -EINVAL; + + r = check_name(dmi->name); + if (r) + return r; + + if (dmi->flags & DM_PERSISTENT_DEV_FLAG) + m = MINOR(huge_decode_dev(dmi->dev)); + + /* alloc dm device */ + r = dm_create(m, &md); + if (r) + return r; + + /* hash insert */ + r = dm_hash_insert(dmi->name, *dmi->uuid ? dmi->uuid : NULL, md); + if (r) + goto err_destroy_dm; + + /* alloc table */ + r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); + if (r) + goto err_hash_remove; + + /* add targets */ + for (i = 0; i < dmi->target_count; i++) { + r = dm_table_add_target(t, spec_array[i]->target_type, + (sector_t) spec_array[i]->sector_start, + (sector_t) spec_array[i]->length, + target_params_array[i]); + if (r) { + DMERR("error adding target to table"); + goto err_destroy_table; + } + } + + /* finish table */ + r = dm_table_complete(t); + if (r) + goto err_destroy_table; + + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md, t); + if (r) { + DMERR("unable to set up device queue for new table."); + goto err_destroy_table; + } + + /* Set new map */ + dm_suspend(md, 0); + old_map = dm_swap_table(md, t); + if (IS_ERR(old_map)) { + r = PTR_ERR(old_map); + goto err_destroy_table; + } + set_disk_ro(dm_disk(md), !!(dmi->flags & DM_READONLY_FLAG)); + + /* resume device */ + r = dm_resume(md); + if (r) + goto err_destroy_table; + + DMINFO("%s (%s) is ready", md->disk->disk_name, dmi->name); + dm_put(md); + return 0; + +err_destroy_table: + dm_table_destroy(t); +err_hash_remove: + (void) __hash_remove(__get_name_cell(dmi->name)); + /* release reference from __get_name_cell */ + dm_put(md); +err_destroy_dm: + dm_put(md); + dm_destroy(md); + return r; +} diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c new file mode 100644 index 000000000..0ef78e56a --- /dev/null +++ b/drivers/md/dm-kcopyd.c @@ -0,0 +1,988 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * Copyright (C) 2006 Red Hat GmbH + * + * This file is released under the GPL. + * + * Kcopyd provides a simple interface for copying an area of one + * block-device to one or more other block-devices, with an asynchronous + * completion notification. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-core.h" + +#define SPLIT_COUNT 8 +#define MIN_JOBS 8 + +#define DEFAULT_SUB_JOB_SIZE_KB 512 +#define MAX_SUB_JOB_SIZE_KB 1024 + +static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB; + +module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients"); + +static unsigned int dm_get_kcopyd_subjob_size(void) +{ + unsigned int sub_job_size_kb; + + sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb, + DEFAULT_SUB_JOB_SIZE_KB, + MAX_SUB_JOB_SIZE_KB); + + return sub_job_size_kb << 1; +} + +/*----------------------------------------------------------------- + * Each kcopyd client has its own little pool of preallocated + * pages for kcopyd io. + *---------------------------------------------------------------*/ +struct dm_kcopyd_client { + struct page_list *pages; + unsigned int nr_reserved_pages; + unsigned int nr_free_pages; + unsigned int sub_job_size; + + struct dm_io_client *io_client; + + wait_queue_head_t destroyq; + + mempool_t job_pool; + + struct workqueue_struct *kcopyd_wq; + struct work_struct kcopyd_work; + + struct dm_kcopyd_throttle *throttle; + + atomic_t nr_jobs; + +/* + * We maintain four lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that don't need to do any IO and just run a callback + * iv) jobs that have completed. + * + * All four of these are protected by job_lock. + */ + spinlock_t job_lock; + struct list_head callback_jobs; + struct list_head complete_jobs; + struct list_head io_jobs; + struct list_head pages_jobs; +}; + +static struct page_list zero_page_list; + +static DEFINE_SPINLOCK(throttle_spinlock); + +/* + * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period. + * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided + * by 2. + */ +#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ + +/* + * Sleep this number of milliseconds. + * + * The value was decided experimentally. + * Smaller values seem to cause an increased copy rate above the limit. + * The reason for this is unknown but possibly due to jiffies rounding errors + * or read/write cache inside the disk. + */ +#define SLEEP_MSEC 100 + +/* + * Maximum number of sleep events. There is a theoretical livelock if more + * kcopyd clients do work simultaneously which this limit avoids. + */ +#define MAX_SLEEPS 10 + +static void io_job_start(struct dm_kcopyd_throttle *t) +{ + unsigned int throttle, now, difference; + int slept = 0, skew; + + if (unlikely(!t)) + return; + +try_again: + spin_lock_irq(&throttle_spinlock); + + throttle = READ_ONCE(t->throttle); + + if (likely(throttle >= 100)) + goto skip_limit; + + now = jiffies; + difference = now - t->last_jiffies; + t->last_jiffies = now; + if (t->num_io_jobs) + t->io_period += difference; + t->total_period += difference; + + /* + * Maintain sane values if we got a temporary overflow. + */ + if (unlikely(t->io_period > t->total_period)) + t->io_period = t->total_period; + + if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { + int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT); + t->total_period >>= shift; + t->io_period >>= shift; + } + + skew = t->io_period - throttle * t->total_period / 100; + + if (unlikely(skew > 0) && slept < MAX_SLEEPS) { + slept++; + spin_unlock_irq(&throttle_spinlock); + msleep(SLEEP_MSEC); + goto try_again; + } + +skip_limit: + t->num_io_jobs++; + + spin_unlock_irq(&throttle_spinlock); +} + +static void io_job_finish(struct dm_kcopyd_throttle *t) +{ + unsigned long flags; + + if (unlikely(!t)) + return; + + spin_lock_irqsave(&throttle_spinlock, flags); + + t->num_io_jobs--; + + if (likely(READ_ONCE(t->throttle) >= 100)) + goto skip_limit; + + if (!t->num_io_jobs) { + unsigned int now, difference; + + now = jiffies; + difference = now - t->last_jiffies; + t->last_jiffies = now; + + t->io_period += difference; + t->total_period += difference; + + /* + * Maintain sane values if we got a temporary overflow. + */ + if (unlikely(t->io_period > t->total_period)) + t->io_period = t->total_period; + } + +skip_limit: + spin_unlock_irqrestore(&throttle_spinlock, flags); +} + + +static void wake(struct dm_kcopyd_client *kc) +{ + queue_work(kc->kcopyd_wq, &kc->kcopyd_work); +} + +/* + * Obtain one page for the use of kcopyd. + */ +static struct page_list *alloc_pl(gfp_t gfp) +{ + struct page_list *pl; + + pl = kmalloc(sizeof(*pl), gfp); + if (!pl) + return NULL; + + pl->page = alloc_page(gfp | __GFP_HIGHMEM); + if (!pl->page) { + kfree(pl); + return NULL; + } + + return pl; +} + +static void free_pl(struct page_list *pl) +{ + __free_page(pl->page); + kfree(pl); +} + +/* + * Add the provided pages to a client's free page list, releasing + * back to the system any beyond the reserved_pages limit. + */ +static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) +{ + struct page_list *next; + + do { + next = pl->next; + + if (kc->nr_free_pages >= kc->nr_reserved_pages) + free_pl(pl); + else { + pl->next = kc->pages; + kc->pages = pl; + kc->nr_free_pages++; + } + + pl = next; + } while (pl); +} + +static int kcopyd_get_pages(struct dm_kcopyd_client *kc, + unsigned int nr, struct page_list **pages) +{ + struct page_list *pl; + + *pages = NULL; + + do { + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); + if (unlikely(!pl)) { + /* Use reserved pages */ + pl = kc->pages; + if (unlikely(!pl)) + goto out_of_memory; + kc->pages = pl->next; + kc->nr_free_pages--; + } + pl->next = *pages; + *pages = pl; + } while (--nr); + + return 0; + +out_of_memory: + if (*pages) + kcopyd_put_pages(kc, *pages); + return -ENOMEM; +} + +/* + * These three functions resize the page pool. + */ +static void drop_pages(struct page_list *pl) +{ + struct page_list *next; + + while (pl) { + next = pl->next; + free_pl(pl); + pl = next; + } +} + +/* + * Allocate and reserve nr_pages for the use of a specific client. + */ +static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages) +{ + unsigned int i; + struct page_list *pl = NULL, *next; + + for (i = 0; i < nr_pages; i++) { + next = alloc_pl(GFP_KERNEL); + if (!next) { + if (pl) + drop_pages(pl); + return -ENOMEM; + } + next->next = pl; + pl = next; + } + + kc->nr_reserved_pages += nr_pages; + kcopyd_put_pages(kc, pl); + + return 0; +} + +static void client_free_pages(struct dm_kcopyd_client *kc) +{ + BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); + drop_pages(kc->pages); + kc->pages = NULL; + kc->nr_free_pages = kc->nr_reserved_pages = 0; +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a deadlock). + *---------------------------------------------------------------*/ +struct kcopyd_job { + struct dm_kcopyd_client *kc; + struct list_head list; + unsigned int flags; + + /* + * Error state of the job. + */ + int read_err; + unsigned long write_err; + + /* + * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES. + */ + enum req_op op; + struct dm_io_region source; + + /* + * The destinations for the transfer. + */ + unsigned int num_dests; + struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; + + struct page_list *pages; + + /* + * Set this to ensure you are notified when the job has + * completed. 'context' is for callback to use. + */ + dm_kcopyd_notify_fn fn; + void *context; + + /* + * These fields are only used if the job has been split + * into more manageable parts. + */ + struct mutex lock; + atomic_t sub_jobs; + sector_t progress; + sector_t write_offset; + + struct kcopyd_job *master_job; +}; + +static struct kmem_cache *_job_cache; + +int __init dm_kcopyd_init(void) +{ + _job_cache = kmem_cache_create("kcopyd_job", + sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), + __alignof__(struct kcopyd_job), 0, NULL); + if (!_job_cache) + return -ENOMEM; + + zero_page_list.next = &zero_page_list; + zero_page_list.page = ZERO_PAGE(0); + + return 0; +} + +void dm_kcopyd_exit(void) +{ + kmem_cache_destroy(_job_cache); + _job_cache = NULL; +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. + */ +static struct kcopyd_job *pop_io_job(struct list_head *jobs, + struct dm_kcopyd_client *kc) +{ + struct kcopyd_job *job; + + /* + * For I/O jobs, pop any read, any write without sequential write + * constraint and sequential writes that are at the right position. + */ + list_for_each_entry(job, jobs, list) { + if (job->op == REQ_OP_READ || + !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { + list_del(&job->list); + return job; + } + + if (job->write_offset == job->master_job->write_offset) { + job->master_job->write_offset += job->source.count; + list_del(&job->list); + return job; + } + } + + return NULL; +} + +static struct kcopyd_job *pop(struct list_head *jobs, + struct dm_kcopyd_client *kc) +{ + struct kcopyd_job *job = NULL; + + spin_lock_irq(&kc->job_lock); + + if (!list_empty(jobs)) { + if (jobs == &kc->io_jobs) + job = pop_io_job(jobs, kc); + else { + job = list_entry(jobs->next, struct kcopyd_job, list); + list_del(&job->list); + } + } + spin_unlock_irq(&kc->job_lock); + + return job; +} + +static void push(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + struct dm_kcopyd_client *kc = job->kc; + + spin_lock_irqsave(&kc->job_lock, flags); + list_add_tail(&job->list, jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); +} + + +static void push_head(struct list_head *jobs, struct kcopyd_job *job) +{ + struct dm_kcopyd_client *kc = job->kc; + + spin_lock_irq(&kc->job_lock); + list_add(&job->list, jobs); + spin_unlock_irq(&kc->job_lock); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + * 0: success + * > 0: can't process yet. + */ +static int run_complete_job(struct kcopyd_job *job) +{ + void *context = job->context; + int read_err = job->read_err; + unsigned long write_err = job->write_err; + dm_kcopyd_notify_fn fn = job->fn; + struct dm_kcopyd_client *kc = job->kc; + + if (job->pages && job->pages != &zero_page_list) + kcopyd_put_pages(kc, job->pages); + /* + * If this is the master job, the sub jobs have already + * completed so we can free everything. + */ + if (job->master_job == job) { + mutex_destroy(&job->lock); + mempool_free(job, &kc->job_pool); + } + fn(read_err, write_err, context); + + if (atomic_dec_and_test(&kc->nr_jobs)) + wake_up(&kc->destroyq); + + cond_resched(); + + return 0; +} + +static void complete_io(unsigned long error, void *context) +{ + struct kcopyd_job *job = (struct kcopyd_job *) context; + struct dm_kcopyd_client *kc = job->kc; + + io_job_finish(kc->throttle); + + if (error) { + if (op_is_write(job->op)) + job->write_err |= error; + else + job->read_err = 1; + + if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) { + push(&kc->complete_jobs, job); + wake(kc); + return; + } + } + + if (op_is_write(job->op)) + push(&kc->complete_jobs, job); + + else { + job->op = REQ_OP_WRITE; + push(&kc->io_jobs, job); + } + + wake(kc); +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ + int r; + struct dm_io_request io_req = { + .bi_opf = job->op, + .mem.type = DM_IO_PAGE_LIST, + .mem.ptr.pl = job->pages, + .mem.offset = 0, + .notify.fn = complete_io, + .notify.context = job, + .client = job->kc->io_client, + }; + + /* + * If we need to write sequentially and some reads or writes failed, + * no point in continuing. + */ + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && + job->master_job->write_err) { + job->write_err = job->master_job->write_err; + return -EIO; + } + + io_job_start(job->kc->throttle); + + if (job->op == REQ_OP_READ) + r = dm_io(&io_req, 1, &job->source, NULL); + else + r = dm_io(&io_req, job->num_dests, job->dests, NULL); + + return r; +} + +static int run_pages_job(struct kcopyd_job *job) +{ + int r; + unsigned int nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); + + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); + if (!r) { + /* this job is ready for io */ + push(&job->kc->io_jobs, job); + return 0; + } + + if (r == -ENOMEM) + /* can't complete now */ + return 1; + + return r; +} + +/* + * Run through a list for as long as possible. Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, + int (*fn) (struct kcopyd_job *)) +{ + struct kcopyd_job *job; + int r, count = 0; + + while ((job = pop(jobs, kc))) { + + r = fn(job); + + if (r < 0) { + /* error this rogue job */ + if (op_is_write(job->op)) + job->write_err = (unsigned long) -1L; + else + job->read_err = 1; + push(&kc->complete_jobs, job); + wake(kc); + break; + } + + if (r > 0) { + /* + * We couldn't service this job ATM, so + * push this job back onto the list. + */ + push_head(jobs, job); + break; + } + + count++; + } + + return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(struct work_struct *work) +{ + struct dm_kcopyd_client *kc = container_of(work, + struct dm_kcopyd_client, kcopyd_work); + struct blk_plug plug; + + /* + * The order that these are called is *very* important. + * complete jobs can free some pages for pages jobs. + * Pages jobs when successful will jump onto the io jobs + * list. io jobs call wake when they complete and it all + * starts again. + */ + spin_lock_irq(&kc->job_lock); + list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); + spin_unlock_irq(&kc->job_lock); + + blk_start_plug(&plug); + process_jobs(&kc->complete_jobs, kc, run_complete_job); + process_jobs(&kc->pages_jobs, kc, run_pages_job); + process_jobs(&kc->io_jobs, kc, run_io_job); + blk_finish_plug(&plug); +} + +/* + * If we are copying a small region we just dispatch a single job + * to do the copy, otherwise the io has to be split up into many + * jobs. + */ +static void dispatch_job(struct kcopyd_job *job) +{ + struct dm_kcopyd_client *kc = job->kc; + atomic_inc(&kc->nr_jobs); + if (unlikely(!job->source.count)) + push(&kc->callback_jobs, job); + else if (job->pages == &zero_page_list) + push(&kc->io_jobs, job); + else + push(&kc->pages_jobs, job); + wake(kc); +} + +static void segment_complete(int read_err, unsigned long write_err, + void *context) +{ + /* FIXME: tidy this function */ + sector_t progress = 0; + sector_t count = 0; + struct kcopyd_job *sub_job = (struct kcopyd_job *) context; + struct kcopyd_job *job = sub_job->master_job; + struct dm_kcopyd_client *kc = job->kc; + + mutex_lock(&job->lock); + + /* update the error */ + if (read_err) + job->read_err = 1; + + if (write_err) + job->write_err |= write_err; + + /* + * Only dispatch more work if there hasn't been an error. + */ + if ((!job->read_err && !job->write_err) || + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) { + /* get the next chunk of work */ + progress = job->progress; + count = job->source.count - progress; + if (count) { + if (count > kc->sub_job_size) + count = kc->sub_job_size; + + job->progress += count; + } + } + mutex_unlock(&job->lock); + + if (count) { + int i; + + *sub_job = *job; + sub_job->write_offset = progress; + sub_job->source.sector += progress; + sub_job->source.count = count; + + for (i = 0; i < job->num_dests; i++) { + sub_job->dests[i].sector += progress; + sub_job->dests[i].count = count; + } + + sub_job->fn = segment_complete; + sub_job->context = sub_job; + dispatch_job(sub_job); + + } else if (atomic_dec_and_test(&job->sub_jobs)) { + + /* + * Queue the completion callback to the kcopyd thread. + * + * Some callers assume that all the completions are called + * from a single thread and don't race with each other. + * + * We must not call the callback directly here because this + * code may not be executing in the thread. + */ + push(&kc->complete_jobs, job); + wake(kc); + } +} + +/* + * Create some sub jobs to share the work between them. + */ +static void split_job(struct kcopyd_job *master_job) +{ + int i; + + atomic_inc(&master_job->kc->nr_jobs); + + atomic_set(&master_job->sub_jobs, SPLIT_COUNT); + for (i = 0; i < SPLIT_COUNT; i++) { + master_job[i + 1].master_job = master_job; + segment_complete(0, 0u, &master_job[i + 1]); + } +} + +void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + int i; + + /* + * Allocate an array of jobs consisting of one master job + * followed by SPLIT_COUNT sub jobs. + */ + job = mempool_alloc(&kc->job_pool, GFP_NOIO); + mutex_init(&job->lock); + + /* + * set up for the read. + */ + job->kc = kc; + job->flags = flags; + job->read_err = 0; + job->write_err = 0; + + job->num_dests = num_dests; + memcpy(&job->dests, dests, sizeof(*dests) * num_dests); + + /* + * If one of the destination is a host-managed zoned block device, + * we need to write sequentially. If one of the destination is a + * host-aware device, then leave it to the caller to choose what to do. + */ + if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { + for (i = 0; i < job->num_dests; i++) { + if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) { + job->flags |= BIT(DM_KCOPYD_WRITE_SEQ); + break; + } + } + } + + /* + * If we need to write sequentially, errors cannot be ignored. + */ + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) + job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR); + + if (from) { + job->source = *from; + job->pages = NULL; + job->op = REQ_OP_READ; + } else { + memset(&job->source, 0, sizeof job->source); + job->source.count = job->dests[0].count; + job->pages = &zero_page_list; + + /* + * Use WRITE ZEROES to optimize zeroing if all dests support it. + */ + job->op = REQ_OP_WRITE_ZEROES; + for (i = 0; i < job->num_dests; i++) + if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) { + job->op = REQ_OP_WRITE; + break; + } + } + + job->fn = fn; + job->context = context; + job->master_job = job; + job->write_offset = 0; + + if (job->source.count <= kc->sub_job_size) + dispatch_job(job); + else { + job->progress = 0; + split_job(job); + } +} +EXPORT_SYMBOL(dm_kcopyd_copy); + +void dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) +{ + dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); +} +EXPORT_SYMBOL(dm_kcopyd_zero); + +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(&kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + job->master_job = job; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->callback_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + +/* + * Cancels a kcopyd job, eg. someone might be deactivating a + * mirror. + */ +#if 0 +int kcopyd_cancel(struct kcopyd_job *job, int block) +{ + /* FIXME: finish */ + return -1; +} +#endif /* 0 */ + +/*----------------------------------------------------------------- + * Client setup + *---------------------------------------------------------------*/ +struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) +{ + int r; + unsigned int reserve_pages; + struct dm_kcopyd_client *kc; + + kc = kzalloc(sizeof(*kc), GFP_KERNEL); + if (!kc) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&kc->job_lock); + INIT_LIST_HEAD(&kc->callback_jobs); + INIT_LIST_HEAD(&kc->complete_jobs); + INIT_LIST_HEAD(&kc->io_jobs); + INIT_LIST_HEAD(&kc->pages_jobs); + kc->throttle = throttle; + + r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache); + if (r) + goto bad_slab; + + INIT_WORK(&kc->kcopyd_work, do_work); + kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); + if (!kc->kcopyd_wq) { + r = -ENOMEM; + goto bad_workqueue; + } + + kc->sub_job_size = dm_get_kcopyd_subjob_size(); + reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE); + + kc->pages = NULL; + kc->nr_reserved_pages = kc->nr_free_pages = 0; + r = client_reserve_pages(kc, reserve_pages); + if (r) + goto bad_client_pages; + + kc->io_client = dm_io_client_create(); + if (IS_ERR(kc->io_client)) { + r = PTR_ERR(kc->io_client); + goto bad_io_client; + } + + init_waitqueue_head(&kc->destroyq); + atomic_set(&kc->nr_jobs, 0); + + return kc; + +bad_io_client: + client_free_pages(kc); +bad_client_pages: + destroy_workqueue(kc->kcopyd_wq); +bad_workqueue: + mempool_exit(&kc->job_pool); +bad_slab: + kfree(kc); + + return ERR_PTR(r); +} +EXPORT_SYMBOL(dm_kcopyd_client_create); + +void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) +{ + /* Wait for completion of all jobs submitted by this client. */ + wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + + BUG_ON(!list_empty(&kc->callback_jobs)); + BUG_ON(!list_empty(&kc->complete_jobs)); + BUG_ON(!list_empty(&kc->io_jobs)); + BUG_ON(!list_empty(&kc->pages_jobs)); + destroy_workqueue(kc->kcopyd_wq); + dm_io_client_destroy(kc->io_client); + client_free_pages(kc); + mempool_exit(&kc->job_pool); + kfree(kc); +} +EXPORT_SYMBOL(dm_kcopyd_client_destroy); + +void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc) +{ + flush_workqueue(kc->kcopyd_wq); +} +EXPORT_SYMBOL(dm_kcopyd_client_flush); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c new file mode 100644 index 000000000..26b1af646 --- /dev/null +++ b/drivers/md/dm-linear.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "linear" + +/* + * Linear: maps a linear range of a device. + */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct linear_c *lc; + unsigned long long tmp; + char dummy; + int ret; + + if (argc != 2) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "Cannot allocate linear context"; + return -ENOMEM; + } + + ret = -EINVAL; + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device sector"; + goto bad; + } + lc->start = tmp; + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 1; + ti->num_write_zeroes_bios = 1; + ti->private = lc; + return 0; + + bad: + kfree(lc); + return ret; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct linear_c *lc = ti->private; + + return lc->start + dm_target_offset(ti, bi_sector); +} + +static int linear_map(struct dm_target *ti, struct bio *bio) +{ + struct linear_c *lc = ti->private; + + bio_set_dev(bio, lc->dev->bdev); + bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); + + return DM_MAPIO_REMAPPED; +} + +static void linear_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + size_t sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu", lc->dev->name, (unsigned long long)lc->start); + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",device_name=%s,start=%llu;", lc->dev->name, + (unsigned long long)lc->start); + break; + } +} + +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + struct dm_dev *dev = lc->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (lc->start || ti->len != bdev_nr_sectors(dev->bdev)) + return 1; + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int linear_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct linear_c *lc = ti->private; + + return dm_report_zones(lc->dev->bdev, lc->start, + linear_map_sector(ti, args->next_sector), + args, nr_zones); +} +#else +#define linear_report_zones NULL +#endif + +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, ti->len, data); +} + +#if IS_ENABLED(CONFIG_FS_DAX) +static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) +{ + struct linear_c *lc = ti->private; + sector_t sector = linear_map_sector(ti, *pgoff << PAGE_SECTORS_SHIFT); + + *pgoff = (get_start_sect(lc->dev->bdev) + sector) >> PAGE_SECTORS_SHIFT; + return lc->dev->dax_dev; +} + +static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); +} + +static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, + size_t nr_pages) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_zero_page_range(dax_dev, pgoff, nr_pages); +} + +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + +#else +#define linear_dax_direct_access NULL +#define linear_dax_zero_page_range NULL +#define linear_dax_recovery_write NULL +#endif + +static struct target_type linear_target = { + .name = "linear", + .version = {1, 4, 0}, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | + DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, + .report_zones = linear_report_zones, + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, + .prepare_ioctl = linear_prepare_ioctl, + .iterate_devices = linear_iterate_devices, + .direct_access = linear_dax_direct_access, + .dax_zero_page_range = linear_dax_zero_page_range, + .dax_recovery_write = linear_dax_recovery_write, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +void dm_linear_exit(void) +{ + dm_unregister_target(&linear_target); +} diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000..9fc693826 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c @@ -0,0 +1,937 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-log-userspace-transfer.h" + +#define DM_LOG_USERSPACE_VSN "1.3.0" + +#define FLUSH_ENTRY_POOL_SIZE 16 + +struct dm_dirty_log_flush_entry { + int type; + region_t region; + struct list_head list; +}; + +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + +struct log_c { + struct dm_target *ti; + struct dm_dev *log_dev; + + char *usr_argv_str; + uint32_t usr_argc; + + uint32_t region_size; + region_t region_count; + uint64_t luid; + char uuid[DM_UUID_LEN]; + + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ + spinlock_t flush_lock; + struct list_head mark_list; + struct list_head clear_list; + + /* + * in_sync_hint gets set when doing is_remote_recovering. It + * represents the first region that needs recovery. IOW, the + * first zero bit of sync_bits. This can be useful for to limit + * traffic for calls like is_remote_recovering and get_resync_work, + * but be take care in its use for anything else. + */ + uint64_t in_sync_hint; + + /* + * Workqueue for flush of clear region requests. + */ + struct workqueue_struct *dmlog_wq; + struct delayed_work flush_log_work; + atomic_t sched_flush; + + /* + * Combine userspace flush and mark requests for efficiency. + */ + uint32_t integrated_flush; + + mempool_t flush_entry_pool; +}; + +static struct kmem_cache *_flush_entry_cache; + +static int userspace_do_request(struct log_c *lc, const char *uuid, + int request_type, char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r; + + /* + * If the server isn't there, -ESRCH is returned, + * and we must keep trying until the server is + * restored. + */ +retry: + r = dm_consult_userspace(uuid, lc->luid, request_type, data, + data_size, rdata, rdata_size); + + if (r != -ESRCH) + return r; + + DMERR(" Userspace log server not found."); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2*HZ); + DMWARN("Attempting to contact userspace log server..."); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, + lc->usr_argv_str, + strlen(lc->usr_argv_str) + 1, + NULL, NULL); + if (!r) + break; + } + DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, + 0, NULL, NULL); + if (!r) + goto retry; + + DMERR("Error trying to resume userspace log: %d", r); + + return -ESRCH; +} + +static int build_constructor_string(struct dm_target *ti, + unsigned int argc, char **argv, + char **ctr_str) +{ + int i, str_size; + char *str = NULL; + + *ctr_str = NULL; + + /* + * Determine overall size of the string. + */ + for (i = 0, str_size = 0; i < argc; i++) + str_size += strlen(argv[i]) + 1; /* +1 for space between args */ + + str_size += 20; /* Max number of chars in a printed u64 number */ + + str = kzalloc(str_size, GFP_KERNEL); + if (!str) { + DMWARN("Unable to allocate memory for constructor string"); + return -ENOMEM; + } + + str_size = sprintf(str, "%llu", (unsigned long long)ti->len); + for (i = 0; i < argc; i++) + str_size += sprintf(str + str_size, " %s", argv[i]); + + *ctr_str = str; + return str_size; +} + +static void do_flush(struct work_struct *work) +{ + int r; + struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); + + atomic_set(&lc->sched_flush, 0); + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); + + if (r) + dm_table_event(lc->ti->table); +} + +/* + * userspace_ctr + * + * argv contains: + * [integrated_flush] + * Where 'other args' are the userspace implementation-specific log + * arguments. + * + * Example: + * [integrated_flush] clustered-disk + * [[no]sync] + * + * This module strips off the and uses it for identification + * purposes when communicating with userspace about a log. + * + * If integrated_flush is defined, the kernel combines flush + * and mark requests. + * + * The rest of the line, beginning with 'clustered-disk', is passed + * to the userspace ctr function. + */ +static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + int r = 0; + int str_size; + char *ctr_str = NULL; + struct log_c *lc = NULL; + uint64_t rdata; + size_t rdata_size = sizeof(rdata); + char *devices_rdata = NULL; + size_t devices_rdata_size = DM_NAME_LEN; + + if (argc < 3) { + DMWARN("Too few arguments to userspace dirty log"); + return -EINVAL; + } + + lc = kzalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("Unable to allocate userspace log context."); + return -ENOMEM; + } + + /* The ptr value is sufficient for local unique id */ + lc->luid = (unsigned long)lc; + + lc->ti = ti; + + if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { + DMWARN("UUID argument too long."); + kfree(lc); + return -EINVAL; + } + + lc->usr_argc = argc; + + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + argc--; + argv++; + spin_lock_init(&lc->flush_lock); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); + + if (!strcasecmp(argv[0], "integrated_flush")) { + lc->integrated_flush = 1; + argc--; + argv++; + } + + str_size = build_constructor_string(ti, argc, argv, &ctr_str); + if (str_size < 0) { + kfree(lc); + return str_size; + } + + devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); + if (!devices_rdata) { + DMERR("Failed to allocate memory for device information"); + r = -ENOMEM; + goto out; + } + + r = mempool_init_slab_pool(&lc->flush_entry_pool, FLUSH_ENTRY_POOL_SIZE, + _flush_entry_cache); + if (r) { + DMERR("Failed to create flush_entry_pool"); + goto out; + } + + /* + * Send table string and get back any opened device. + */ + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, + ctr_str, str_size, + devices_rdata, &devices_rdata_size); + + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); + goto out; + } + + /* Since the region size does not change, get it now */ + rdata_size = sizeof(rdata); + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, + NULL, 0, (char *)&rdata, &rdata_size); + + if (r) { + DMERR("Failed to get region size of dirty log"); + goto out; + } + + lc->region_size = (uint32_t)rdata; + lc->region_count = dm_sector_div_up(ti->len, lc->region_size); + + if (devices_rdata_size) { + if (devices_rdata[devices_rdata_size - 1] != '\0') { + DMERR("DM_ULOG_CTR device return string not properly terminated"); + r = -EINVAL; + goto out; + } + r = dm_get_device(ti, devices_rdata, + dm_table_get_mode(ti->table), &lc->log_dev); + if (r) + DMERR("Failed to register %s with device-mapper", + devices_rdata); + } + + if (lc->integrated_flush) { + lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); + if (!lc->dmlog_wq) { + DMERR("couldn't start dmlogd"); + r = -ENOMEM; + goto out; + } + + INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); + atomic_set(&lc->sched_flush, 0); + } + +out: + kfree(devices_rdata); + if (r) { + mempool_exit(&lc->flush_entry_pool); + kfree(lc); + kfree(ctr_str); + } else { + lc->usr_argv_str = ctr_str; + log->context = lc; + } + + return r; +} + +static void userspace_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + if (lc->integrated_flush) { + /* flush workqueue */ + if (atomic_read(&lc->sched_flush)) + flush_delayed_work(&lc->flush_log_work); + + destroy_workqueue(lc->dmlog_wq); + } + + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + NULL, 0, NULL, NULL); + + if (lc->log_dev) + dm_put_device(lc->ti, lc->log_dev); + + mempool_exit(&lc->flush_entry_pool); + + kfree(lc->usr_argv_str); + kfree(lc); + + return; +} + +static int userspace_presuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, + NULL, 0, NULL, NULL); + + return r; +} + +static int userspace_postsuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + /* + * Run planned flush earlier. + */ + if (lc->integrated_flush && atomic_read(&lc->sched_flush)) + flush_delayed_work(&lc->flush_log_work); + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, + NULL, 0, NULL, NULL); + + return r; +} + +static int userspace_resume(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + lc->in_sync_hint = 0; + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, + NULL, 0, NULL, NULL); + + return r; +} + +static uint32_t userspace_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->region_size; +} + +/* + * userspace_is_clean + * + * Check whether a region is clean. If there is any sort of + * failure when consulting the server, we return not clean. + * + * Returns: 1 if clean, 0 otherwise + */ +static int userspace_is_clean(struct dm_dirty_log *log, region_t region) +{ + int r; + uint64_t region64 = (uint64_t)region; + int64_t is_clean; + size_t rdata_size; + struct log_c *lc = log->context; + + rdata_size = sizeof(is_clean); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, + (char *)®ion64, sizeof(region64), + (char *)&is_clean, &rdata_size); + + return (r) ? 0 : (int)is_clean; +} + +/* + * userspace_in_sync + * + * Check if the region is in-sync. If there is any sort + * of failure when consulting the server, we assume that + * the region is not in sync. + * + * If 'can_block' is set, return immediately + * + * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK + */ +static int userspace_in_sync(struct dm_dirty_log *log, region_t region, + int can_block) +{ + int r; + uint64_t region64 = region; + int64_t in_sync; + size_t rdata_size; + struct log_c *lc = log->context; + + /* + * We can never respond directly - even if in_sync_hint is + * set. This is because another machine could see a device + * failure and mark the region out-of-sync. If we don't go + * to userspace to ask, we might think the region is in-sync + * and allow a read to pick up data that is stale. (This is + * very unlikely if a device actually fails; but it is very + * likely if a connection to one device from one machine fails.) + * + * There still might be a problem if the mirror caches the region + * state as in-sync... but then this call would not be made. So, + * that is a mirror problem. + */ + if (!can_block) + return -EWOULDBLOCK; + + rdata_size = sizeof(in_sync); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, + (char *)®ion64, sizeof(region64), + (char *)&in_sync, &rdata_size); + return (r) ? 0 : (int)in_sync; +} + +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct dm_dirty_log_flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list, + int flush_with_payload) +{ + int r = 0; + int count; + uint32_t type = 0; + struct dm_dirty_log_flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_move(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + if (flush_with_payload) { + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + /* + * Integrated flush failed. + */ + if (r) + break; + } else { + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* + * Group send failed. Attempt one-by-one. + */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + +/* + * userspace_flush + * + * This function is ok to block. + * The flush happens in two stages. First, it sends all + * clear/mark requests that are on the list. Then it + * tells the server to commit them. This gives the + * server a chance to optimise the commit, instead of + * doing it for every request. + * + * Additionally, we could implement another thread that + * sends the requests up to the server - reducing the + * load on flush. Then the flush would have less in + * the list and be responsible for the finishing commit. + * + * Returns: 0 on success, < 0 on failure + */ +static int userspace_flush(struct dm_dirty_log *log) +{ + int r = 0; + unsigned long flags; + struct log_c *lc = log->context; + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); + int mark_list_is_empty; + int clear_list_is_empty; + struct dm_dirty_log_flush_entry *fe, *tmp_fe; + mempool_t *flush_entry_pool = &lc->flush_entry_pool; + + spin_lock_irqsave(&lc->flush_lock, flags); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + mark_list_is_empty = list_empty(&mark_list); + clear_list_is_empty = list_empty(&clear_list); + + if (mark_list_is_empty && clear_list_is_empty) + return 0; + + r = flush_by_group(lc, &clear_list, 0); + if (r) + goto out; + + if (!lc->integrated_flush) { + r = flush_by_group(lc, &mark_list, 0); + if (r) + goto out; + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + NULL, 0, NULL, NULL); + goto out; + } + + /* + * Send integrated flush request with mark_list as payload. + */ + r = flush_by_group(lc, &mark_list, 1); + if (r) + goto out; + + if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) { + /* + * When there are only clear region requests, + * we schedule a flush in the future. + */ + queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ); + atomic_set(&lc->sched_flush, 1); + } else { + /* + * Cancel pending flush because we + * have already flushed in mark_region. + */ + cancel_delayed_work(&lc->flush_log_work); + atomic_set(&lc->sched_flush, 0); + } + +out: + /* + * We can safely remove these entries, even after failure. + * Calling code will receive an error and will know that + * the log facility has failed. + */ + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + + if (r) + dm_table_event(lc->ti->table); + + return r; +} + +/* + * userspace_mark_region + * + * This function should avoid blocking unless absolutely required. + * (Memory allocation is valid for blocking.) + */ +static void userspace_mark_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct dm_dirty_log_flush_entry *fe; + + /* Wait for an allocation, but _never_ fail */ + fe = mempool_alloc(&lc->flush_entry_pool, GFP_NOIO); + BUG_ON(!fe); + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_MARK_REGION; + fe->region = region; + list_add(&fe->list, &lc->mark_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_clear_region + * + * This function must not block. + * So, the alloc can't block. In the worst case, it is ok to + * fail. It would simply mean we can't clear the region. + * Does nothing to current sync context, but does mean + * the region will be re-sync'ed on a reload of the mirror + * even though it is in-sync. + */ +static void userspace_clear_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct dm_dirty_log_flush_entry *fe; + + /* + * If we fail to allocate, we skip the clearing of + * the region. This doesn't hurt us in any way, except + * to cause the region to be resync'ed when the + * device is activated next time. + */ + fe = mempool_alloc(&lc->flush_entry_pool, GFP_ATOMIC); + if (!fe) { + DMERR("Failed to allocate memory to clear region."); + return; + } + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_CLEAR_REGION; + fe->region = region; + list_add(&fe->list, &lc->clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_get_resync_work + * + * Get a region that needs recovery. It is valid to return + * an error for this function. + * + * Returns: 1 if region filled, 0 if no work, <0 on error + */ +static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + int r; + size_t rdata_size; + struct log_c *lc = log->context; + struct { + int64_t i; /* 64-bit for mix arch compatibility */ + region_t r; + } pkg; + + if (lc->in_sync_hint >= lc->region_count) + return 0; + + rdata_size = sizeof(pkg); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, + NULL, 0, (char *)&pkg, &rdata_size); + + *region = pkg.r; + return (r) ? r : (int)pkg.i; +} + +/* + * userspace_set_region_sync + * + * Set the sync status of a given region. This function + * must not fail. + */ +static void userspace_set_region_sync(struct dm_dirty_log *log, + region_t region, int in_sync) +{ + struct log_c *lc = log->context; + struct { + region_t r; + int64_t i; + } pkg; + + pkg.r = region; + pkg.i = (int64_t)in_sync; + + (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), NULL, NULL); + + /* + * It would be nice to be able to report failures. + * However, it is easy enough to detect and resolve. + */ + return; +} + +/* + * userspace_get_sync_count + * + * If there is any sort of failure when consulting the server, + * we assume that the sync count is zero. + * + * Returns: sync count on success, 0 on failure + */ +static region_t userspace_get_sync_count(struct dm_dirty_log *log) +{ + int r; + size_t rdata_size; + uint64_t sync_count; + struct log_c *lc = log->context; + + rdata_size = sizeof(sync_count); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, + NULL, 0, (char *)&sync_count, &rdata_size); + + if (r) + return 0; + + if (sync_count >= lc->region_count) + lc->in_sync_hint = lc->region_count; + + return (region_t)sync_count; +} + +/* + * userspace_status + * + * Returns: amount of space consumed + */ +static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, + char *result, unsigned int maxlen) +{ + int r = 0; + char *table_args; + size_t sz = (size_t)maxlen; + struct log_c *lc = log->context; + + switch (status_type) { + case STATUSTYPE_INFO: + r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, + NULL, 0, result, &sz); + + if (r) { + sz = 0; + DMEMIT("%s 1 COM_FAILURE", log->type->name); + } + break; + case STATUSTYPE_TABLE: + sz = 0; + table_args = strchr(lc->usr_argv_str, ' '); + BUG_ON(!table_args); /* There will always be a ' ' */ + table_args++; + + DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid); + if (lc->integrated_flush) + DMEMIT("integrated_flush "); + DMEMIT("%s ", table_args); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + return (r) ? 0 : (int)sz; +} + +/* + * userspace_is_remote_recovering + * + * Returns: 1 if region recovering, 0 otherwise + */ +static int userspace_is_remote_recovering(struct dm_dirty_log *log, + region_t region) +{ + int r; + uint64_t region64 = region; + struct log_c *lc = log->context; + static unsigned long limit; + struct { + int64_t is_recovering; + uint64_t in_sync_hint; + } pkg; + size_t rdata_size = sizeof(pkg); + + /* + * Once the mirror has been reported to be in-sync, + * it will never again ask for recovery work. So, + * we can safely say there is not a remote machine + * recovering if the device is in-sync. (in_sync_hint + * must be reset at resume time.) + */ + if (region < lc->in_sync_hint) + return 0; + else if (time_after(limit, jiffies)) + return 1; + + limit = jiffies + (HZ / 4); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, + (char *)®ion64, sizeof(region64), + (char *)&pkg, &rdata_size); + if (r) + return 1; + + lc->in_sync_hint = pkg.in_sync_hint; + + return (int)pkg.is_recovering; +} + +static struct dm_dirty_log_type _userspace_type = { + .name = "userspace", + .module = THIS_MODULE, + .ctr = userspace_ctr, + .dtr = userspace_dtr, + .presuspend = userspace_presuspend, + .postsuspend = userspace_postsuspend, + .resume = userspace_resume, + .get_region_size = userspace_get_region_size, + .is_clean = userspace_is_clean, + .in_sync = userspace_in_sync, + .flush = userspace_flush, + .mark_region = userspace_mark_region, + .clear_region = userspace_clear_region, + .get_resync_work = userspace_get_resync_work, + .set_region_sync = userspace_set_region_sync, + .get_sync_count = userspace_get_sync_count, + .status = userspace_status, + .is_remote_recovering = userspace_is_remote_recovering, +}; + +static int __init userspace_dirty_log_init(void) +{ + int r = 0; + + _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0); + if (!_flush_entry_cache) { + DMWARN("Unable to create flush_entry_cache: No memory."); + return -ENOMEM; + } + + r = dm_ulog_tfr_init(); + if (r) { + DMWARN("Unable to initialize userspace log communications"); + kmem_cache_destroy(_flush_entry_cache); + return r; + } + + r = dm_dirty_log_type_register(&_userspace_type); + if (r) { + DMWARN("Couldn't register userspace dirty log type"); + dm_ulog_tfr_exit(); + kmem_cache_destroy(_flush_entry_cache); + return r; + } + + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); + return 0; +} + +static void __exit userspace_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_userspace_type); + dm_ulog_tfr_exit(); + kmem_cache_destroy(_flush_entry_cache); + + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); + return; +} + +module_init(userspace_dirty_log_init); +module_exit(userspace_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); +MODULE_AUTHOR("Jonathan Brassow "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000..ee5586e8e --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c @@ -0,0 +1,286 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-log-userspace-transfer.h" + +static uint32_t dm_ulog_seq; + +/* + * Netlink/Connector is an unreliable protocol. How long should + * we wait for a response before assuming it was lost and retrying? + * (If we do receive a response after this time, it will be discarded + * and the response to the resent request will be waited for. + */ +#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) + +/* + * Pre-allocated space for speed + */ +#define DM_ULOG_PREALLOCED_SIZE 512 +static struct cn_msg *prealloced_cn_msg; +static struct dm_ulog_request *prealloced_ulog_tfr; + +static struct cb_id ulog_cn_id = { + .idx = CN_IDX_DM, + .val = CN_VAL_DM_USERSPACE_LOG +}; + +static DEFINE_MUTEX(dm_ulog_lock); + +struct receiving_pkg { + struct list_head list; + struct completion complete; + + uint32_t seq; + + int error; + size_t *data_size; + char *data; +}; + +static DEFINE_SPINLOCK(receiving_list_lock); +static struct list_head receiving_list; + +static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) +{ + int r; + struct cn_msg *msg = prealloced_cn_msg; + + memset(msg, 0, sizeof(struct cn_msg)); + + msg->id.idx = ulog_cn_id.idx; + msg->id.val = ulog_cn_id.val; + msg->ack = 0; + msg->seq = tfr->seq; + msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; + + r = cn_netlink_send(msg, 0, 0, gfp_any()); + + return r; +} + +/* + * Parameters for this function can be either msg or tfr, but not + * both. This function fills in the reply for a waiting request. + * If just msg is given, then the reply is simply an ACK from userspace + * that the request was received. + * + * Returns: 0 on success, -ENOENT on failure + */ +static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) +{ + uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; + struct receiving_pkg *pkg; + + /* + * The 'receiving_pkg' entries in this list are statically + * allocated on the stack in 'dm_consult_userspace'. + * Each process that is waiting for a reply from the user + * space server will have an entry in this list. + * + * We are safe to do it this way because the stack space + * is unique to each process, but still addressable by + * other processes. + */ + list_for_each_entry(pkg, &receiving_list, list) { + if (rtn_seq != pkg->seq) + continue; + + if (msg) { + pkg->error = -msg->ack; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. + */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; + } else if (tfr->data_size > *(pkg->data_size)) { + DMERR("Insufficient space to receive package [%u] (%u vs %zu)", + tfr->request_type, tfr->data_size, *(pkg->data_size)); + + *(pkg->data_size) = 0; + pkg->error = -ENOSPC; + } else { + pkg->error = tfr->error; + memcpy(pkg->data, tfr->data, tfr->data_size); + *(pkg->data_size) = tfr->data_size; + } + complete(&pkg->complete); + return 0; + } + + return -ENOENT; +} + +/* + * This is the connector callback that delivers data + * that was sent from userspace. + */ +static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); + + if (!capable(CAP_SYS_ADMIN)) + return; + + spin_lock(&receiving_list_lock); + if (msg->len == 0) + fill_pkg(msg, NULL); + else if (msg->len < sizeof(*tfr)) + DMERR("Incomplete message received (expected %u, got %u): [%u]", + (unsigned int)sizeof(*tfr), msg->len, msg->seq); + else + fill_pkg(NULL, tfr); + spin_unlock(&receiving_list_lock); +} + +/** + * dm_consult_userspace + * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size) + * @luid: log's local unique identifier + * @request_type: found in include/linux/dm-log-userspace.h + * @data: data to tx to the server + * @data_size: size of data in bytes + * @rdata: place to put return data from server + * @rdata_size: value-result (amount of space given/amount of space used) + * + * rdata_size is undefined on failure. + * + * Memory used to communicate with userspace is zero'ed + * before populating to ensure that no unwanted bits leak + * from kernel space to user-space. All userspace log communications + * between kernel and user space go through this function. + * + * Returns: 0 on success, -EXXX on failure + **/ +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r = 0; + unsigned long tmo; + size_t dummy = 0; + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); + struct dm_ulog_request *tfr = prealloced_ulog_tfr; + struct receiving_pkg pkg; + + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ + if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { + DMINFO("Size of tfr exceeds preallocated size"); + return -EINVAL; + } + + if (!rdata_size) + rdata_size = &dummy; +resend: + /* + * We serialize the sending of requests so we can + * use the preallocated space. + */ + mutex_lock(&dm_ulog_lock); + + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); + memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; + tfr->luid = luid; + tfr->seq = dm_ulog_seq++; + + /* + * Must be valid request type (all other bits set to + * zero). This reserves other bits for possible future + * use. + */ + tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; + + tfr->data_size = data_size; + if (data && data_size) + memcpy(tfr->data, data, data_size); + + memset(&pkg, 0, sizeof(pkg)); + init_completion(&pkg.complete); + pkg.seq = tfr->seq; + pkg.data_size = rdata_size; + pkg.data = rdata; + spin_lock(&receiving_list_lock); + list_add(&(pkg.list), &receiving_list); + spin_unlock(&receiving_list_lock); + + r = dm_ulog_sendto_server(tfr); + + mutex_unlock(&dm_ulog_lock); + + if (r) { + DMERR("Unable to send log request [%u] to userspace: %d", + request_type, r); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + + goto out; + } + + tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + if (!tmo) { + DMWARN("[%s] Request timed out: [%u/%u] - retrying", + (strlen(uuid) > 8) ? + (uuid + (strlen(uuid) - 8)) : (uuid), + request_type, pkg.seq); + goto resend; + } + + r = pkg.error; + if (r == -EAGAIN) + goto resend; + +out: + return r; +} + +int dm_ulog_tfr_init(void) +{ + int r; + void *prealloced; + + INIT_LIST_HEAD(&receiving_list); + + prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); + if (!prealloced) + return -ENOMEM; + + prealloced_cn_msg = prealloced; + prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { + kfree(prealloced_cn_msg); + return r; + } + + return 0; +} + +void dm_ulog_tfr_exit(void) +{ + cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); +} diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000..04ee874f9 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ +#define __DM_LOG_USERSPACE_TRANSFER_H__ + +#define DM_MSG_PREFIX "dm-log-userspace" + +int dm_ulog_tfr_init(void); +void dm_ulog_tfr_exit(void); +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size); + +#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c new file mode 100644 index 000000000..efdfb2e18 --- /dev/null +++ b/drivers/md/dm-log-writes.c @@ -0,0 +1,960 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "log-writes" + +/* + * This target will sequentially log all writes to the target device onto the + * log device. This is helpful for replaying writes to check for fs consistency + * at all times. This target provides a mechanism to mark specific events to + * check data at a later time. So for example you would: + * + * write data + * fsync + * dmsetup message /dev/whatever mark mymark + * unmount /mnt/test + * + * Then replay the log up to mymark and check the contents of the replay to + * verify it matches what was written. + * + * We log writes only after they have been flushed, this makes the log describe + * close to the order in which the data hits the actual disk, not its cache. So + * for example the following sequence (W means write, C means complete) + * + * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd + * + * Would result in the log looking like this: + * + * c,a,b,flush,fuad,, + * + * This is meant to help expose problems where file systems do not properly wait + * on data being written before invoking a FLUSH. FUA bypasses cache so once it + * completes it is added to the log as it should be on disk. + * + * We treat DISCARDs as if they don't bypass cache so that they are logged in + * order of completion along with the normal writes. If we didn't do it this + * way we would process all the discards first and then write all the data, when + * in fact we want to do the data and the discard in the order that they + * completed. + */ +#define LOG_FLUSH_FLAG (1 << 0) +#define LOG_FUA_FLAG (1 << 1) +#define LOG_DISCARD_FLAG (1 << 2) +#define LOG_MARK_FLAG (1 << 3) +#define LOG_METADATA_FLAG (1 << 4) + +#define WRITE_LOG_VERSION 1ULL +#define WRITE_LOG_MAGIC 0x6a736677736872ULL +#define WRITE_LOG_SUPER_SECTOR 0 + +/* + * The disk format for this is braindead simple. + * + * At byte 0 we have our super, followed by the following sequence for + * nr_entries: + * + * [ 1 sector ][ entry->nr_sectors ] + * [log_write_entry][ data written ] + * + * The log_write_entry takes up a full sector so we can have arbitrary length + * marks and it leaves us room for extra content in the future. + */ + +/* + * Basic info about the log for userspace. + */ +struct log_write_super { + __le64 magic; + __le64 version; + __le64 nr_entries; + __le32 sectorsize; +}; + +/* + * sector - the sector we wrote. + * nr_sectors - the number of sectors we wrote. + * flags - flags for this log entry. + * data_len - the size of the data in this log entry, this is for private log + * entry stuff, the MARK data provided by userspace for example. + */ +struct log_write_entry { + __le64 sector; + __le64 nr_sectors; + __le64 flags; + __le64 data_len; +}; + +struct log_writes_c { + struct dm_dev *dev; + struct dm_dev *logdev; + u64 logged_entries; + u32 sectorsize; + u32 sectorshift; + atomic_t io_blocks; + atomic_t pending_blocks; + sector_t next_sector; + sector_t end_sector; + bool logging_enabled; + bool device_supports_discard; + spinlock_t blocks_lock; + struct list_head unflushed_blocks; + struct list_head logging_blocks; + wait_queue_head_t wait; + struct task_struct *log_kthread; + struct completion super_done; +}; + +struct pending_block { + int vec_cnt; + u64 flags; + sector_t sector; + sector_t nr_sectors; + char *data; + u32 datalen; + struct list_head list; + struct bio_vec vecs[]; +}; + +struct per_bio_data { + struct pending_block *block; +}; + +static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc, + sector_t sectors) +{ + return sectors >> (lc->sectorshift - SECTOR_SHIFT); +} + +static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc, + sector_t sectors) +{ + return sectors << (lc->sectorshift - SECTOR_SHIFT); +} + +static void put_pending_block(struct log_writes_c *lc) +{ + if (atomic_dec_and_test(&lc->pending_blocks)) { + smp_mb__after_atomic(); + if (waitqueue_active(&lc->wait)) + wake_up(&lc->wait); + } +} + +static void put_io_block(struct log_writes_c *lc) +{ + if (atomic_dec_and_test(&lc->io_blocks)) { + smp_mb__after_atomic(); + if (waitqueue_active(&lc->wait)) + wake_up(&lc->wait); + } +} + +static void log_end_io(struct bio *bio) +{ + struct log_writes_c *lc = bio->bi_private; + + if (bio->bi_status) { + unsigned long flags; + + DMERR("Error writing log block, error=%d", bio->bi_status); + spin_lock_irqsave(&lc->blocks_lock, flags); + lc->logging_enabled = false; + spin_unlock_irqrestore(&lc->blocks_lock, flags); + } + + bio_free_pages(bio); + put_io_block(lc); + bio_put(bio); +} + +static void log_end_super(struct bio *bio) +{ + struct log_writes_c *lc = bio->bi_private; + + complete(&lc->super_done); + log_end_io(bio); +} + +/* + * Meant to be called if there is an error, it will free all the pages + * associated with the block. + */ +static void free_pending_block(struct log_writes_c *lc, + struct pending_block *block) +{ + int i; + + for (i = 0; i < block->vec_cnt; i++) { + if (block->vecs[i].bv_page) + __free_page(block->vecs[i].bv_page); + } + kfree(block->data); + kfree(block); + put_pending_block(lc); +} + +static int write_metadata(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + struct bio *bio; + struct page *page; + void *ptr; + size_t ret; + + bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL); + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? + log_end_super : log_end_io; + bio->bi_private = lc; + + page = alloc_page(GFP_KERNEL); + if (!page) { + DMERR("Couldn't alloc log page"); + bio_put(bio); + goto error; + } + + ptr = kmap_atomic(page); + memcpy(ptr, entry, entrylen); + if (datalen) + memcpy(ptr + entrylen, data, datalen); + memset(ptr + entrylen + datalen, 0, + lc->sectorsize - entrylen - datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, lc->sectorsize, 0); + if (ret != lc->sectorsize) { + DMERR("Couldn't add page to the log block"); + goto error_bio; + } + submit_bio(bio); + return 0; +error_bio: + bio_put(bio); + __free_page(page); +error: + put_io_block(lc); + return -1; +} + +static int write_inline_data(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + int bio_pages, pg_datalen, pg_sectorlen, i; + struct page *page; + struct bio *bio; + size_t ret; + void *ptr; + + while (datalen) { + bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE)); + + atomic_inc(&lc->io_blocks); + + bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE, + GFP_KERNEL); + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + + for (i = 0; i < bio_pages; i++) { + pg_datalen = min_t(int, datalen, PAGE_SIZE); + pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); + + page = alloc_page(GFP_KERNEL); + if (!page) { + DMERR("Couldn't alloc inline data page"); + goto error_bio; + } + + ptr = kmap_atomic(page); + memcpy(ptr, data, pg_datalen); + if (pg_sectorlen > pg_datalen) + memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, pg_sectorlen, 0); + if (ret != pg_sectorlen) { + DMERR("Couldn't add page of inline data"); + __free_page(page); + goto error_bio; + } + + datalen -= pg_datalen; + data += pg_datalen; + } + submit_bio(bio); + + sector += bio_pages * PAGE_SECTORS; + } + return 0; +error_bio: + bio_free_pages(bio); + bio_put(bio); + put_io_block(lc); + return -1; +} + +static int log_one_block(struct log_writes_c *lc, + struct pending_block *block, sector_t sector) +{ + struct bio *bio; + struct log_write_entry entry; + size_t metadatalen, ret; + int i; + + entry.sector = cpu_to_le64(block->sector); + entry.nr_sectors = cpu_to_le64(block->nr_sectors); + entry.flags = cpu_to_le64(block->flags); + entry.data_len = cpu_to_le64(block->datalen); + + metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; + if (write_metadata(lc, &entry, sizeof(entry), block->data, + metadatalen, sector)) { + free_pending_block(lc, block); + return -1; + } + + sector += dev_to_bio_sectors(lc, 1); + + if (block->datalen && metadatalen == 0) { + if (write_inline_data(lc, &entry, sizeof(entry), block->data, + block->datalen, sector)) { + free_pending_block(lc, block); + return -1; + } + /* we don't support both inline data & bio data */ + goto out; + } + + if (!block->vec_cnt) + goto out; + + atomic_inc(&lc->io_blocks); + bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt), + REQ_OP_WRITE, GFP_KERNEL); + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + + for (i = 0; i < block->vec_cnt; i++) { + /* + * The page offset is always 0 because we allocate a new page + * for every bvec in the original bio for simplicity sake. + */ + ret = bio_add_page(bio, block->vecs[i].bv_page, + block->vecs[i].bv_len, 0); + if (ret != block->vecs[i].bv_len) { + atomic_inc(&lc->io_blocks); + submit_bio(bio); + bio = bio_alloc(lc->logdev->bdev, + bio_max_segs(block->vec_cnt - i), + REQ_OP_WRITE, GFP_KERNEL); + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + + ret = bio_add_page(bio, block->vecs[i].bv_page, + block->vecs[i].bv_len, 0); + if (ret != block->vecs[i].bv_len) { + DMERR("Couldn't add page on new bio?"); + bio_put(bio); + goto error; + } + } + sector += block->vecs[i].bv_len >> SECTOR_SHIFT; + } + submit_bio(bio); +out: + kfree(block->data); + kfree(block); + put_pending_block(lc); + return 0; +error: + free_pending_block(lc, block); + put_io_block(lc); + return -1; +} + +static int log_super(struct log_writes_c *lc) +{ + struct log_write_super super; + + super.magic = cpu_to_le64(WRITE_LOG_MAGIC); + super.version = cpu_to_le64(WRITE_LOG_VERSION); + super.nr_entries = cpu_to_le64(lc->logged_entries); + super.sectorsize = cpu_to_le32(lc->sectorsize); + + if (write_metadata(lc, &super, sizeof(super), NULL, 0, + WRITE_LOG_SUPER_SECTOR)) { + DMERR("Couldn't write super"); + return -1; + } + + /* + * Super sector should be writen in-order, otherwise the + * nr_entries could be rewritten incorrectly by an old bio. + */ + wait_for_completion_io(&lc->super_done); + + return 0; +} + +static inline sector_t logdev_last_sector(struct log_writes_c *lc) +{ + return bdev_nr_sectors(lc->logdev->bdev); +} + +static int log_writes_kthread(void *arg) +{ + struct log_writes_c *lc = (struct log_writes_c *)arg; + sector_t sector = 0; + + while (!kthread_should_stop()) { + bool super = false; + bool logging_enabled; + struct pending_block *block = NULL; + int ret; + + spin_lock_irq(&lc->blocks_lock); + if (!list_empty(&lc->logging_blocks)) { + block = list_first_entry(&lc->logging_blocks, + struct pending_block, list); + list_del_init(&block->list); + if (!lc->logging_enabled) + goto next; + + sector = lc->next_sector; + if (!(block->flags & LOG_DISCARD_FLAG)) + lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors); + lc->next_sector += dev_to_bio_sectors(lc, 1); + + /* + * Apparently the size of the device may not be known + * right away, so handle this properly. + */ + if (!lc->end_sector) + lc->end_sector = logdev_last_sector(lc); + if (lc->end_sector && + lc->next_sector >= lc->end_sector) { + DMERR("Ran out of space on the logdev"); + lc->logging_enabled = false; + goto next; + } + lc->logged_entries++; + atomic_inc(&lc->io_blocks); + + super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG)); + if (super) + atomic_inc(&lc->io_blocks); + } +next: + logging_enabled = lc->logging_enabled; + spin_unlock_irq(&lc->blocks_lock); + if (block) { + if (logging_enabled) { + ret = log_one_block(lc, block, sector); + if (!ret && super) + ret = log_super(lc); + if (ret) { + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + } + } else + free_pending_block(lc, block); + continue; + } + + if (!try_to_freeze()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + list_empty(&lc->logging_blocks)) + schedule(); + __set_current_state(TASK_RUNNING); + } + } + return 0; +} + +/* + * Construct a log-writes mapping: + * log-writes + */ +static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct log_writes_c *lc; + struct dm_arg_set as; + const char *devname, *logdevname; + int ret; + + as.argc = argc; + as.argv = argv; + + if (argc < 2) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL); + if (!lc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + spin_lock_init(&lc->blocks_lock); + INIT_LIST_HEAD(&lc->unflushed_blocks); + INIT_LIST_HEAD(&lc->logging_blocks); + init_waitqueue_head(&lc->wait); + init_completion(&lc->super_done); + atomic_set(&lc->io_blocks, 0); + atomic_set(&lc->pending_blocks, 0); + + devname = dm_shift_arg(&as); + ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + logdevname = dm_shift_arg(&as); + ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), + &lc->logdev); + if (ret) { + ti->error = "Log device lookup failed"; + dm_put_device(ti, lc->dev); + goto bad; + } + + lc->sectorsize = bdev_logical_block_size(lc->dev->bdev); + lc->sectorshift = ilog2(lc->sectorsize); + lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); + if (IS_ERR(lc->log_kthread)) { + ret = PTR_ERR(lc->log_kthread); + ti->error = "Couldn't alloc kthread"; + dm_put_device(ti, lc->dev); + dm_put_device(ti, lc->logdev); + goto bad; + } + + /* + * next_sector is in 512b sectors to correspond to what bi_sector expects. + * The super starts at sector 0, and the next_sector is the next logical + * one based on the sectorsize of the device. + */ + lc->next_sector = lc->sectorsize >> SECTOR_SHIFT; + lc->logging_enabled = true; + lc->end_sector = logdev_last_sector(lc); + lc->device_supports_discard = true; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->num_discard_bios = 1; + ti->discards_supported = true; + ti->per_io_data_size = sizeof(struct per_bio_data); + ti->private = lc; + return 0; + +bad: + kfree(lc); + return ret; +} + +static int log_mark(struct log_writes_c *lc, char *data) +{ + struct pending_block *block; + size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry); + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating pending block"); + return -ENOMEM; + } + + block->data = kstrndup(data, maxsize - 1, GFP_KERNEL); + if (!block->data) { + DMERR("Error copying mark data"); + kfree(block); + return -ENOMEM; + } + atomic_inc(&lc->pending_blocks); + block->datalen = strlen(block->data); + block->flags |= LOG_MARK_FLAG; + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->logging_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + return 0; +} + +static void log_writes_dtr(struct dm_target *ti) +{ + struct log_writes_c *lc = ti->private; + + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks); + spin_unlock_irq(&lc->blocks_lock); + + /* + * This is just nice to have since it'll update the super to include the + * unflushed blocks, if it fails we don't really care. + */ + log_mark(lc, "dm-log-writes-end"); + wake_up_process(lc->log_kthread); + wait_event(lc->wait, !atomic_read(&lc->io_blocks) && + !atomic_read(&lc->pending_blocks)); + kthread_stop(lc->log_kthread); + + WARN_ON(!list_empty(&lc->logging_blocks)); + WARN_ON(!list_empty(&lc->unflushed_blocks)); + dm_put_device(ti, lc->dev); + dm_put_device(ti, lc->logdev); + kfree(lc); +} + +static void normal_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct log_writes_c *lc = ti->private; + + bio_set_dev(bio, lc->dev->bdev); +} + +static int log_writes_map(struct dm_target *ti, struct bio *bio) +{ + struct log_writes_c *lc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + struct pending_block *block; + struct bvec_iter iter; + struct bio_vec bv; + size_t alloc_size; + int i = 0; + bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); + bool fua_bio = (bio->bi_opf & REQ_FUA); + bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); + bool meta_bio = (bio->bi_opf & REQ_META); + + pb->block = NULL; + + /* Don't bother doing anything if logging has been disabled */ + if (!lc->logging_enabled) + goto map_bio; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* No sectors and not a flush? Don't care */ + if (!bio_sectors(bio) && !flush_bio) + goto map_bio; + + /* + * Discards will have bi_size set but there's no actual data, so just + * allocate the size of the pending block. + */ + if (discard_bio) + alloc_size = sizeof(struct pending_block); + else + alloc_size = struct_size(block, vecs, bio_segments(bio)); + + block = kzalloc(alloc_size, GFP_NOIO); + if (!block) { + DMERR("Error allocating pending block"); + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + return DM_MAPIO_KILL; + } + INIT_LIST_HEAD(&block->list); + pb->block = block; + atomic_inc(&lc->pending_blocks); + + if (flush_bio) + block->flags |= LOG_FLUSH_FLAG; + if (fua_bio) + block->flags |= LOG_FUA_FLAG; + if (discard_bio) + block->flags |= LOG_DISCARD_FLAG; + if (meta_bio) + block->flags |= LOG_METADATA_FLAG; + + block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector); + block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); + + /* We don't need the data, just submit */ + if (discard_bio) { + WARN_ON(flush_bio || fua_bio); + if (lc->device_supports_discard) + goto map_bio; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + /* Flush bio, splice the unflushed blocks onto this list and submit */ + if (flush_bio && !bio_sectors(bio)) { + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &block->list); + spin_unlock_irq(&lc->blocks_lock); + goto map_bio; + } + + /* + * We will write this bio somewhere else way later so we need to copy + * the actual contents into new pages so we know the data will always be + * there. + * + * We do this because this could be a bio from O_DIRECT in which case we + * can't just hold onto the page until some later point, we have to + * manually copy the contents. + */ + bio_for_each_segment(bv, bio, iter) { + struct page *page; + void *dst; + + page = alloc_page(GFP_NOIO); + if (!page) { + DMERR("Error allocing page"); + free_pending_block(lc, block); + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + return DM_MAPIO_KILL; + } + + dst = kmap_atomic(page); + memcpy_from_bvec(dst, &bv); + kunmap_atomic(dst); + block->vecs[i].bv_page = page; + block->vecs[i].bv_len = bv.bv_len; + block->vec_cnt++; + i++; + } + + /* Had a flush with data in it, weird */ + if (flush_bio) { + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &block->list); + spin_unlock_irq(&lc->blocks_lock); + } +map_bio: + normal_map_bio(ti, bio); + return DM_MAPIO_REMAPPED; +} + +static int normal_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) +{ + struct log_writes_c *lc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + + if (bio_data_dir(bio) == WRITE && pb->block) { + struct pending_block *block = pb->block; + unsigned long flags; + + spin_lock_irqsave(&lc->blocks_lock, flags); + if (block->flags & LOG_FLUSH_FLAG) { + list_splice_tail_init(&block->list, &lc->logging_blocks); + list_add_tail(&block->list, &lc->logging_blocks); + wake_up_process(lc->log_kthread); + } else if (block->flags & LOG_FUA_FLAG) { + list_add_tail(&block->list, &lc->logging_blocks); + wake_up_process(lc->log_kthread); + } else + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irqrestore(&lc->blocks_lock, flags); + } + + return DM_ENDIO_DONE; +} + +/* + * INFO format: + */ +static void log_writes_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + unsigned int sz = 0; + struct log_writes_c *lc = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu", lc->logged_entries, + (unsigned long long)lc->next_sector - 1); + if (!lc->logging_enabled) + DMEMIT(" logging_disabled"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %s", lc->dev->name, lc->logdev->name); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int log_writes_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev) +{ + struct log_writes_c *lc = ti->private; + struct dm_dev *dev = lc->dev; + + *bdev = dev->bdev; + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (ti->len != bdev_nr_sectors(dev->bdev)) + return 1; + return 0; +} + +static int log_writes_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + struct log_writes_c *lc = ti->private; + + return fn(ti, lc->dev, 0, ti->len, data); +} + +/* + * Messages supported: + * mark - specify the marked data. + */ +static int log_writes_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r = -EINVAL; + struct log_writes_c *lc = ti->private; + + if (argc != 2) { + DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc); + return r; + } + + if (!strcasecmp(argv[0], "mark")) + r = log_mark(lc, argv[1]); + else + DMWARN("Unrecognised log writes target message received: %s", argv[0]); + + return r; +} + +static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct log_writes_c *lc = ti->private; + + if (!bdev_max_discard_sectors(lc->dev->bdev)) { + lc->device_supports_discard = false; + limits->discard_granularity = lc->sectorsize; + limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); + } + limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev); + limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev); + limits->io_min = limits->physical_block_size; + limits->dma_alignment = limits->logical_block_size - 1; +} + +#if IS_ENABLED(CONFIG_FS_DAX) +static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, + pgoff_t *pgoff) +{ + struct log_writes_c *lc = ti->private; + + *pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT); + return lc->dev->dax_dev; +} + +static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); +} + +static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, + size_t nr_pages) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); +} + +static size_t log_writes_dax_recovery_write(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + +#else +#define log_writes_dax_direct_access NULL +#define log_writes_dax_zero_page_range NULL +#define log_writes_dax_recovery_write NULL +#endif + +static struct target_type log_writes_target = { + .name = "log-writes", + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = log_writes_ctr, + .dtr = log_writes_dtr, + .map = log_writes_map, + .end_io = normal_end_io, + .status = log_writes_status, + .prepare_ioctl = log_writes_prepare_ioctl, + .message = log_writes_message, + .iterate_devices = log_writes_iterate_devices, + .io_hints = log_writes_io_hints, + .direct_access = log_writes_dax_direct_access, + .dax_zero_page_range = log_writes_dax_zero_page_range, + .dax_recovery_write = log_writes_dax_recovery_write, +}; + +static int __init dm_log_writes_init(void) +{ + int r = dm_register_target(&log_writes_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_log_writes_exit(void) +{ + dm_unregister_target(&log_writes_target); +} + +module_init(dm_log_writes_init); +module_exit(dm_log_writes_exit); + +MODULE_DESCRIPTION(DM_NAME " log writes target"); +MODULE_AUTHOR("Josef Bacik "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c new file mode 100644 index 000000000..05141eea1 --- /dev/null +++ b/drivers/md/dm-log.c @@ -0,0 +1,896 @@ +/* + * Copyright (C) 2003 Sistina Software + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define DM_MSG_PREFIX "dirty region log" + +static LIST_HEAD(_log_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) +{ + struct dm_dirty_log_type *log_type; + + list_for_each_entry(log_type, &_log_types, list) + if (!strcmp(name, log_type->name)) + return log_type; + + return NULL; +} + +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) +{ + struct dm_dirty_log_type *log_type; + + spin_lock(&_lock); + + log_type = __find_dirty_log_type(name); + if (log_type && !try_module_get(log_type->module)) + log_type = NULL; + + spin_unlock(&_lock); + + return log_type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_dirty_log_type by name. If not already + * available, attempt to load the appropriate module. + * + * Log modules are named "dm-log-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-log-", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-disk", it would search + * 'dm-log-clustered-disk' then 'dm-log-clustered'. + * + * Returns: dirty_log_type* on success, NULL on failure + */ +static struct dm_dirty_log_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_dirty_log_type *log_type; + + if (!type_name) + return NULL; + + log_type = _get_dirty_log_type(type_name); + if (log_type) + return log_type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMWARN("No memory left to attempt log module load for \"%s\"", + type_name); + return NULL; + } + + while (request_module("dm-log-%s", type_name_dup) || + !(log_type = _get_dirty_log_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!log_type) + DMWARN("Module for logging type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return log_type; +} + +static void put_type(struct dm_dirty_log_type *type) +{ + if (!type) + return; + + spin_lock(&_lock); + if (!__find_dirty_log_type(type->name)) + goto out; + + module_put(type->module); + +out: + spin_unlock(&_lock); +} + +int dm_dirty_log_type_register(struct dm_dirty_log_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_dirty_log_type(type->name)) + list_add(&type->list, &_log_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_dirty_log_type_register); + +int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) +{ + spin_lock(&_lock); + + if (!__find_dirty_log_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_dirty_log_type_unregister); + +struct dm_dirty_log *dm_dirty_log_create(const char *type_name, + struct dm_target *ti, + int (*flush_callback_fn)(struct dm_target *ti), + unsigned int argc, char **argv) +{ + struct dm_dirty_log_type *type; + struct dm_dirty_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return NULL; + + type = get_type(type_name); + if (!type) { + kfree(log); + return NULL; + } + + log->flush_callback_fn = flush_callback_fn; + log->type = type; + if (type->ctr(log, ti, argc, argv)) { + kfree(log); + put_type(type); + return NULL; + } + + return log; +} +EXPORT_SYMBOL(dm_dirty_log_create); + +void dm_dirty_log_destroy(struct dm_dirty_log *log) +{ + log->type->dtr(log); + put_type(log->type); + kfree(log); +} +EXPORT_SYMBOL(dm_dirty_log_destroy); + +/*----------------------------------------------------------------- + * Persistent and core logs share a lot of their implementation. + * FIXME: need a reload method to be called from a resume + *---------------------------------------------------------------*/ +/* + * Magic for persistent mirrors: "MiRr" + */ +#define MIRROR_MAGIC 0x4D695272 + +/* + * The on-disk version of the metadata. + */ +#define MIRROR_DISK_VERSION 2 +#define LOG_OFFSET 2 + +struct log_header_disk { + __le32 magic; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; + uint32_t version; + uint64_t nr_regions; +}; + +struct log_c { + struct dm_target *ti; + int touched_dirtied; + int touched_cleaned; + int flush_failed; + uint32_t region_size; + unsigned int region_count; + region_t sync_count; + + unsigned int bitset_uint32_count; + uint32_t *clean_bits; + uint32_t *sync_bits; + uint32_t *recovering_bits; /* FIXME: this seems excessive */ + + int sync_search; + + /* Resync flag */ + enum sync { + DEFAULTSYNC, /* Synchronize if necessary */ + NOSYNC, /* Devices known to be already in sync */ + FORCESYNC, /* Force a sync to happen */ + } sync; + + struct dm_io_request io_req; + + /* + * Disk log fields + */ + int log_dev_failed; + int log_dev_flush_failed; + struct dm_dev *log_dev; + struct log_header_core header; + + struct dm_io_region header_location; + struct log_header_disk *disk_header; +}; + +/* + * The touched member needs to be updated every time we access + * one of the bitsets. + */ +static inline int log_test_bit(uint32_t *bs, unsigned int bit) +{ + return test_bit_le(bit, bs) ? 1 : 0; +} + +static inline void log_set_bit(struct log_c *l, + uint32_t *bs, unsigned int bit) +{ + __set_bit_le(bit, bs); + l->touched_cleaned = 1; +} + +static inline void log_clear_bit(struct log_c *l, + uint32_t *bs, unsigned int bit) +{ + __clear_bit_le(bit, bs); + l->touched_dirtied = 1; +} + +/*---------------------------------------------------------------- + * Header IO + *--------------------------------------------------------------*/ +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) +{ + disk->magic = cpu_to_le32(core->magic); + disk->version = cpu_to_le32(core->version); + disk->nr_regions = cpu_to_le64(core->nr_regions); +} + +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) +{ + core->magic = le32_to_cpu(disk->magic); + core->version = le32_to_cpu(disk->version); + core->nr_regions = le64_to_cpu(disk->nr_regions); +} + +static int rw_header(struct log_c *lc, enum req_op op) +{ + lc->io_req.bi_opf = op; + + return dm_io(&lc->io_req, 1, &lc->header_location, NULL); +} + +static int flush_header(struct log_c *lc) +{ + struct dm_io_region null_location = { + .bdev = lc->header_location.bdev, + .sector = 0, + .count = 0, + }; + + lc->io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return dm_io(&lc->io_req, 1, &null_location, NULL); +} + +static int read_header(struct log_c *log) +{ + int r; + + r = rw_header(log, REQ_OP_READ); + if (r) + return r; + + header_from_disk(&log->header, log->disk_header); + + /* New log required? */ + if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { + log->header.magic = MIRROR_MAGIC; + log->header.version = MIRROR_DISK_VERSION; + log->header.nr_regions = 0; + } + +#ifdef __LITTLE_ENDIAN + if (log->header.version == 1) + log->header.version = 2; +#endif + + if (log->header.version != MIRROR_DISK_VERSION) { + DMWARN("incompatible disk log version"); + return -EINVAL; + } + + return 0; +} + +static int _check_region_size(struct dm_target *ti, uint32_t region_size) +{ + if (region_size < 2 || region_size > ti->len) + return 0; + + if (!is_power_of_2(region_size)) + return 0; + + return 1; +} + +/*---------------------------------------------------------------- + * core log constructor/destructor + * + * argv contains region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +#define BYTE_SHIFT 3 +static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv, + struct dm_dev *dev) +{ + enum sync sync = DEFAULTSYNC; + + struct log_c *lc; + uint32_t region_size; + unsigned int region_count; + size_t bitset_size, buf_size; + int r; + char dummy; + + if (argc < 1 || argc > 2) { + DMWARN("wrong number of arguments to dirty region log"); + return -EINVAL; + } + + if (argc > 1) { + if (!strcmp(argv[1], "sync")) + sync = FORCESYNC; + else if (!strcmp(argv[1], "nosync")) + sync = NOSYNC; + else { + DMWARN("unrecognised sync argument to dirty region log: %s", argv[1]); + return -EINVAL; + } + } + + if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || + !_check_region_size(ti, region_size)) { + DMWARN("invalid region size %s", argv[0]); + return -EINVAL; + } + + region_count = dm_sector_div_up(ti->len, region_size); + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("couldn't allocate core log"); + return -ENOMEM; + } + + lc->ti = ti; + lc->touched_dirtied = 0; + lc->touched_cleaned = 0; + lc->flush_failed = 0; + lc->region_size = region_size; + lc->region_count = region_count; + lc->sync = sync; + + /* + * Work out how many "unsigned long"s we need to hold the bitset. + */ + bitset_size = dm_round_up(region_count, BITS_PER_LONG); + bitset_size >>= BYTE_SHIFT; + + lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits); + + /* + * Disk log? + */ + if (!dev) { + lc->clean_bits = vmalloc(bitset_size); + if (!lc->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(lc); + return -ENOMEM; + } + lc->disk_header = NULL; + } else { + lc->log_dev = dev; + lc->log_dev_failed = 0; + lc->log_dev_flush_failed = 0; + lc->header_location.bdev = lc->log_dev->bdev; + lc->header_location.sector = 0; + + /* + * Buffer holds both header and bitset. + */ + buf_size = + dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, + bdev_logical_block_size(lc->header_location. + bdev)); + + if (buf_size > bdev_nr_bytes(dev->bdev)) { + DMWARN("log device %s too small: need %llu bytes", + dev->name, (unsigned long long)buf_size); + kfree(lc); + return -EINVAL; + } + + lc->header_location.count = buf_size >> SECTOR_SHIFT; + + lc->io_req.mem.type = DM_IO_VMA; + lc->io_req.notify.fn = NULL; + lc->io_req.client = dm_io_client_create(); + if (IS_ERR(lc->io_req.client)) { + r = PTR_ERR(lc->io_req.client); + DMWARN("couldn't allocate disk io client"); + kfree(lc); + return r; + } + + lc->disk_header = vmalloc(buf_size); + if (!lc->disk_header) { + DMWARN("couldn't allocate disk log buffer"); + dm_io_client_destroy(lc->io_req.client); + kfree(lc); + return -ENOMEM; + } + + lc->io_req.mem.ptr.vma = lc->disk_header; + lc->clean_bits = (void *)lc->disk_header + + (LOG_OFFSET << SECTOR_SHIFT); + } + + memset(lc->clean_bits, -1, bitset_size); + + lc->sync_bits = vmalloc(bitset_size); + if (!lc->sync_bits) { + DMWARN("couldn't allocate sync bitset"); + if (!dev) + vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); + vfree(lc->disk_header); + kfree(lc); + return -ENOMEM; + } + memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); + lc->sync_count = (sync == NOSYNC) ? region_count : 0; + + lc->recovering_bits = vzalloc(bitset_size); + if (!lc->recovering_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(lc->sync_bits); + if (!dev) + vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); + vfree(lc->disk_header); + kfree(lc); + return -ENOMEM; + } + lc->sync_search = 0; + log->context = lc; + + return 0; +} + +static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + return create_log_context(log, ti, argc, argv, NULL); +} + +static void destroy_log_context(struct log_c *lc) +{ + vfree(lc->sync_bits); + vfree(lc->recovering_bits); + kfree(lc); +} + +static void core_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + vfree(lc->clean_bits); + destroy_log_context(lc); +} + +/*---------------------------------------------------------------- + * disk log constructor/destructor + * + * argv contains log_device region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc < 2 || argc > 3) { + DMWARN("wrong number of arguments to disk dirty region log"); + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); + if (r) + return r; + + r = create_log_context(log, ti, argc - 1, argv + 1, dev); + if (r) { + dm_put_device(ti, dev); + return r; + } + + return 0; +} + +static void disk_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + dm_put_device(lc->ti, lc->log_dev); + vfree(lc->disk_header); + dm_io_client_destroy(lc->io_req.client); + destroy_log_context(lc); +} + +static void fail_log_device(struct log_c *lc) +{ + if (lc->log_dev_failed) + return; + + lc->log_dev_failed = 1; + dm_table_event(lc->ti->table); +} + +static int disk_resume(struct dm_dirty_log *log) +{ + int r; + unsigned int i; + struct log_c *lc = (struct log_c *) log->context; + size_t size = lc->bitset_uint32_count * sizeof(uint32_t); + + /* read the disk header */ + r = read_header(lc); + if (r) { + DMWARN("%s: Failed to read header on dirty region log device", + lc->log_dev->name); + fail_log_device(lc); + /* + * If the log device cannot be read, we must assume + * all regions are out-of-sync. If we simply return + * here, the state will be uninitialized and could + * lead us to return 'in-sync' status for regions + * that are actually 'out-of-sync'. + */ + lc->header.nr_regions = 0; + } + + /* set or clear any new bits -- device has grown */ + if (lc->sync == NOSYNC) + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_set_bit(lc, lc->clean_bits, i); + else + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_clear_bit(lc, lc->clean_bits, i); + + /* clear any old bits -- device has shrunk */ + for (i = lc->region_count; i % BITS_PER_LONG; i++) + log_clear_bit(lc, lc->clean_bits, i); + + /* copy clean across to sync */ + memcpy(lc->sync_bits, lc->clean_bits, size); + lc->sync_count = memweight(lc->clean_bits, + lc->bitset_uint32_count * sizeof(uint32_t)); + lc->sync_search = 0; + + /* set the correct number of regions in the header */ + lc->header.nr_regions = lc->region_count; + + header_to_disk(&lc->header, lc->disk_header); + + /* write the new header */ + r = rw_header(lc, REQ_OP_WRITE); + if (!r) { + r = flush_header(lc); + if (r) + lc->log_dev_flush_failed = 1; + } + if (r) { + DMWARN("%s: Failed to write header on dirty region log device", + lc->log_dev->name); + fail_log_device(lc); + } + + return r; +} + +static uint32_t core_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + return lc->region_size; +} + +static int core_resume(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + lc->sync_search = 0; + return 0; +} + +static int core_is_clean(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + return log_test_bit(lc->clean_bits, region); +} + +static int core_in_sync(struct dm_dirty_log *log, region_t region, int block) +{ + struct log_c *lc = (struct log_c *) log->context; + return log_test_bit(lc->sync_bits, region); +} + +static int core_flush(struct dm_dirty_log *log) +{ + /* no op */ + return 0; +} + +static int disk_flush(struct dm_dirty_log *log) +{ + int r, i; + struct log_c *lc = log->context; + + /* only write if the log has changed */ + if (!lc->touched_cleaned && !lc->touched_dirtied) + return 0; + + if (lc->touched_cleaned && log->flush_callback_fn && + log->flush_callback_fn(lc->ti)) { + /* + * At this point it is impossible to determine which + * regions are clean and which are dirty (without + * re-reading the log off disk). So mark all of them + * dirty. + */ + lc->flush_failed = 1; + for (i = 0; i < lc->region_count; i++) + log_clear_bit(lc, lc->clean_bits, i); + } + + r = rw_header(lc, REQ_OP_WRITE); + if (r) + fail_log_device(lc); + else { + if (lc->touched_dirtied) { + r = flush_header(lc); + if (r) { + lc->log_dev_flush_failed = 1; + fail_log_device(lc); + } else + lc->touched_dirtied = 0; + } + lc->touched_cleaned = 0; + } + + return r; +} + +static void core_mark_region(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + log_clear_bit(lc, lc->clean_bits, region); +} + +static void core_clear_region(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + if (likely(!lc->flush_failed)) + log_set_bit(lc, lc->clean_bits, region); +} + +static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + struct log_c *lc = (struct log_c *) log->context; + + if (lc->sync_search >= lc->region_count) + return 0; + + do { + *region = find_next_zero_bit_le(lc->sync_bits, + lc->region_count, + lc->sync_search); + lc->sync_search = *region + 1; + + if (*region >= lc->region_count) + return 0; + + } while (log_test_bit(lc->recovering_bits, *region)); + + log_set_bit(lc, lc->recovering_bits, *region); + return 1; +} + +static void core_set_region_sync(struct dm_dirty_log *log, region_t region, + int in_sync) +{ + struct log_c *lc = (struct log_c *) log->context; + + log_clear_bit(lc, lc->recovering_bits, region); + if (in_sync) { + log_set_bit(lc, lc->sync_bits, region); + lc->sync_count++; + } else if (log_test_bit(lc->sync_bits, region)) { + lc->sync_count--; + log_clear_bit(lc, lc->sync_bits, region); + } +} + +static region_t core_get_sync_count(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + return lc->sync_count; +} + +#define DMEMIT_SYNC \ + if (lc->sync != DEFAULTSYNC) \ + DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") + +static int core_status(struct dm_dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("1 %s", log->type->name); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); + DMEMIT_SYNC; + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return sz; +} + +static int disk_status(struct dm_dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, + lc->log_dev_flush_failed ? 'F' : + lc->log_dev_failed ? 'D' : + 'A'); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %s %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name, + lc->region_size); + DMEMIT_SYNC; + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return sz; +} + +static struct dm_dirty_log_type _core_type = { + .name = "core", + .module = THIS_MODULE, + .ctr = core_ctr, + .dtr = core_dtr, + .resume = core_resume, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = core_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .set_region_sync = core_set_region_sync, + .get_sync_count = core_get_sync_count, + .status = core_status, +}; + +static struct dm_dirty_log_type _disk_type = { + .name = "disk", + .module = THIS_MODULE, + .ctr = disk_ctr, + .dtr = disk_dtr, + .postsuspend = disk_flush, + .resume = disk_resume, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = disk_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .set_region_sync = core_set_region_sync, + .get_sync_count = core_get_sync_count, + .status = disk_status, +}; + +static int __init dm_dirty_log_init(void) +{ + int r; + + r = dm_dirty_log_type_register(&_core_type); + if (r) + DMWARN("couldn't register core log"); + + r = dm_dirty_log_type_register(&_disk_type); + if (r) { + DMWARN("couldn't register disk type"); + dm_dirty_log_type_unregister(&_core_type); + } + + return r; +} + +static void __exit dm_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_disk_type); + dm_dirty_log_type_unregister(&_core_type); +} + +module_init(dm_dirty_log_init); +module_exit(dm_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " dirty region log"); +MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c new file mode 100644 index 000000000..66032ab3c --- /dev/null +++ b/drivers/md/dm-mpath.c @@ -0,0 +1,2256 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include "dm-rq.h" +#include "dm-bio-record.h" +#include "dm-path-selector.h" +#include "dm-uevent.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "multipath" +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned int) -1) +#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0 + +static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT; + +/* Path properties */ +struct pgpath { + struct list_head list; + + struct priority_group *pg; /* Owning PG */ + unsigned int fail_count; /* Cumulative failure count */ + + struct dm_path path; + struct delayed_work activate_path; + + bool is_active:1; /* Path status */ +}; + +#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) + +/* + * Paths are grouped into Priority Groups and numbered from 1 upwards. + * Each has a path selector which controls which path gets used. + */ +struct priority_group { + struct list_head list; + + struct multipath *m; /* Owning multipath instance */ + struct path_selector ps; + + unsigned int pg_num; /* Reference number */ + unsigned int nr_pgpaths; /* Number of paths in PG */ + struct list_head pgpaths; + + bool bypassed:1; /* Temporarily bypass this PG? */ +}; + +/* Multipath context */ +struct multipath { + unsigned long flags; /* Multipath state flags */ + + spinlock_t lock; + enum dm_queue_mode queue_mode; + + struct pgpath *current_pgpath; + struct priority_group *current_pg; + struct priority_group *next_pg; /* Switch to this PG if set */ + + atomic_t nr_valid_paths; /* Total number of usable paths */ + unsigned int nr_priority_groups; + struct list_head priority_groups; + + const char *hw_handler_name; + char *hw_handler_params; + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned int pg_init_retries; /* Number of times to retry pg_init */ + unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ + atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ + atomic_t pg_init_count; /* Number of times pg_init called */ + + struct mutex work_mutex; + struct work_struct trigger_event; + struct dm_target *ti; + + struct work_struct process_queued_bios; + struct bio_list queued_bios; + + struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ +}; + +/* + * Context information attached to each io we process. + */ +struct dm_mpath_io { + struct pgpath *pgpath; + size_t nr_bytes; + u64 start_time_ns; +}; + +typedef int (*action_fn) (struct pgpath *pgpath); + +static struct workqueue_struct *kmultipathd, *kmpath_handlerd; +static void trigger_event(struct work_struct *work); +static void activate_or_offline_path(struct pgpath *pgpath); +static void activate_path_work(struct work_struct *work); +static void process_queued_bios(struct work_struct *work); +static void queue_if_no_path_timeout_work(struct timer_list *t); + +/*----------------------------------------------- + * Multipath state flags. + *-----------------------------------------------*/ + +#define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ +#define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ +#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ +#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ +#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ +#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ +#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ + +static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) +{ + bool r = test_bit(MPATHF_bit, &m->flags); + + if (r) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + r = test_bit(MPATHF_bit, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } + + return r; +} + +/*----------------------------------------------- + * Allocation routines + *-----------------------------------------------*/ + +static struct pgpath *alloc_pgpath(void) +{ + struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); + + if (!pgpath) + return NULL; + + pgpath->is_active = true; + + return pgpath; +} + +static void free_pgpath(struct pgpath *pgpath) +{ + kfree(pgpath); +} + +static struct priority_group *alloc_priority_group(void) +{ + struct priority_group *pg; + + pg = kzalloc(sizeof(*pg), GFP_KERNEL); + + if (pg) + INIT_LIST_HEAD(&pg->pgpaths); + + return pg; +} + +static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) +{ + struct pgpath *pgpath, *tmp; + + list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { + list_del(&pgpath->list); + dm_put_device(ti, pgpath->path.dev); + free_pgpath(pgpath); + } +} + +static void free_priority_group(struct priority_group *pg, + struct dm_target *ti) +{ + struct path_selector *ps = &pg->ps; + + if (ps->type) { + ps->type->destroy(ps); + dm_put_path_selector(ps->type); + } + + free_pgpaths(&pg->pgpaths, ti); + kfree(pg); +} + +static struct multipath *alloc_multipath(struct dm_target *ti) +{ + struct multipath *m; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (m) { + INIT_LIST_HEAD(&m->priority_groups); + spin_lock_init(&m->lock); + atomic_set(&m->nr_valid_paths, 0); + INIT_WORK(&m->trigger_event, trigger_event); + mutex_init(&m->work_mutex); + + m->queue_mode = DM_TYPE_NONE; + + m->ti = ti; + ti->private = m; + + timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0); + } + + return m; +} + +static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) +{ + if (m->queue_mode == DM_TYPE_NONE) { + m->queue_mode = DM_TYPE_REQUEST_BASED; + } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + INIT_WORK(&m->process_queued_bios, process_queued_bios); + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. + */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + } + + dm_table_set_type(ti->table, m->queue_mode); + + /* + * Init fields that are only used when a scsi_dh is attached + * - must do this unconditionally (really doesn't hurt non-SCSI uses) + */ + set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + init_waitqueue_head(&m->pg_init_wait); + + return 0; +} + +static void free_multipath(struct multipath *m) +{ + struct priority_group *pg, *tmp; + + list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { + list_del(&pg->list); + free_priority_group(pg, m->ti); + } + + kfree(m->hw_handler_name); + kfree(m->hw_handler_params); + mutex_destroy(&m->work_mutex); + kfree(m); +} + +static struct dm_mpath_io *get_mpio(union map_info *info) +{ + return info->ptr; +} + +static size_t multipath_per_bio_data_size(void) +{ + return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); +} + +static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) +{ + return dm_per_bio_data(bio, multipath_per_bio_data_size()); +} + +static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio) +{ + /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ + void *bio_details = mpio + 1; + return bio_details; +} + +static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) +{ + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio); + + mpio->nr_bytes = bio->bi_iter.bi_size; + mpio->pgpath = NULL; + mpio->start_time_ns = 0; + *mpio_p = mpio; + + dm_bio_record(bio_details, bio); +} + +/*----------------------------------------------- + * Path selection + *-----------------------------------------------*/ + +static int __pg_init_all_paths(struct multipath *m) +{ + struct pgpath *pgpath; + unsigned long pg_init_delay = 0; + + lockdep_assert_held(&m->lock); + + if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) + return 0; + + atomic_inc(&m->pg_init_count); + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + + /* Check here to reset pg_init_required */ + if (!m->current_pg) + return 0; + + if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); + list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { + /* Skip failed paths */ + if (!pgpath->is_active) + continue; + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) + atomic_inc(&m->pg_init_in_progress); + } + return atomic_read(&m->pg_init_in_progress); +} + +static int pg_init_all_paths(struct multipath *m) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + ret = __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); + + return ret; +} + +static void __switch_pg(struct multipath *m, struct priority_group *pg) +{ + lockdep_assert_held(&m->lock); + + m->current_pg = pg; + + /* Must we initialise the PG first, and queue I/O till it's ready? */ + if (m->hw_handler_name) { + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + set_bit(MPATHF_QUEUE_IO, &m->flags); + } else { + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + clear_bit(MPATHF_QUEUE_IO, &m->flags); + } + + atomic_set(&m->pg_init_count, 0); +} + +static struct pgpath *choose_path_in_pg(struct multipath *m, + struct priority_group *pg, + size_t nr_bytes) +{ + unsigned long flags; + struct dm_path *path; + struct pgpath *pgpath; + + path = pg->ps.type->select_path(&pg->ps, nr_bytes); + if (!path) + return ERR_PTR(-ENXIO); + + pgpath = path_to_pgpath(path); + + if (unlikely(READ_ONCE(m->current_pg) != pg)) { + /* Only update current_pgpath if pg changed */ + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = pgpath; + __switch_pg(m, pg); + spin_unlock_irqrestore(&m->lock, flags); + } + + return pgpath; +} + +static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) +{ + unsigned long flags; + struct priority_group *pg; + struct pgpath *pgpath; + unsigned int bypassed = 1; + + if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); + clear_bit(MPATHF_QUEUE_IO, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + goto failed; + } + + /* Were we instructed to switch PG? */ + if (READ_ONCE(m->next_pg)) { + spin_lock_irqsave(&m->lock, flags); + pg = m->next_pg; + if (!pg) { + spin_unlock_irqrestore(&m->lock, flags); + goto check_current_pg; + } + m->next_pg = NULL; + spin_unlock_irqrestore(&m->lock, flags); + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + + /* Don't change PG until it has no remaining paths */ +check_current_pg: + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + + /* + * Loop through priority groups until we find a valid path. + * First time we skip PGs marked 'bypassed'. + * Second time we only try the ones we skipped, but set + * pg_init_delay_retry so we do not hammer controllers. + */ + do { + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed == !!bypassed) + continue; + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) { + if (!bypassed) { + spin_lock_irqsave(&m->lock, flags); + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } + return pgpath; + } + } + } while (bypassed--); + +failed: + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = NULL; + m->current_pg = NULL; + spin_unlock_irqrestore(&m->lock, flags); + + return NULL; +} + +/* + * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited() + * report the function name and line number of the function from which + * it has been invoked. + */ +#define dm_report_EIO(m) \ +do { \ + DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \ + dm_table_device_name((m)->ti->table), \ + test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ + dm_noflush_suspending((m)->ti)); \ +} while (0) + +/* + * Check whether bios must be queued in the device-mapper core rather + * than here in the target. + */ +static bool __must_push_back(struct multipath *m) +{ + return dm_noflush_suspending(m->ti); +} + +static bool must_push_back_rq(struct multipath *m) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&m->lock, flags); + ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m)); + spin_unlock_irqrestore(&m->lock, flags); + + return ret; +} + +/* + * Map cloned requests (request-based multipath) + */ +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **__clone) +{ + struct multipath *m = ti->private; + size_t nr_bytes = blk_rq_bytes(rq); + struct pgpath *pgpath; + struct block_device *bdev; + struct dm_mpath_io *mpio = get_mpio(map_context); + struct request_queue *q; + struct request *clone; + + /* Do we need to select a new pgpath? */ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, nr_bytes); + + if (!pgpath) { + if (must_push_back_rq(m)) + return DM_MAPIO_DELAY_REQUEUE; + dm_report_EIO(m); /* Failed */ + return DM_MAPIO_KILL; + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { + pg_init_all_paths(m); + return DM_MAPIO_DELAY_REQUEUE; + } + + mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + bdev = pgpath->path.dev->bdev; + q = bdev_get_queue(bdev); + clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE, + BLK_MQ_REQ_NOWAIT); + if (IS_ERR(clone)) { + /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ + if (blk_queue_dying(q)) { + atomic_inc(&m->pg_init_in_progress); + activate_or_offline_path(pgpath); + return DM_MAPIO_DELAY_REQUEUE; + } + + /* + * blk-mq's SCHED_RESTART can cover this requeue, so we + * needn't deal with it by DELAY_REQUEUE. More importantly, + * we have to return DM_MAPIO_REQUEUE so that blk-mq can + * get the queue busy feedback (via BLK_STS_RESOURCE), + * otherwise I/O merging can suffer. + */ + return DM_MAPIO_REQUEUE; + } + clone->bio = clone->biotail = NULL; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + *__clone = clone; + + if (pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, + &pgpath->path, + nr_bytes); + return DM_MAPIO_REMAPPED; +} + +static void multipath_release_clone(struct request *clone, + union map_info *map_context) +{ + if (unlikely(map_context)) { + /* + * non-NULL map_context means caller is still map + * method; must undo multipath_clone_and_map() + */ + struct dm_mpath_io *mpio = get_mpio(map_context); + struct pgpath *pgpath = mpio->pgpath; + + if (pgpath && pgpath->pg->ps.type->end_io) + pgpath->pg->ps.type->end_io(&pgpath->pg->ps, + &pgpath->path, + mpio->nr_bytes, + clone->io_start_time_ns); + } + + blk_mq_free_request(clone); +} + +/* + * Map cloned bios (bio-based multipath) + */ + +static void __multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + /* Queue for the daemon to resubmit */ + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); +} + +static void multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + __multipath_queue_bio(m, bio); + spin_unlock_irqrestore(&m->lock, flags); +} + +static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) +{ + struct pgpath *pgpath; + unsigned long flags; + + /* Do we need to select a new pgpath? */ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); + + if (!pgpath) { + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + __multipath_queue_bio(m, bio); + pgpath = ERR_PTR(-EAGAIN); + } + spin_unlock_irqrestore(&m->lock, flags); + + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { + multipath_queue_bio(m, bio); + pg_init_all_paths(m); + return ERR_PTR(-EAGAIN); + } + + return pgpath; +} + +static int __multipath_map_bio(struct multipath *m, struct bio *bio, + struct dm_mpath_io *mpio) +{ + struct pgpath *pgpath = __map_bio(m, bio); + + if (IS_ERR(pgpath)) + return DM_MAPIO_SUBMITTED; + + if (!pgpath) { + if (__must_push_back(m)) + return DM_MAPIO_REQUEUE; + dm_report_EIO(m); + return DM_MAPIO_KILL; + } + + mpio->pgpath = pgpath; + + if (dm_ps_use_hr_timer(pgpath->pg->ps.type)) + mpio->start_time_ns = ktime_get_ns(); + + bio->bi_status = 0; + bio_set_dev(bio, pgpath->path.dev->bdev); + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; + + if (pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, + &pgpath->path, + mpio->nr_bytes); + return DM_MAPIO_REMAPPED; +} + +static int multipath_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = NULL; + + multipath_init_per_bio_data(bio, &mpio); + return __multipath_map_bio(m, bio, mpio); +} + +static void process_queued_io_list(struct multipath *m) +{ + if (m->queue_mode == DM_TYPE_REQUEST_BASED) + dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); + else if (m->queue_mode == DM_TYPE_BIO_BASED) + queue_work(kmultipathd, &m->process_queued_bios); +} + +static void process_queued_bios(struct work_struct *work) +{ + int r; + unsigned long flags; + struct bio *bio; + struct bio_list bios; + struct blk_plug plug; + struct multipath *m = + container_of(work, struct multipath, process_queued_bios); + + bio_list_init(&bios); + + spin_lock_irqsave(&m->lock, flags); + + if (bio_list_empty(&m->queued_bios)) { + spin_unlock_irqrestore(&m->lock, flags); + return; + } + + bio_list_merge(&bios, &m->queued_bios); + bio_list_init(&m->queued_bios); + + spin_unlock_irqrestore(&m->lock, flags); + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bios))) { + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + dm_bio_restore(get_bio_details_from_mpio(mpio), bio); + r = __multipath_map_bio(m, bio, mpio); + switch (r) { + case DM_MAPIO_KILL: + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + case DM_MAPIO_REQUEUE: + bio->bi_status = BLK_STS_DM_REQUEUE; + bio_endio(bio); + break; + case DM_MAPIO_REMAPPED: + submit_bio_noacct(bio); + break; + case DM_MAPIO_SUBMITTED: + break; + default: + WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); + } + } + blk_finish_plug(&plug); +} + +/* + * If we run out of usable paths, should we queue I/O or error it? + */ +static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, + bool save_old_value, const char *caller) +{ + unsigned long flags; + bool queue_if_no_path_bit, saved_queue_if_no_path_bit; + const char *dm_dev_name = dm_table_device_name(m->ti->table); + + DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d", + dm_dev_name, __func__, caller, queue_if_no_path, save_old_value); + + spin_lock_irqsave(&m->lock, flags); + + queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + + if (save_old_value) { + if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) { + DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!", + dm_dev_name); + } else + assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit); + } else if (!queue_if_no_path && saved_queue_if_no_path_bit) { + /* due to "fail_if_no_path" message, need to honor it. */ + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); + + DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d", + dm_dev_name, __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), + dm_noflush_suspending(m->ti)); + + spin_unlock_irqrestore(&m->lock, flags); + + if (!queue_if_no_path) { + dm_table_run_md_queue_async(m->ti->table); + process_queued_io_list(m); + } + + return 0; +} + +/* + * If the queue_if_no_path timeout fires, turn off queue_if_no_path and + * process any queued I/O. + */ +static void queue_if_no_path_timeout_work(struct timer_list *t) +{ + struct multipath *m = from_timer(m, t, nopath_timer); + + DMWARN("queue_if_no_path timeout on %s, failing queued IO", + dm_table_device_name(m->ti->table)); + queue_if_no_path(m, false, false, __func__); +} + +/* + * Enable the queue_if_no_path timeout if necessary. + * Called with m->lock held. + */ +static void enable_nopath_timeout(struct multipath *m) +{ + unsigned long queue_if_no_path_timeout = + READ_ONCE(queue_if_no_path_timeout_secs) * HZ; + + lockdep_assert_held(&m->lock); + + if (queue_if_no_path_timeout > 0 && + atomic_read(&m->nr_valid_paths) == 0 && + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + mod_timer(&m->nopath_timer, + jiffies + queue_if_no_path_timeout); + } +} + +static void disable_nopath_timeout(struct multipath *m) +{ + del_timer_sync(&m->nopath_timer); +} + +/* + * An event is triggered whenever a path is taken out of use. + * Includes path failure and PG bypass. + */ +static void trigger_event(struct work_struct *work) +{ + struct multipath *m = + container_of(work, struct multipath, trigger_event); + + dm_table_event(m->ti->table); +} + +/*----------------------------------------------------------------- + * Constructor/argument parsing: + * <#multipath feature args> []* + * <#hw_handler args> [hw_handler []*] + * <#priority groups> + * + * [ <#selector args> []* + * <#paths> <#per-path selector args> + * [ []* ]+ ]+ + *---------------------------------------------------------------*/ +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, + struct dm_target *ti) +{ + int r; + struct path_selector_type *pst; + unsigned int ps_argc; + + static const struct dm_arg _args[] = { + {0, 1024, "invalid number of path selector args"}, + }; + + pst = dm_get_path_selector(dm_shift_arg(as)); + if (!pst) { + ti->error = "unknown path selector type"; + return -EINVAL; + } + + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); + if (r) { + dm_put_path_selector(pst); + return -EINVAL; + } + + r = pst->create(&pg->ps, ps_argc, as->argv); + if (r) { + dm_put_path_selector(pst); + ti->error = "path selector constructor failed"; + return r; + } + + pg->ps.type = pst; + dm_consume_args(as, ps_argc); + + return 0; +} + +static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, + const char **attached_handler_name, char **error) +{ + struct request_queue *q = bdev_get_queue(bdev); + int r; + + if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { +retain: + if (*attached_handler_name) { + /* + * Clear any hw_handler_params associated with a + * handler that isn't already attached. + */ + if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) { + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; + } + + /* + * Reset hw_handler_name to match the attached handler + * + * NB. This modifies the table line to show the actual + * handler instead of the original table passed in. + */ + kfree(m->hw_handler_name); + m->hw_handler_name = *attached_handler_name; + *attached_handler_name = NULL; + } + } + + if (m->hw_handler_name) { + r = scsi_dh_attach(q, m->hw_handler_name); + if (r == -EBUSY) { + DMINFO("retaining handler on device %pg", bdev); + goto retain; + } + if (r < 0) { + *error = "error attaching hardware handler"; + return r; + } + + if (m->hw_handler_params) { + r = scsi_dh_set_params(q, m->hw_handler_params); + if (r < 0) { + *error = "unable to set hardware handler parameters"; + return r; + } + } + } + + return 0; +} + +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, + struct dm_target *ti) +{ + int r; + struct pgpath *p; + struct multipath *m = ti->private; + struct request_queue *q; + const char *attached_handler_name = NULL; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = "no device given"; + return ERR_PTR(-EINVAL); + } + + p = alloc_pgpath(); + if (!p) + return ERR_PTR(-ENOMEM); + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); + if (r) { + ti->error = "error getting device"; + goto bad; + } + + q = bdev_get_queue(p->path.dev->bdev); + attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); + if (attached_handler_name || m->hw_handler_name) { + INIT_DELAYED_WORK(&p->activate_path, activate_path_work); + r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); + kfree(attached_handler_name); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + + return p; + bad: + free_pgpath(p); + return ERR_PTR(r); +} + +static struct priority_group *parse_priority_group(struct dm_arg_set *as, + struct multipath *m) +{ + static const struct dm_arg _args[] = { + {1, 1024, "invalid number of paths"}, + {0, 1024, "invalid number of selector args"} + }; + + int r; + unsigned int i, nr_selector_args, nr_args; + struct priority_group *pg; + struct dm_target *ti = m->ti; + + if (as->argc < 2) { + as->argc = 0; + ti->error = "not enough priority group arguments"; + return ERR_PTR(-EINVAL); + } + + pg = alloc_priority_group(); + if (!pg) { + ti->error = "couldn't allocate priority group"; + return ERR_PTR(-ENOMEM); + } + pg->m = m; + + r = parse_path_selector(as, pg, ti); + if (r) + goto bad; + + /* + * read the paths + */ + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); + if (r) + goto bad; + + nr_args = 1 + nr_selector_args; + for (i = 0; i < pg->nr_pgpaths; i++) { + struct pgpath *pgpath; + struct dm_arg_set path_args; + + if (as->argc < nr_args) { + ti->error = "not enough path parameters"; + r = -EINVAL; + goto bad; + } + + path_args.argc = nr_args; + path_args.argv = as->argv; + + pgpath = parse_path(&path_args, &pg->ps, ti); + if (IS_ERR(pgpath)) { + r = PTR_ERR(pgpath); + goto bad; + } + + pgpath->pg = pg; + list_add_tail(&pgpath->list, &pg->pgpaths); + dm_consume_args(as, nr_args); + } + + return pg; + + bad: + free_priority_group(pg, ti); + return ERR_PTR(r); +} + +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) +{ + unsigned int hw_argc; + int ret; + struct dm_target *ti = m->ti; + + static const struct dm_arg _args[] = { + {0, 1024, "invalid number of hardware handler args"}, + }; + + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) + return -EINVAL; + + if (!hw_argc) + return 0; + + if (m->queue_mode == DM_TYPE_BIO_BASED) { + dm_consume_args(as, hw_argc); + DMERR("bio-based multipath doesn't allow hardware handler args"); + return 0; + } + + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); + if (!m->hw_handler_name) + return -EINVAL; + + if (hw_argc > 1) { + char *p; + int i, j, len = 4; + + for (i = 0; i <= hw_argc - 2; i++) + len += strlen(as->argv[i]) + 1; + p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); + if (!p) { + ti->error = "memory allocation failed"; + ret = -ENOMEM; + goto fail; + } + j = sprintf(p, "%d", hw_argc - 1); + for (i = 0, p += j + 1; i <= hw_argc - 2; i++, p += j + 1) + j = sprintf(p, "%s", as->argv[i]); + } + dm_consume_args(as, hw_argc - 1); + + return 0; +fail: + kfree(m->hw_handler_name); + m->hw_handler_name = NULL; + return ret; +} + +static int parse_features(struct dm_arg_set *as, struct multipath *m) +{ + int r; + unsigned int argc; + struct dm_target *ti = m->ti; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, 8, "invalid number of feature args"}, + {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "queue_if_no_path")) { + r = queue_if_no_path(m, true, false, __func__); + continue; + } + + if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + continue; + } + + if (!strcasecmp(arg_name, "pg_init_retries") && + (argc >= 1)) { + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); + argc--; + continue; + } + + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && + (argc >= 1)) { + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + + if (!strcasecmp(arg_name, "queue_mode") && + (argc >= 1)) { + const char *queue_mode_name = dm_shift_arg(as); + + if (!strcasecmp(queue_mode_name, "bio")) + m->queue_mode = DM_TYPE_BIO_BASED; + else if (!strcasecmp(queue_mode_name, "rq") || + !strcasecmp(queue_mode_name, "mq")) + m->queue_mode = DM_TYPE_REQUEST_BASED; + else { + ti->error = "Unknown 'queue_mode' requested"; + r = -EINVAL; + } + argc--; + continue; + } + + ti->error = "Unrecognised multipath feature request"; + r = -EINVAL; + } while (argc && !r); + + return r; +} + +static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + /* target arguments */ + static const struct dm_arg _args[] = { + {0, 1024, "invalid number of priority groups"}, + {0, 1024, "invalid initial priority group number"}, + }; + + int r; + struct multipath *m; + struct dm_arg_set as; + unsigned int pg_count = 0; + unsigned int next_pg_num; + unsigned long flags; + + as.argc = argc; + as.argv = argv; + + m = alloc_multipath(ti); + if (!m) { + ti->error = "can't allocate multipath"; + return -EINVAL; + } + + r = parse_features(&as, m); + if (r) + goto bad; + + r = alloc_multipath_stage2(ti, m); + if (r) + goto bad; + + r = parse_hw_handler(&as, m); + if (r) + goto bad; + + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); + if (r) + goto bad; + + if ((!m->nr_priority_groups && next_pg_num) || + (m->nr_priority_groups && !next_pg_num)) { + ti->error = "invalid initial priority group"; + r = -EINVAL; + goto bad; + } + + /* parse the priority groups */ + while (as.argc) { + struct priority_group *pg; + unsigned int nr_valid_paths = atomic_read(&m->nr_valid_paths); + + pg = parse_priority_group(&as, m); + if (IS_ERR(pg)) { + r = PTR_ERR(pg); + goto bad; + } + + nr_valid_paths += pg->nr_pgpaths; + atomic_set(&m->nr_valid_paths, nr_valid_paths); + + list_add_tail(&pg->list, &m->priority_groups); + pg_count++; + pg->pg_num = pg_count; + if (!--next_pg_num) + m->next_pg = pg; + } + + if (pg_count != m->nr_priority_groups) { + ti->error = "priority group count mismatch"; + r = -EINVAL; + goto bad; + } + + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_zeroes_bios = 1; + if (m->queue_mode == DM_TYPE_BIO_BASED) + ti->per_io_data_size = multipath_per_bio_data_size(); + else + ti->per_io_data_size = sizeof(struct dm_mpath_io); + + return 0; + + bad: + free_multipath(m); + return r; +} + +static void multipath_wait_for_pg_init_completion(struct multipath *m) +{ + DEFINE_WAIT(wait); + + while (1) { + prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!atomic_read(&m->pg_init_in_progress)) + break; + + io_schedule(); + } + finish_wait(&m->pg_init_wait, &wait); +} + +static void flush_multipath_work(struct multipath *m) +{ + if (m->hw_handler_name) { + unsigned long flags; + + if (!atomic_read(&m->pg_init_in_progress)) + goto skip; + + spin_lock_irqsave(&m->lock, flags); + if (atomic_read(&m->pg_init_in_progress) && + !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { + spin_unlock_irqrestore(&m->lock, flags); + + flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); + + spin_lock_irqsave(&m->lock, flags); + clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + } + spin_unlock_irqrestore(&m->lock, flags); + } +skip: + if (m->queue_mode == DM_TYPE_BIO_BASED) + flush_work(&m->process_queued_bios); + flush_work(&m->trigger_event); +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + disable_nopath_timeout(m); + flush_multipath_work(m); + free_multipath(m); +} + +/* + * Take a path out of use. + */ +static int fail_path(struct pgpath *pgpath) +{ + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (!pgpath->is_active) + goto out; + + DMWARN("%s: Failing path %s.", + dm_table_device_name(m->ti->table), + pgpath->path.dev->name); + + pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); + pgpath->is_active = false; + pgpath->fail_count++; + + atomic_dec(&m->nr_valid_paths); + + if (pgpath == m->current_pgpath) + m->current_pgpath = NULL; + + dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, + pgpath->path.dev->name, atomic_read(&m->nr_valid_paths)); + + schedule_work(&m->trigger_event); + + enable_nopath_timeout(m); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/* + * Reinstate a previously-failed path + */ +static int reinstate_path(struct pgpath *pgpath) +{ + int r = 0, run_queue = 0; + unsigned long flags; + struct multipath *m = pgpath->pg->m; + unsigned int nr_valid_paths; + + spin_lock_irqsave(&m->lock, flags); + + if (pgpath->is_active) + goto out; + + DMWARN("%s: Reinstating path %s.", + dm_table_device_name(m->ti->table), + pgpath->path.dev->name); + + r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); + if (r) + goto out; + + pgpath->is_active = true; + + nr_valid_paths = atomic_inc_return(&m->nr_valid_paths); + if (nr_valid_paths == 1) { + m->current_pgpath = NULL; + run_queue = 1; + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) + atomic_inc(&m->pg_init_in_progress); + } + + dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, + pgpath->path.dev->name, nr_valid_paths); + + schedule_work(&m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + if (run_queue) { + dm_table_run_md_queue_async(m->ti->table); + process_queued_io_list(m); + } + + if (pgpath->is_active) + disable_nopath_timeout(m); + + return r; +} + +/* + * Fail or reinstate all paths that match the provided struct dm_dev. + */ +static int action_dev(struct multipath *m, struct dm_dev *dev, + action_fn action) +{ + int r = -EINVAL; + struct pgpath *pgpath; + struct priority_group *pg; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pgpath->path.dev == dev) + r = action(pgpath); + } + } + + return r; +} + +/* + * Temporarily try to avoid having to use the specified PG + */ +static void bypass_pg(struct multipath *m, struct priority_group *pg, + bool bypassed) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + pg->bypassed = bypassed; + m->current_pgpath = NULL; + m->current_pg = NULL; + + spin_unlock_irqrestore(&m->lock, flags); + + schedule_work(&m->trigger_event); +} + +/* + * Switch to using the specified PG from the next I/O that gets mapped + */ +static int switch_pg_num(struct multipath *m, const char *pgstr) +{ + struct priority_group *pg; + unsigned int pgnum; + unsigned long flags; + char dummy; + + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to switch_pg_num"); + return -EINVAL; + } + + spin_lock_irqsave(&m->lock, flags); + list_for_each_entry(pg, &m->priority_groups, list) { + pg->bypassed = false; + if (--pgnum) + continue; + + m->current_pgpath = NULL; + m->current_pg = NULL; + m->next_pg = pg; + } + spin_unlock_irqrestore(&m->lock, flags); + + schedule_work(&m->trigger_event); + return 0; +} + +/* + * Set/clear bypassed status of a PG. + * PGs are numbered upwards from 1 in the order they were declared. + */ +static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) +{ + struct priority_group *pg; + unsigned int pgnum; + char dummy; + + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to bypass_pg"); + return -EINVAL; + } + + list_for_each_entry(pg, &m->priority_groups, list) { + if (!--pgnum) + break; + } + + bypass_pg(m, pg, bypassed); + return 0; +} + +/* + * Should we retry pg_init immediately? + */ +static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) +{ + unsigned long flags; + bool limit_reached = false; + + spin_lock_irqsave(&m->lock, flags); + + if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && + !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + else + limit_reached = true; + + spin_unlock_irqrestore(&m->lock, flags); + + return limit_reached; +} + +static void pg_init_done(void *data, int errors) +{ + struct pgpath *pgpath = data; + struct priority_group *pg = pgpath->pg; + struct multipath *m = pg->m; + unsigned long flags; + bool delay_retry = false; + + /* device or driver problems */ + switch (errors) { + case SCSI_DH_OK: + break; + case SCSI_DH_NOSYS: + if (!m->hw_handler_name) { + errors = 0; + break; + } + DMERR("Could not failover the device: Handler scsi_dh_%s " + "Error %d.", m->hw_handler_name, errors); + /* + * Fail path for now, so we do not ping pong + */ + fail_path(pgpath); + break; + case SCSI_DH_DEV_TEMP_BUSY: + /* + * Probably doing something like FW upgrade on the + * controller so try the other pg. + */ + bypass_pg(m, pg, true); + break; + case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = true; + fallthrough; + case SCSI_DH_IMM_RETRY: + case SCSI_DH_RES_TEMP_UNAVAIL: + if (pg_init_limit_reached(m, pgpath)) + fail_path(pgpath); + errors = 0; + break; + case SCSI_DH_DEV_OFFLINED: + default: + /* + * We probably do not want to fail the path for a device + * error, but this is what the old dm did. In future + * patches we can do more advanced handling. + */ + fail_path(pgpath); + } + + spin_lock_irqsave(&m->lock, flags); + if (errors) { + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } + } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + pg->bypassed = false; + + if (atomic_dec_return(&m->pg_init_in_progress) > 0) + /* Activations of other paths are still on going */ + goto out; + + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + if (delay_retry) + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + else + clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + + if (__pg_init_all_paths(m)) + goto out; + } + clear_bit(MPATHF_QUEUE_IO, &m->flags); + + process_queued_io_list(m); + + /* + * Wake up any thread waiting to suspend. + */ + wake_up(&m->pg_init_wait); + +out: + spin_unlock_irqrestore(&m->lock, flags); +} + +static void activate_or_offline_path(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + if (pgpath->is_active && !blk_queue_dying(q)) + scsi_dh_activate(q, pg_init_done, pgpath); + else + pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); +} + +static void activate_path_work(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); + + activate_or_offline_path(pgpath); +} + +static int multipath_end_io(struct dm_target *ti, struct request *clone, + blk_status_t error, union map_info *map_context) +{ + struct dm_mpath_io *mpio = get_mpio(map_context); + struct pgpath *pgpath = mpio->pgpath; + int r = DM_ENDIO_DONE; + + /* + * We don't queue any clone request inside the multipath target + * during end I/O handling, since those clone requests don't have + * bio clones. If we queue them inside the multipath target, + * we need to make bio clones, that requires memory allocation. + * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests + * don't have bio clones.) + * Instead of queueing the clone request here, we queue the original + * request into dm core, which will remake a clone request and + * clone bios for it and resubmit it later. + */ + if (error && blk_path_error(error)) { + struct multipath *m = ti->private; + + if (error == BLK_STS_RESOURCE) + r = DM_ENDIO_DELAY_REQUEUE; + else + r = DM_ENDIO_REQUEUE; + + if (pgpath) + fail_path(pgpath); + + if (!atomic_read(&m->nr_valid_paths) && + !must_push_back_rq(m)) { + if (error == BLK_STS_IOERR) + dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; + } + } + + if (pgpath) { + struct path_selector *ps = &pgpath->pg->ps; + + if (ps->type->end_io) + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + clone->io_start_time_ns); + } + + return r; +} + +static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, + blk_status_t *error) +{ + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = get_mpio_from_bio(clone); + struct pgpath *pgpath = mpio->pgpath; + unsigned long flags; + int r = DM_ENDIO_DONE; + + if (!*error || !blk_path_error(*error)) + goto done; + + if (pgpath) + fail_path(pgpath); + + if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (__must_push_back(m)) { + r = DM_ENDIO_REQUEUE; + } else { + dm_report_EIO(m); + *error = BLK_STS_IOERR; + } + spin_unlock_irqrestore(&m->lock, flags); + goto done; + } + spin_unlock_irqrestore(&m->lock, flags); + } + + multipath_queue_bio(m, clone); + r = DM_ENDIO_INCOMPLETE; +done: + if (pgpath) { + struct path_selector *ps = &pgpath->pg->ps; + + if (ps->type->end_io) + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + (mpio->start_time_ns ?: + dm_start_time_ns_from_clone(clone))); + } + + return r; +} + +/* + * Suspend with flush can't complete until all the I/O is processed + * so if the last path fails we must error any remaining I/O. + * - Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + * Otherwise, during noflush suspend, queue_if_no_path will not change. + */ +static void multipath_presuspend(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ + if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) + queue_if_no_path(m, false, true, __func__); +} + +static void multipath_postsuspend(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + mutex_lock(&m->work_mutex); + flush_multipath_work(m); + mutex_unlock(&m->work_mutex); +} + +/* + * Restore the queue_if_no_path setting. + */ +static void multipath_resume(struct dm_target *ti) +{ + struct multipath *m = ti->private; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } + + DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d", + dm_table_device_name(m->ti->table), __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); + + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * Info output has the following format: + * num_multipath_feature_args [multipath_feature_args]* + * num_handler_status_args [handler_status_args]* + * num_groups init_group_number + * [A|D|E num_ps_status_args [ps_status_args]* + * num_paths num_selector_args + * [path_dev A|F fail_count [selector_args]* ]+ ]+ + * + * Table output has the following format (identical to the constructor string): + * num_feature_args [features_args]* + * num_handler_args hw_handler [hw_handler_args]* + * num_groups init_group_number + * [priority selector-name num_ps_args [ps_args]* + * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ + */ +static void multipath_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + int sz = 0, pg_counter, pgpath_counter; + unsigned long flags; + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + unsigned int pg_num; + char state; + + spin_lock_irqsave(&m->lock, flags); + + /* Features */ + if (type == STATUSTYPE_INFO) + DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), + atomic_read(&m->pg_init_count)); + else { + DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + + test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + + (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); + + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + DMEMIT("queue_if_no_path "); + if (m->pg_init_retries) + DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) + DMEMIT("retain_attached_hw_handler "); + if (m->queue_mode != DM_TYPE_REQUEST_BASED) { + switch(m->queue_mode) { + case DM_TYPE_BIO_BASED: + DMEMIT("queue_mode bio "); + break; + default: + WARN_ON_ONCE(true); + break; + } + } + } + + if (!m->hw_handler_name || type == STATUSTYPE_INFO) + DMEMIT("0 "); + else + DMEMIT("1 %s ", m->hw_handler_name); + + DMEMIT("%u ", m->nr_priority_groups); + + if (m->next_pg) + pg_num = m->next_pg->pg_num; + else if (m->current_pg) + pg_num = m->current_pg->pg_num; + else + pg_num = (m->nr_priority_groups ? 1 : 0); + + DMEMIT("%u ", pg_num); + + switch (type) { + case STATUSTYPE_INFO: + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed) + state = 'D'; /* Disabled */ + else if (pg == m->current_pg) + state = 'A'; /* Currently Active */ + else + state = 'E'; /* Enabled */ + + DMEMIT("%c ", state); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->info_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s %s %u ", p->path.dev->name, + p->is_active ? "A" : "F", + p->fail_count); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + + case STATUSTYPE_TABLE: + list_for_each_entry(pg, &m->priority_groups, list) { + DMEMIT("%s ", pg->ps.type->name); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->table_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s ", p->path.dev->name); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + + case STATUSTYPE_IMA: + sz = 0; /*reset the result pointer*/ + + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",nr_priority_groups=%u", m->nr_priority_groups); + + pg_counter = 0; + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed) + state = 'D'; /* Disabled */ + else if (pg == m->current_pg) + state = 'A'; /* Currently Active */ + else + state = 'E'; /* Enabled */ + DMEMIT(",pg_state_%d=%c", pg_counter, state); + DMEMIT(",nr_pgpaths_%d=%u", pg_counter, pg->nr_pgpaths); + DMEMIT(",path_selector_name_%d=%s", pg_counter, pg->ps.type->name); + + pgpath_counter = 0; + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT(",path_name_%d_%d=%s,is_active_%d_%d=%c,fail_count_%d_%d=%u", + pg_counter, pgpath_counter, p->path.dev->name, + pg_counter, pgpath_counter, p->is_active ? 'A' : 'F', + pg_counter, pgpath_counter, p->fail_count); + if (pg->ps.type->status) { + DMEMIT(",path_selector_status_%d_%d=", + pg_counter, pgpath_counter); + sz += pg->ps.type->status(&pg->ps, &p->path, + type, result + sz, + maxlen - sz); + } + pgpath_counter++; + } + pg_counter++; + } + DMEMIT(";"); + break; + } + + spin_unlock_irqrestore(&m->lock, flags); +} + +static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r = -EINVAL; + struct dm_dev *dev; + struct multipath *m = ti->private; + action_fn action; + unsigned long flags; + + mutex_lock(&m->work_mutex); + + if (dm_suspended(ti)) { + r = -EBUSY; + goto out; + } + + if (argc == 1) { + if (!strcasecmp(argv[0], "queue_if_no_path")) { + r = queue_if_no_path(m, true, false, __func__); + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); + goto out; + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { + r = queue_if_no_path(m, false, false, __func__); + disable_nopath_timeout(m); + goto out; + } + } + + if (argc != 2) { + DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); + goto out; + } + + if (!strcasecmp(argv[0], "disable_group")) { + r = bypass_pg_num(m, argv[1], true); + goto out; + } else if (!strcasecmp(argv[0], "enable_group")) { + r = bypass_pg_num(m, argv[1], false); + goto out; + } else if (!strcasecmp(argv[0], "switch_group")) { + r = switch_pg_num(m, argv[1]); + goto out; + } else if (!strcasecmp(argv[0], "reinstate_path")) + action = reinstate_path; + else if (!strcasecmp(argv[0], "fail_path")) + action = fail_path; + else { + DMWARN("Unrecognised multipath message received: %s", argv[0]); + goto out; + } + + r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + if (r) { + DMWARN("message: error getting device %s", + argv[1]); + goto out; + } + + r = action_dev(m, dev, action); + + dm_put_device(ti, dev); + +out: + mutex_unlock(&m->work_mutex); + return r; +} + +static int multipath_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev) +{ + struct multipath *m = ti->private; + struct pgpath *pgpath; + unsigned long flags; + int r; + + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, 0); + + if (pgpath) { + if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) { + *bdev = pgpath->path.dev->bdev; + r = 0; + } else { + /* pg_init has not started or completed */ + r = -ENOTCONN; + } + } else { + /* No path is available */ + r = -EIO; + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + r = -ENOTCONN; + spin_unlock_irqrestore(&m->lock, flags); + } + + if (r == -ENOTCONN) { + if (!READ_ONCE(m->current_pg)) { + /* Path status changed, redo selection */ + (void) choose_pgpath(m, 0); + } + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + (void) __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); + dm_table_run_md_queue_async(m->ti->table); + process_queued_io_list(m); + } + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (!r && ti->len != bdev_nr_sectors((*bdev))) + return 1; + return r; +} + +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, ti->len, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + +static int pgpath_busy(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + return blk_lld_busy(q); +} + +/* + * We return "busy", only when we can map I/Os but underlying devices + * are busy (so even if we map I/Os now, the I/Os will wait on + * the underlying queue). + * In other words, if we want to kill I/Os or queue them inside us + * due to map unavailability, we don't return "busy". Otherwise, + * dm core won't give us the I/Os and we can't do what we want. + */ +static int multipath_busy(struct dm_target *ti) +{ + bool busy = false, has_active = false; + struct multipath *m = ti->private; + struct priority_group *pg, *next_pg; + struct pgpath *pgpath; + + /* pg_init in progress */ + if (atomic_read(&m->pg_init_in_progress)) + return true; + + /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + spin_unlock_irqrestore(&m->lock, flags); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); + } + spin_unlock_irqrestore(&m->lock, flags); + } + + /* Guess which priority_group will be used at next mapping time */ + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) + pg = next_pg; + + if (!pg) { + /* + * We don't know which pg will be used at next mapping time. + * We don't call choose_pgpath() here to avoid to trigger + * pg_init just by busy checking. + * So we don't know whether underlying devices we will be using + * at next mapping time are busy or not. Just try mapping. + */ + return busy; + } + + /* + * If there is one non-busy active path at least, the path selector + * will be able to select it. So we consider such a pg as not busy. + */ + busy = true; + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pgpath->is_active) { + has_active = true; + if (!pgpath_busy(pgpath)) { + busy = false; + break; + } + } + } + + if (!has_active) { + /* + * No active path in this pg, so this pg won't be used and + * the current_pg will be changed at next mapping time. + * We need to try mapping to determine it. + */ + busy = false; + } + + return busy; +} + +/*----------------------------------------------------------------- + * Module setup + *---------------------------------------------------------------*/ +static struct target_type multipath_target = { + .name = "multipath", + .version = {1, 14, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | + DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = multipath_ctr, + .dtr = multipath_dtr, + .clone_and_map_rq = multipath_clone_and_map, + .release_clone_rq = multipath_release_clone, + .rq_end_io = multipath_end_io, + .map = multipath_map_bio, + .end_io = multipath_end_io_bio, + .presuspend = multipath_presuspend, + .postsuspend = multipath_postsuspend, + .resume = multipath_resume, + .status = multipath_status, + .message = multipath_message, + .prepare_ioctl = multipath_prepare_ioctl, + .iterate_devices = multipath_iterate_devices, + .busy = multipath_busy, +}; + +static int __init dm_multipath_init(void) +{ + int r; + + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); + if (!kmultipathd) { + DMERR("failed to create workqueue kmpathd"); + r = -ENOMEM; + goto bad_alloc_kmultipathd; + } + + /* + * A separate workqueue is used to handle the device handlers + * to avoid overloading existing workqueue. Overloading the + * old workqueue would also create a bottleneck in the + * path of the storage hardware device activation. + */ + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); + if (!kmpath_handlerd) { + DMERR("failed to create workqueue kmpath_handlerd"); + r = -ENOMEM; + goto bad_alloc_kmpath_handlerd; + } + + r = dm_register_target(&multipath_target); + if (r < 0) { + DMERR("request-based register failed %d", r); + r = -EINVAL; + goto bad_register_target; + } + + return 0; + +bad_register_target: + destroy_workqueue(kmpath_handlerd); +bad_alloc_kmpath_handlerd: + destroy_workqueue(kmultipathd); +bad_alloc_kmultipathd: + return r; +} + +static void __exit dm_multipath_exit(void) +{ + destroy_workqueue(kmpath_handlerd); + destroy_workqueue(kmultipathd); + + dm_unregister_target(&multipath_target); +} + +module_init(dm_multipath_init); +module_exit(dm_multipath_exit); + +module_param_named(queue_if_no_path_timeout_secs, + queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds"); + +MODULE_DESCRIPTION(DM_NAME " multipath target"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h new file mode 100644 index 000000000..5343698fe --- /dev/null +++ b/drivers/md/dm-mpath.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath. + */ + +#ifndef DM_MPATH_H +#define DM_MPATH_H + +struct dm_dev; + +struct dm_path { + struct dm_dev *dev; /* Read-only */ + void *pscontext; /* For path-selector use */ +}; + +/* Callback for hwh_pg_init_fn to use when complete */ +void dm_pg_init_complete(struct dm_path *path, unsigned int err_flags); + +#endif diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c new file mode 100644 index 000000000..fa0ccc585 --- /dev/null +++ b/drivers/md/dm-path-selector.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path selector registration. + */ + +#include +#include + +#include "dm-path-selector.h" + +#include + +struct ps_internal { + struct path_selector_type pst; + struct list_head list; +}; + +#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) + +static LIST_HEAD(_path_selectors); +static DECLARE_RWSEM(_ps_lock); + +static struct ps_internal *__find_path_selector_type(const char *name) +{ + struct ps_internal *psi; + + list_for_each_entry(psi, &_path_selectors, list) { + if (!strcmp(name, psi->pst.name)) + return psi; + } + + return NULL; +} + +static struct ps_internal *get_path_selector(const char *name) +{ + struct ps_internal *psi; + + down_read(&_ps_lock); + psi = __find_path_selector_type(name); + if (psi && !try_module_get(psi->pst.module)) + psi = NULL; + up_read(&_ps_lock); + + return psi; +} + +struct path_selector_type *dm_get_path_selector(const char *name) +{ + struct ps_internal *psi; + + if (!name) + return NULL; + + psi = get_path_selector(name); + if (!psi) { + request_module("dm-%s", name); + psi = get_path_selector(name); + } + + return psi ? &psi->pst : NULL; +} + +void dm_put_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + if (!pst) + return; + + down_read(&_ps_lock); + psi = __find_path_selector_type(pst->name); + if (!psi) + goto out; + + module_put(psi->pst.module); +out: + up_read(&_ps_lock); +} + +static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL); + + if (psi) + psi->pst = *pst; + + return psi; +} + +int dm_register_path_selector(struct path_selector_type *pst) +{ + int r = 0; + struct ps_internal *psi = _alloc_path_selector(pst); + + if (!psi) + return -ENOMEM; + + down_write(&_ps_lock); + + if (__find_path_selector_type(pst->name)) { + kfree(psi); + r = -EEXIST; + } else + list_add(&psi->list, &_path_selectors); + + up_write(&_ps_lock); + + return r; +} + +int dm_unregister_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + down_write(&_ps_lock); + + psi = __find_path_selector_type(pst->name); + if (!psi) { + up_write(&_ps_lock); + return -EINVAL; + } + + list_del(&psi->list); + + up_write(&_ps_lock); + + kfree(psi); + + return 0; +} + +EXPORT_SYMBOL_GPL(dm_register_path_selector); +EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h new file mode 100644 index 000000000..0f2b37af8 --- /dev/null +++ b/drivers/md/dm-path-selector.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path-Selector registration. + */ + +#ifndef DM_PATH_SELECTOR_H +#define DM_PATH_SELECTOR_H + +#include + +#include "dm-mpath.h" + +/* + * We provide an abstraction for the code that chooses which path + * to send some io down. + */ +struct path_selector_type; +struct path_selector { + struct path_selector_type *type; + void *context; +}; + +/* + * If a path selector uses this flag, a high resolution timer is used + * (via ktime_get_ns) to account for IO start time in BIO-based mpath. + * This improves performance of some path selectors (i.e. HST), in + * exchange for slightly higher overhead when submitting the BIO. + * The extra cost is usually offset by improved path selection for + * some benchmarks. + * + * This has no effect for request-based mpath, since it already uses a + * higher precision timer by default. + */ +#define DM_PS_USE_HR_TIMER 0x00000001 +#define dm_ps_use_hr_timer(type) ((type)->features & DM_PS_USE_HR_TIMER) + +/* Information about a path selector type */ +struct path_selector_type { + char *name; + struct module *module; + + unsigned int features; + unsigned int table_args; + unsigned int info_args; + + /* + * Constructs a path selector object, takes custom arguments + */ + int (*create) (struct path_selector *ps, unsigned int argc, char **argv); + void (*destroy) (struct path_selector *ps); + + /* + * Add an opaque path object, along with some selector specific + * path args (eg, path priority). + */ + int (*add_path) (struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error); + + /* + * Chooses a path for this io, if no paths are available then + * NULL will be returned. + */ + struct dm_path *(*select_path) (struct path_selector *ps, + size_t nr_bytes); + + /* + * Notify the selector that a path has failed. + */ + void (*fail_path) (struct path_selector *ps, struct dm_path *p); + + /* + * Ask selector to reinstate a path. + */ + int (*reinstate_path) (struct path_selector *ps, struct dm_path *p); + + /* + * Table content based on parameters added in ps_add_path_fn + * or path selector status + */ + int (*status) (struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen); + + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time); +}; + +/* Register a path selector */ +int dm_register_path_selector(struct path_selector_type *type); + +/* Unregister a path selector */ +int dm_unregister_path_selector(struct path_selector_type *type); + +/* Returns a registered path selector type */ +struct path_selector_type *dm_get_path_selector(const char *name); + +/* Releases a path selector */ +void dm_put_path_selector(struct path_selector_type *pst); + +#endif diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c new file mode 100644 index 000000000..1d82c95d3 --- /dev/null +++ b/drivers/md/dm-ps-historical-service-time.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Historical Service Time + * + * Keeps a time-weighted exponential moving average of the historical + * service time. Estimates future service time based on the historical + * service time and the number of outstanding requests. + * + * Marks paths stale if they have not finished within hst * + * num_paths. If a path is stale and unused, we will send a single + * request to probe in case the path has improved. This situation + * generally arises if the path is so much worse than others that it + * will never have the best estimated service time, or if the entire + * multipath device is unused. If a path is stale and in use, limit the + * number of requests it can receive with the assumption that the path + * has become degraded. + * + * To avoid repeatedly calculating exponents for time weighting, times + * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) + * ns, and the weighting is pre-calculated. + * + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include +#include + + +#define DM_MSG_PREFIX "multipath historical-service-time" +#define HST_MIN_IO 1 +#define HST_VERSION "0.1.1" + +#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ +#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) +#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) +#define HST_FIXED_95 972 +#define HST_MAX_INFLIGHT HST_FIXED_1 +#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ +#define HST_WEIGHT_COUNT 64ULL + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + int valid_count; + spinlock_t lock; + + unsigned int weights[HST_WEIGHT_COUNT]; + unsigned int threshold_multiplier; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + + spinlock_t lock; + + u64 historical_service_time; /* Fixed point */ + + u64 stale_after; + u64 last_finish; + + u64 outstanding; +}; + +/** + * fixed_power - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + * + * (see: kernel/sched/loadavg.c) + */ +static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * Calculate the next value of an exponential moving average + * a_1 = a_0 * e + a * (1 - e) + * + * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @weight: [0, HST_FIXED_1] + * + * Note: + * To account for multiple periods in the same calculation, + * a_n = a_0 * e^n + a * (1 - e^n), + * so call fixed_ema(last, next, pow(weight, N)) + */ +static u64 fixed_ema(u64 last, u64 next, u64 weight) +{ + last *= weight; + last += next * (HST_FIXED_1 - weight); + last += 1ULL << (HST_FIXED_SHIFT - 1); + return last >> HST_FIXED_SHIFT; +} + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + s->valid_count = 0; + } + + return s; +} + +/* + * Get the weight for a given time span. + */ +static u64 hst_weight(struct path_selector *ps, u64 delta) +{ + struct selector *s = ps->context; + int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, + HST_WEIGHT_COUNT - 1); + + return s->weights[bucket]; +} + +/* + * Set up the weights array. + * + * weights[len-1] = 0 + * weights[n] = base ^ (n + 1) + */ +static void hst_set_weights(struct path_selector *ps, unsigned int base) +{ + struct selector *s = ps->context; + int i; + + if (base >= HST_FIXED_1) + return; + + for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) + s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); + s->weights[HST_WEIGHT_COUNT - 1] = 0; +} + +static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + unsigned int base_weight = HST_FIXED_95; + unsigned int threshold_multiplier = 0; + char dummy; + + /* + * Arguments: [ []] + * : Base weight for ema [0, 1024) 10-bit fixed point. A + * value of 0 will completely ignore any history. + * If not given, default (HST_FIXED_95) is used. + * : Minimum threshold multiplier for paths to + * be considered different. That is, a path is + * considered different iff (p1 > N * p2) where p1 + * is the path with higher service time. A threshold + * of 1 or 0 has no effect. Defaults to 0. + */ + if (argc > 2) + return -EINVAL; + + if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || + base_weight >= HST_FIXED_1)) { + return -EINVAL; + } + + if (argc > 1 && (sscanf(argv[1], "%u%c", + &threshold_multiplier, &dummy) != 1)) { + return -EINVAL; + } + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + + hst_set_weights(ps, base_weight); + s->threshold_multiplier = threshold_multiplier; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void hst_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int hst_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + if (!path) { + struct selector *s = ps->context; + + DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); + } else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu %llu ", pi->historical_service_time, + pi->outstanding, pi->stale_after); + break; + case STATUSTYPE_TABLE: + DMEMIT("0 "); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + } + + return sz; +} + +static int hst_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = HST_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [] + * : The number of I/Os before switching path. + * If not given, default (HST_MIN_IO) is used. + */ + if (argc > 1) { + *error = "historical-service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "historical-service-time ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "historical-service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + pi->historical_service_time = HST_FIXED_1; + + spin_lock_init(&pi->lock); + pi->outstanding = 0; + + pi->stale_after = 0; + pi->last_finish = 0; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + s->valid_count--; + spin_unlock_irqrestore(&s->lock, flags); +} + +static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fill_compare(struct path_info *pi, u64 *hst, + u64 *out, u64 *stale) +{ + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + *hst = pi->historical_service_time; + *out = pi->outstanding; + *stale = pi->stale_after; + spin_unlock_irqrestore(&pi->lock, flags); +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + */ +static long long hst_compare(struct path_info *pi1, struct path_info *pi2, + u64 time_now, struct path_selector *ps) +{ + struct selector *s = ps->context; + u64 hst1, hst2; + long long out1, out2, stale1, stale2; + int pi2_better, over_threshold; + + hst_fill_compare(pi1, &hst1, &out1, &stale1); + hst_fill_compare(pi2, &hst2, &out2, &stale2); + + /* Check here if estimated latency for two paths are too similar. + * If this is the case, we skip extra calculation and just compare + * outstanding requests. In this case, any unloaded paths will + * be preferred. + */ + if (hst1 > hst2) + over_threshold = hst1 > (s->threshold_multiplier * hst2); + else + over_threshold = hst2 > (s->threshold_multiplier * hst1); + + if (!over_threshold) + return out1 - out2; + + /* + * If an unloaded path is stale, choose it. If both paths are unloaded, + * choose path that is the most stale. + * (If one path is loaded, choose the other) + */ + if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || + (!out1 && !out2)) + return (!out2 * stale1) - (!out1 * stale2); + + /* Compare estimated service time. If outstanding is the same, we + * don't need to multiply + */ + if (out1 == out2) { + pi2_better = hst1 > hst2; + } else { + /* Potential overflow with out >= 1024 */ + if (unlikely(out1 >= HST_MAX_INFLIGHT || + out2 >= HST_MAX_INFLIGHT)) { + /* If over 1023 in-flights, we may overflow if hst + * is at max. (With this shift we still overflow at + * 1048576 in-flights, which is high enough). + */ + hst1 >>= HST_FIXED_SHIFT; + hst2 >>= HST_FIXED_SHIFT; + } + pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; + } + + /* In the case that the 'winner' is stale, limit to equal usage. */ + if (pi2_better) { + if (stale2 < time_now) + return out1 - out2; + return 1; + } + if (stale1 < time_now) + return out1 - out2; + return -1; +} + +static struct dm_path *hst_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + u64 time_now = ktime_get_ns(); + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || (hst_compare(pi, best, time_now, ps) < 0)) + best = pi; + } + + if (!best) + goto out; + + /* Move last used path to end (least preferred in case of ties) */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; + +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int hst_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + pi->outstanding++; + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static u64 path_service_time(struct path_info *pi, u64 start_time) +{ + u64 now = ktime_get_ns(); + + /* if a previous disk request has finished after this IO was + * sent to the hardware, pretend the submission happened + * serially. + */ + if (time_after64(pi->last_finish, start_time)) + start_time = pi->last_finish; + + pi->last_finish = now; + if (time_before64(now, start_time)) + return 0; + + return now - start_time; +} + +static int hst_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + struct selector *s = ps->context; + unsigned long flags; + u64 st; + + spin_lock_irqsave(&pi->lock, flags); + + st = path_service_time(pi, start_time); + pi->outstanding--; + pi->historical_service_time = + fixed_ema(pi->historical_service_time, + min(st * HST_FIXED_1, HST_FIXED_MAX), + hst_weight(ps, st)); + + /* + * On request end, mark path as fresh. If a path hasn't + * finished any requests within the fresh period, the estimated + * service time is considered too optimistic and we limit the + * maximum requests on that path. + */ + pi->stale_after = pi->last_finish + + (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); + + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static struct path_selector_type hst_ps = { + .name = "historical-service-time", + .module = THIS_MODULE, + .features = DM_PS_USE_HR_TIMER, + .table_args = 1, + .info_args = 3, + .create = hst_create, + .destroy = hst_destroy, + .status = hst_status, + .add_path = hst_add_path, + .fail_path = hst_fail_path, + .reinstate_path = hst_reinstate_path, + .select_path = hst_select_path, + .start_io = hst_start_io, + .end_io = hst_end_io, +}; + +static int __init dm_hst_init(void) +{ + int r = dm_register_path_selector(&hst_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " HST_VERSION " loaded"); + + return r; +} + +static void __exit dm_hst_exit(void) +{ + int r = dm_unregister_path_selector(&hst_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_hst_init); +module_exit(dm_hst_exit); + +MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); +MODULE_AUTHOR("Khazhismel Kumykov "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c new file mode 100644 index 000000000..76ce4ce87 --- /dev/null +++ b/drivers/md/dm-ps-io-affinity.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Oracle Corporation + * + * Module Author: Mike Christie + */ +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath io-affinity" + +struct path_info { + struct dm_path *path; + cpumask_var_t cpumask; + refcount_t refcount; + bool failed; +}; + +struct selector { + struct path_info **path_map; + cpumask_var_t path_mask; + atomic_t map_misses; +}; + +static void ioa_free_path(struct selector *s, unsigned int cpu) +{ + struct path_info *pi = s->path_map[cpu]; + + if (!pi) + return; + + if (refcount_dec_and_test(&pi->refcount)) { + cpumask_clear_cpu(cpu, s->path_mask); + free_cpumask_var(pi->cpumask); + kfree(pi); + + s->path_map[cpu] = NULL; + } +} + +static int ioa_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL; + unsigned int cpu; + int ret; + + if (argc != 1) { + *error = "io-affinity ps: invalid number of arguments"; + return -EINVAL; + } + + pi = kzalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "io-affinity ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + path->pscontext = pi; + refcount_set(&pi->refcount, 1); + + if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) { + *error = "io-affinity ps: Error allocating cpumask context"; + ret = -ENOMEM; + goto free_pi; + } + + ret = cpumask_parse(argv[0], pi->cpumask); + if (ret) { + *error = "io-affinity ps: invalid cpumask"; + ret = -EINVAL; + goto free_mask; + } + + for_each_cpu(cpu, pi->cpumask) { + if (cpu >= nr_cpu_ids) { + DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u", + cpu, nr_cpu_ids); + break; + } + + if (s->path_map[cpu]) { + DMWARN("CPU mapping for %u exists. Ignoring.", cpu); + continue; + } + + cpumask_set_cpu(cpu, s->path_mask); + s->path_map[cpu] = pi; + refcount_inc(&pi->refcount); + } + + if (refcount_dec_and_test(&pi->refcount)) { + *error = "io-affinity ps: No new/valid CPU mapping found"; + ret = -EINVAL; + goto free_mask; + } + + return 0; + +free_mask: + free_cpumask_var(pi->cpumask); +free_pi: + kfree(pi); + return ret; +} + +static int ioa_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), + GFP_KERNEL); + if (!s->path_map) + goto free_selector; + + if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL)) + goto free_map; + + atomic_set(&s->map_misses, 0); + ps->context = s; + return 0; + +free_map: + kfree(s->path_map); +free_selector: + kfree(s); + return -ENOMEM; +} + +static void ioa_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + unsigned int cpu; + + for_each_cpu(cpu, s->path_mask) + ioa_free_path(s, cpu); + + free_cpumask_var(s->path_mask); + kfree(s->path_map); + kfree(s); + + ps->context = NULL; +} + +static int ioa_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct selector *s = ps->context; + struct path_info *pi; + int sz = 0; + + if (!path) { + DMEMIT("0 "); + return sz; + } + + switch(type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&s->map_misses)); + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask)); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return sz; +} + +static void ioa_fail_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = true; +} + +static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = false; + return 0; +} + +static struct dm_path *ioa_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + unsigned int cpu, node; + struct selector *s = ps->context; + const struct cpumask *cpumask; + struct path_info *pi; + int i; + + cpu = get_cpu(); + + pi = s->path_map[cpu]; + if (pi && !pi->failed) + goto done; + + /* + * Perf is not optimal, but we at least try the local node then just + * try not to fail. + */ + if (!pi) + atomic_inc(&s->map_misses); + + node = cpu_to_node(cpu); + cpumask = cpumask_of_node(node); + for_each_cpu(i, cpumask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + + for_each_cpu(i, s->path_mask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + pi = NULL; + +done: + put_cpu(); + return pi ? pi->path : NULL; +} + +static struct path_selector_type ioa_ps = { + .name = "io-affinity", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ioa_create, + .destroy = ioa_destroy, + .status = ioa_status, + .add_path = ioa_add_path, + .fail_path = ioa_fail_path, + .reinstate_path = ioa_reinstate_path, + .select_path = ioa_select_path, +}; + +static int __init dm_ioa_init(void) +{ + int ret = dm_register_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("register failed %d", ret); + return ret; +} + +static void __exit dm_ioa_exit(void) +{ + int ret = dm_unregister_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("unregister failed %d", ret); +} + +module_init(dm_ioa_init); +module_exit(dm_ioa_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on"); +MODULE_AUTHOR("Mike Christie "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c new file mode 100644 index 000000000..6fbec9fc2 --- /dev/null +++ b/drivers/md/dm-ps-queue-length.c @@ -0,0 +1,286 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 1 +#define QL_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + spinlock_t lock; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = QL_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [] + * : The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. + */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + goto out; + + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader "); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c new file mode 100644 index 000000000..1d07392b5 --- /dev/null +++ b/drivers/md/dm-ps-round-robin.c @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. + */ + +#include + +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath round-robin" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +struct selector { + struct list_head valid_paths; + struct list_head invalid_paths; + spinlock_t lock; +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int rr_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->invalid_paths); + kfree(s); + ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct path_info *pi; + int sz = 0; + + if (!path) + DMEMIT("0 "); + else { + switch(type) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%u ", pi->repeat_count); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + } + + return sz; +} + +/* + * Called during initialisation to register each path with an + * optional repeat_count. + */ +static int rr_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = RR_MIN_IO; + char dummy; + unsigned long flags; + + if (argc > 1) { + *error = "round-robin ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path */ + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "round-robin ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "round-robin ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct dm_path *p) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = p->pscontext; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->invalid_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = p->pscontext; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = NULL; + + spin_lock_irqsave(&s->lock, flags); + if (!list_empty(&s->valid_paths)) { + pi = list_entry(s->valid_paths.next, struct path_info, list); + list_move_tail(&pi->list, &s->valid_paths); + } + spin_unlock_irqrestore(&s->lock, flags); + + return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { + .name = "round-robin", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 0, + .create = rr_create, + .destroy = rr_destroy, + .status = rr_status, + .add_path = rr_add_path, + .fail_path = rr_fail_path, + .reinstate_path = rr_reinstate_path, + .select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ + int r = dm_register_path_selector(&rr_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " RR_VERSION " loaded"); + + return r; +} + +static void __exit dm_rr_exit(void) +{ + int r = dm_unregister_path_selector(&rr_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c new file mode 100644 index 000000000..eba2293be --- /dev/null +++ b/drivers/md/dm-ps-service-time.c @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.3.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + spinlock_t lock; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + unsigned int relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + case STATUSTYPE_IMA: + result[0] = '\0'; + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = ST_MIN_IO; + unsigned int relative_throughput = 1; + char dummy; + unsigned long flags; + + /* + * Arguments: [ []] + * : The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * : The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0- + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are available. + */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. + * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. + */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + goto out; + + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 000000000..4b7528dc2 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,4112 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include +#include + +#include "md.h" +#include "raid1.h" +#include "raid5.h" +#include "raid10.h" +#include "md-bitmap.h" + +#include + +#define DM_MSG_PREFIX "raid" +#define MAX_RAID_DEVICES 253 /* md-raid kernel limit */ + +/* + * Minimum sectors of free reshape space per raid device + */ +#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096) + +/* + * Minimum journal space 4 MiB in sectors. + */ +#define MIN_RAID456_JOURNAL_SPACE (4*2048) + +static bool devices_handle_discard_safely = false; + +/* + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. + */ +#define FirstUse 10 /* rdev flag */ + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct md_rdev rdev; +}; + +/* + * Bits for establishing rs->ctr_flags + * + * 1 = no flag value + * 2 = flag with value + */ +#define __CTR_FLAG_SYNC 0 /* 1 */ /* Not with raid0! */ +#define __CTR_FLAG_NOSYNC 1 /* 1 */ /* Not with raid0! */ +#define __CTR_FLAG_REBUILD 2 /* 2 */ /* Not with raid0! */ +#define __CTR_FLAG_DAEMON_SLEEP 3 /* 2 */ /* Not with raid0! */ +#define __CTR_FLAG_MIN_RECOVERY_RATE 4 /* 2 */ /* Not with raid0! */ +#define __CTR_FLAG_MAX_RECOVERY_RATE 5 /* 2 */ /* Not with raid0! */ +#define __CTR_FLAG_MAX_WRITE_BEHIND 6 /* 2 */ /* Only with raid1! */ +#define __CTR_FLAG_WRITE_MOSTLY 7 /* 2 */ /* Only with raid1! */ +#define __CTR_FLAG_STRIPE_CACHE 8 /* 2 */ /* Only with raid4/5/6! */ +#define __CTR_FLAG_REGION_SIZE 9 /* 2 */ /* Not with raid0! */ +#define __CTR_FLAG_RAID10_COPIES 10 /* 2 */ /* Only with raid10 */ +#define __CTR_FLAG_RAID10_FORMAT 11 /* 2 */ /* Only with raid10 */ +/* New for v1.9.0 */ +#define __CTR_FLAG_DELTA_DISKS 12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */ +#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */ +#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ + +/* New for v1.10.0 */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */ + +/* New for v1.11.1 */ +#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */ + +/* + * Flags for rs->ctr_flags field. + */ +#define CTR_FLAG_SYNC (1 << __CTR_FLAG_SYNC) +#define CTR_FLAG_NOSYNC (1 << __CTR_FLAG_NOSYNC) +#define CTR_FLAG_REBUILD (1 << __CTR_FLAG_REBUILD) +#define CTR_FLAG_DAEMON_SLEEP (1 << __CTR_FLAG_DAEMON_SLEEP) +#define CTR_FLAG_MIN_RECOVERY_RATE (1 << __CTR_FLAG_MIN_RECOVERY_RATE) +#define CTR_FLAG_MAX_RECOVERY_RATE (1 << __CTR_FLAG_MAX_RECOVERY_RATE) +#define CTR_FLAG_MAX_WRITE_BEHIND (1 << __CTR_FLAG_MAX_WRITE_BEHIND) +#define CTR_FLAG_WRITE_MOSTLY (1 << __CTR_FLAG_WRITE_MOSTLY) +#define CTR_FLAG_STRIPE_CACHE (1 << __CTR_FLAG_STRIPE_CACHE) +#define CTR_FLAG_REGION_SIZE (1 << __CTR_FLAG_REGION_SIZE) +#define CTR_FLAG_RAID10_COPIES (1 << __CTR_FLAG_RAID10_COPIES) +#define CTR_FLAG_RAID10_FORMAT (1 << __CTR_FLAG_RAID10_FORMAT) +#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS) +#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) +#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) +#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) +#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) + +/* + * Definitions of various constructor flags to + * be used in checks of valid / invalid flags + * per raid level. + */ +/* Define all any sync flags */ +#define CTR_FLAGS_ANY_SYNC (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC) + +/* Define flags for options without argument (e.g. 'nosync') */ +#define CTR_FLAG_OPTIONS_NO_ARGS (CTR_FLAGS_ANY_SYNC | \ + CTR_FLAG_RAID10_USE_NEAR_SETS) + +/* Define flags for options with one argument (e.g. 'delta_disks +2') */ +#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \ + CTR_FLAG_WRITE_MOSTLY | \ + CTR_FLAG_DAEMON_SLEEP | \ + CTR_FLAG_MIN_RECOVERY_RATE | \ + CTR_FLAG_MAX_RECOVERY_RATE | \ + CTR_FLAG_MAX_WRITE_BEHIND | \ + CTR_FLAG_STRIPE_CACHE | \ + CTR_FLAG_REGION_SIZE | \ + CTR_FLAG_RAID10_COPIES | \ + CTR_FLAG_RAID10_FORMAT | \ + CTR_FLAG_DELTA_DISKS | \ + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) + +/* Valid options definitions per raid level... */ + +/* "raid0" does only accept data offset */ +#define RAID0_VALID_FLAGS (CTR_FLAG_DATA_OFFSET) + +/* "raid1" does not accept stripe cache, data offset, delta_disks or any raid10 options */ +#define RAID1_VALID_FLAGS (CTR_FLAGS_ANY_SYNC | \ + CTR_FLAG_REBUILD | \ + CTR_FLAG_WRITE_MOSTLY | \ + CTR_FLAG_DAEMON_SLEEP | \ + CTR_FLAG_MIN_RECOVERY_RATE | \ + CTR_FLAG_MAX_RECOVERY_RATE | \ + CTR_FLAG_MAX_WRITE_BEHIND | \ + CTR_FLAG_REGION_SIZE | \ + CTR_FLAG_DELTA_DISKS | \ + CTR_FLAG_DATA_OFFSET) + +/* "raid10" does not accept any raid1 or stripe cache options */ +#define RAID10_VALID_FLAGS (CTR_FLAGS_ANY_SYNC | \ + CTR_FLAG_REBUILD | \ + CTR_FLAG_DAEMON_SLEEP | \ + CTR_FLAG_MIN_RECOVERY_RATE | \ + CTR_FLAG_MAX_RECOVERY_RATE | \ + CTR_FLAG_REGION_SIZE | \ + CTR_FLAG_RAID10_COPIES | \ + CTR_FLAG_RAID10_FORMAT | \ + CTR_FLAG_DELTA_DISKS | \ + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_RAID10_USE_NEAR_SETS) + +/* + * "raid4/5/6" do not accept any raid1 or raid10 specific options + * + * "raid6" does not accept "nosync", because it is not guaranteed + * that both parity and q-syndrome are being written properly with + * any writes + */ +#define RAID45_VALID_FLAGS (CTR_FLAGS_ANY_SYNC | \ + CTR_FLAG_REBUILD | \ + CTR_FLAG_DAEMON_SLEEP | \ + CTR_FLAG_MIN_RECOVERY_RATE | \ + CTR_FLAG_MAX_RECOVERY_RATE | \ + CTR_FLAG_STRIPE_CACHE | \ + CTR_FLAG_REGION_SIZE | \ + CTR_FLAG_DELTA_DISKS | \ + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) + +#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ + CTR_FLAG_REBUILD | \ + CTR_FLAG_DAEMON_SLEEP | \ + CTR_FLAG_MIN_RECOVERY_RATE | \ + CTR_FLAG_MAX_RECOVERY_RATE | \ + CTR_FLAG_STRIPE_CACHE | \ + CTR_FLAG_REGION_SIZE | \ + CTR_FLAG_DELTA_DISKS | \ + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) +/* ...valid options definitions per raid level */ + +/* + * Flags for rs->runtime_flags field + * (RT_FLAG prefix meaning "runtime flag") + * + * These are all internal and used to define runtime state, + * e.g. to prevent another resume from preresume processing + * the raid set all over again. + */ +#define RT_FLAG_RS_PRERESUMED 0 +#define RT_FLAG_RS_RESUMED 1 +#define RT_FLAG_RS_BITMAP_LOADED 2 +#define RT_FLAG_UPDATE_SBS 3 +#define RT_FLAG_RESHAPE_RS 4 +#define RT_FLAG_RS_SUSPENDED 5 +#define RT_FLAG_RS_IN_SYNC 6 +#define RT_FLAG_RS_RESYNCING 7 +#define RT_FLAG_RS_GROW 8 + +/* Array elements of 64 bit needed for rebuild/failed disk bits */ +#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) + +/* + * raid set level, layout and chunk sectors backup/restore + */ +struct rs_layout { + int new_level; + int new_layout; + int new_chunk_sectors; +}; + +struct raid_set { + struct dm_target *ti; + + uint32_t stripe_cache_entries; + unsigned long ctr_flags; + unsigned long runtime_flags; + + uint64_t rebuild_disks[DISKS_ARRAY_ELEMS]; + + int raid_disks; + int delta_disks; + int data_offset; + int raid10_copies; + int requested_bitmap_chunk_sectors; + + struct mddev md; + struct raid_type *raid_type; + + sector_t array_sectors; + sector_t dev_sectors; + + /* Optional raid4/5/6 journal device */ + struct journal_dev { + struct dm_dev *dev; + struct md_rdev rdev; + int mode; + } journal_dev; + + struct raid_dev dev[]; +}; + +static void rs_config_backup(struct raid_set *rs, struct rs_layout *l) +{ + struct mddev *mddev = &rs->md; + + l->new_level = mddev->new_level; + l->new_layout = mddev->new_layout; + l->new_chunk_sectors = mddev->new_chunk_sectors; +} + +static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) +{ + struct mddev *mddev = &rs->md; + + mddev->new_level = l->new_level; + mddev->new_layout = l->new_layout; + mddev->new_chunk_sectors = l->new_chunk_sectors; +} + +/* raid10 algorithms (i.e. formats) */ +#define ALGORITHM_RAID10_DEFAULT 0 +#define ALGORITHM_RAID10_NEAR 1 +#define ALGORITHM_RAID10_OFFSET 2 +#define ALGORITHM_RAID10_FAR 3 + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned int parity_devs; /* # of parity devices. */ + const unsigned int minimal_devs;/* minimal # of devices in set. */ + const unsigned int level; /* RAID level. */ + const unsigned int algorithm; /* RAID algorithm. */ +} raid_types[] = { + {"raid0", "raid0 (striping)", 0, 2, 0, 0 /* NONE */}, + {"raid1", "raid1 (mirroring)", 0, 2, 1, 0 /* NONE */}, + {"raid10_far", "raid10 far (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_FAR}, + {"raid10_offset", "raid10 offset (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_OFFSET}, + {"raid10_near", "raid10 near (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_NEAR}, + {"raid10", "raid10 (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_DEFAULT}, + {"raid4", "raid4 (dedicated first parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, /* raid4 layout = raid5_0 */ + {"raid5_n", "raid5 (dedicated last parity disk)", 1, 2, 5, ALGORITHM_PARITY_N}, + {"raid5_ls", "raid5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "raid5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid5_la", "raid5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "raid5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid6_zr", "raid6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "raid6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "raid6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}, + {"raid6_n_6", "raid6 (dedicated parity/Q n/6)", 2, 4, 6, ALGORITHM_PARITY_N_6}, + {"raid6_ls_6", "raid6 (left symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_SYMMETRIC_6}, + {"raid6_rs_6", "raid6 (right symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_SYMMETRIC_6}, + {"raid6_la_6", "raid6 (left asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_ASYMMETRIC_6}, + {"raid6_ra_6", "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_ASYMMETRIC_6} +}; + +/* True, if @v is in inclusive range [@min, @max] */ +static bool __within_range(long v, long min, long max) +{ + return v >= min && v <= max; +} + +/* All table line arguments are defined here */ +static struct arg_name_flag { + const unsigned long flag; + const char *name; +} __arg_name_flags[] = { + { CTR_FLAG_SYNC, "sync"}, + { CTR_FLAG_NOSYNC, "nosync"}, + { CTR_FLAG_REBUILD, "rebuild"}, + { CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"}, + { CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"}, + { CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"}, + { CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"}, + { CTR_FLAG_WRITE_MOSTLY, "write_mostly"}, + { CTR_FLAG_STRIPE_CACHE, "stripe_cache"}, + { CTR_FLAG_REGION_SIZE, "region_size"}, + { CTR_FLAG_RAID10_COPIES, "raid10_copies"}, + { CTR_FLAG_RAID10_FORMAT, "raid10_format"}, + { CTR_FLAG_DATA_OFFSET, "data_offset"}, + { CTR_FLAG_DELTA_DISKS, "delta_disks"}, + { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, + { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, + { CTR_FLAG_JOURNAL_MODE, "journal_mode" }, +}; + +/* Return argument name string for given @flag */ +static const char *dm_raid_arg_name_by_flag(const uint32_t flag) +{ + if (hweight32(flag) == 1) { + struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags); + + while (anf-- > __arg_name_flags) + if (flag & anf->flag) + return anf->name; + + } else + DMERR("%s called with more than one flag!", __func__); + + return NULL; +} + +/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */ +static struct { + const int mode; + const char *param; +} _raid456_journal_mode[] = { + { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" }, + { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" } +}; + +/* Return MD raid4/5/6 journal mode for dm @journal_mode one */ +static int dm_raid_journal_mode_to_md(const char *mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (!strcasecmp(mode, _raid456_journal_mode[m].param)) + return _raid456_journal_mode[m].mode; + + return -EINVAL; +} + +/* Return dm-raid raid4/5/6 journal mode string for @mode */ +static const char *md_journal_mode_to_dm_raid(const int mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (mode == _raid456_journal_mode[m].mode) + return _raid456_journal_mode[m].param; + + return "unknown"; +} + +/* + * Bool helpers to test for various raid levels of a raid set. + * It's level as reported by the superblock rather than + * the requested raid_type passed to the constructor. + */ +/* Return true, if raid set in @rs is raid0 */ +static bool rs_is_raid0(struct raid_set *rs) +{ + return !rs->md.level; +} + +/* Return true, if raid set in @rs is raid1 */ +static bool rs_is_raid1(struct raid_set *rs) +{ + return rs->md.level == 1; +} + +/* Return true, if raid set in @rs is raid10 */ +static bool rs_is_raid10(struct raid_set *rs) +{ + return rs->md.level == 10; +} + +/* Return true, if raid set in @rs is level 6 */ +static bool rs_is_raid6(struct raid_set *rs) +{ + return rs->md.level == 6; +} + +/* Return true, if raid set in @rs is level 4, 5 or 6 */ +static bool rs_is_raid456(struct raid_set *rs) +{ + return __within_range(rs->md.level, 4, 6); +} + +/* Return true, if raid set in @rs is reshapable */ +static bool __is_raid10_far(int layout); +static bool rs_is_reshapable(struct raid_set *rs) +{ + return rs_is_raid456(rs) || + (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout)); +} + +/* Return true, if raid set in @rs is recovering */ +static bool rs_is_recovering(struct raid_set *rs) +{ + return rs->md.recovery_cp < rs->md.dev_sectors; +} + +/* Return true, if raid set in @rs is reshaping */ +static bool rs_is_reshaping(struct raid_set *rs) +{ + return rs->md.reshape_position != MaxSector; +} + +/* + * bool helpers to test for various raid levels of a raid type @rt + */ + +/* Return true, if raid type in @rt is raid0 */ +static bool rt_is_raid0(struct raid_type *rt) +{ + return !rt->level; +} + +/* Return true, if raid type in @rt is raid1 */ +static bool rt_is_raid1(struct raid_type *rt) +{ + return rt->level == 1; +} + +/* Return true, if raid type in @rt is raid10 */ +static bool rt_is_raid10(struct raid_type *rt) +{ + return rt->level == 10; +} + +/* Return true, if raid type in @rt is raid4/5 */ +static bool rt_is_raid45(struct raid_type *rt) +{ + return __within_range(rt->level, 4, 5); +} + +/* Return true, if raid type in @rt is raid6 */ +static bool rt_is_raid6(struct raid_type *rt) +{ + return rt->level == 6; +} + +/* Return true, if raid type in @rt is raid4/5/6 */ +static bool rt_is_raid456(struct raid_type *rt) +{ + return __within_range(rt->level, 4, 6); +} +/* END: raid level bools */ + +/* Return valid ctr flags for the raid level of @rs */ +static unsigned long __valid_flags(struct raid_set *rs) +{ + if (rt_is_raid0(rs->raid_type)) + return RAID0_VALID_FLAGS; + else if (rt_is_raid1(rs->raid_type)) + return RAID1_VALID_FLAGS; + else if (rt_is_raid10(rs->raid_type)) + return RAID10_VALID_FLAGS; + else if (rt_is_raid45(rs->raid_type)) + return RAID45_VALID_FLAGS; + else if (rt_is_raid6(rs->raid_type)) + return RAID6_VALID_FLAGS; + + return 0; +} + +/* + * Check for valid flags set on @rs + * + * Has to be called after parsing of the ctr flags! + */ +static int rs_check_for_valid_flags(struct raid_set *rs) +{ + if (rs->ctr_flags & ~__valid_flags(rs)) { + rs->ti->error = "Invalid flags combination"; + return -EINVAL; + } + + return 0; +} + +/* MD raid10 bit definitions and helpers */ +#define RAID10_OFFSET (1 << 16) /* stripes with data copies area adjacent on devices */ +#define RAID10_BROCKEN_USE_FAR_SETS (1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */ +#define RAID10_USE_FAR_SETS (1 << 18) /* Use sets instead of whole stripe rotation */ +#define RAID10_FAR_COPIES_SHIFT 8 /* raid10 # far copies shift (2nd byte of layout) */ + +/* Return md raid10 near copies for @layout */ +static unsigned int __raid10_near_copies(int layout) +{ + return layout & 0xFF; +} + +/* Return md raid10 far copies for @layout */ +static unsigned int __raid10_far_copies(int layout) +{ + return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT); +} + +/* Return true if md raid10 offset for @layout */ +static bool __is_raid10_offset(int layout) +{ + return !!(layout & RAID10_OFFSET); +} + +/* Return true if md raid10 near for @layout */ +static bool __is_raid10_near(int layout) +{ + return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1; +} + +/* Return true if md raid10 far for @layout */ +static bool __is_raid10_far(int layout) +{ + return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1; +} + +/* Return md raid10 layout string for @layout */ +static const char *raid10_md_layout_to_format(int layout) +{ + /* + * Bit 16 stands for "offset" + * (i.e. adjacent stripes hold copies) + * + * Refer to MD's raid10.c for details + */ + if (__is_raid10_offset(layout)) + return "offset"; + + if (__raid10_near_copies(layout) > 1) + return "near"; + + if (__raid10_far_copies(layout) > 1) + return "far"; + + return "unknown"; +} + +/* Return md raid10 algorithm for @name */ +static int raid10_name_to_format(const char *name) +{ + if (!strcasecmp(name, "near")) + return ALGORITHM_RAID10_NEAR; + else if (!strcasecmp(name, "offset")) + return ALGORITHM_RAID10_OFFSET; + else if (!strcasecmp(name, "far")) + return ALGORITHM_RAID10_FAR; + + return -EINVAL; +} + +/* Return md raid10 copies for @layout */ +static unsigned int raid10_md_layout_to_copies(int layout) +{ + return max(__raid10_near_copies(layout), __raid10_far_copies(layout)); +} + +/* Return md raid10 format id for @format string */ +static int raid10_format_to_md_layout(struct raid_set *rs, + unsigned int algorithm, + unsigned int copies) +{ + unsigned int n = 1, f = 1, r = 0; + + /* + * MD resilienece flaw: + * + * enabling use_far_sets for far/offset formats causes copies + * to be colocated on the same devs together with their origins! + * + * -> disable it for now in the definition above + */ + if (algorithm == ALGORITHM_RAID10_DEFAULT || + algorithm == ALGORITHM_RAID10_NEAR) + n = copies; + + else if (algorithm == ALGORITHM_RAID10_OFFSET) { + f = copies; + r = RAID10_OFFSET; + if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) + r |= RAID10_USE_FAR_SETS; + + } else if (algorithm == ALGORITHM_RAID10_FAR) { + f = copies; + if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) + r |= RAID10_USE_FAR_SETS; + + } else + return -EINVAL; + + return r | (f << RAID10_FAR_COPIES_SHIFT) | n; +} +/* END: MD raid10 bit definitions and helpers */ + +/* Check for any of the raid10 algorithms */ +static bool __got_raid10(struct raid_type *rtp, const int layout) +{ + if (rtp->level == 10) { + switch (rtp->algorithm) { + case ALGORITHM_RAID10_DEFAULT: + case ALGORITHM_RAID10_NEAR: + return __is_raid10_near(layout); + case ALGORITHM_RAID10_OFFSET: + return __is_raid10_offset(layout); + case ALGORITHM_RAID10_FAR: + return __is_raid10_far(layout); + default: + break; + } + } + + return false; +} + +/* Return raid_type for @name */ +static struct raid_type *get_raid_type(const char *name) +{ + struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types); + + while (rtp-- > raid_types) + if (!strcasecmp(rtp->name, name)) + return rtp; + + return NULL; +} + +/* Return raid_type for @name based derived from @level and @layout */ +static struct raid_type *get_raid_type_by_ll(const int level, const int layout) +{ + struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types); + + while (rtp-- > raid_types) { + /* RAID10 special checks based on @layout flags/properties */ + if (rtp->level == level && + (__got_raid10(rtp, layout) || rtp->algorithm == layout)) + return rtp; + } + + return NULL; +} + +/* Adjust rdev sectors */ +static void rs_set_rdev_sectors(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + struct md_rdev *rdev; + + /* + * raid10 sets rdev->sector to the device size, which + * is unintended in case of out-of-place reshaping + */ + rdev_for_each(rdev, mddev) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = mddev->dev_sectors; +} + +/* + * Change bdev capacity of @rs in case of a disk add/remove reshape + */ +static void rs_set_capacity(struct raid_set *rs) +{ + struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); + + set_capacity_and_notify(gendisk, rs->md.array_sectors); +} + +/* + * Set the mddev properties in @rs to the current + * ones retrieved from the freshest superblock + */ +static void rs_set_cur(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; +} + +/* + * Set the mddev properties in @rs to the new + * ones requested by the ctr + */ +static void rs_set_new(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->raid_disks = rs->raid_disks; + mddev->delta_disks = 0; +} + +static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type, + unsigned int raid_devs) +{ + unsigned int i; + struct raid_set *rs; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(struct_size(rs, dev, raid_devs), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->raid_disks = raid_devs; + rs->delta_disks = 0; + + rs->ti = ti; + rs->raid_type = raid_type; + rs->stripe_cache_entries = 256; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = MaxSector; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + * rs->md.dev_sectors + */ + + return rs; +} + +/* Free all @rs allocations */ +static void raid_set_free(struct raid_set *rs) +{ + int i; + + if (rs->journal_dev.dev) { + md_rdev_clear(&rs->journal_dev.rdev); + dm_put_device(rs->ti, rs->journal_dev.dev); + } + + for (i = 0; i < rs->raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + md_rdev_clear(&rs->dev[i].rdev); + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + } + + kfree(rs); +} + +/* + * For every device we have two words + * : meta device name or '-' if missing + * : data device name or '-' if missing + * + * The following are permitted: + * - - + * - + * + * + * The following is not allowed: + * - + * + * This code parses those words. If there is a failure, + * the caller must use raid_set_free() to unwind the operations. + */ +static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int r = 0; + const char *arg; + + /* Put off the number of raid devices argument to get to dev pairs */ + arg = dm_shift_arg(as); + if (!arg) + return -EINVAL; + + for (i = 0; i < rs->raid_disks; i++) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets initially. + * Out of place reshape will set them accordingly. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.new_data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + arg = dm_shift_arg(as); + if (!arg) + return -EINVAL; + + if (strcmp(arg, "-")) { + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + if (r) { + rs->ti->error = "RAID metadata device lookup failure"; + return r; + } + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) { + rs->ti->error = "Failed to allocate superblock page"; + return -ENOMEM; + } + } + + arg = dm_shift_arg(as); + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + if (rs->dev[i].meta_dev) { + rs->ti->error = "No data device supplied with metadata device"; + return -EINVAL; + } + + continue; + } + + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (r) { + rs->ti->error = "RAID device lookup failure"; + return r; + } + + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (rs->journal_dev.dev) + list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks); + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + rs->ti->error = "Unable to rebuild drive while array is not in-sync"; + return -EINVAL; + } + + return 0; +} + +/* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (rs_is_raid0(rs)) + return 0; + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + /* If not a power of 2, make it the next power of 2 */ + region_size = roundup_pow_of_two(min_region_size); + DMINFO("Choosing default region size of %lu sectors", + region_size); + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = to_bytes(region_size); + + return 0; +} + +/* + * validate_raid_redundancy + * @rs + * + * Determine if there are enough devices in the array that haven't + * failed (or are being rebuilt) to form a usable array. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_raid_redundancy(struct raid_set *rs) +{ + unsigned int i, rebuild_cnt = 0; + unsigned int rebuilds_per_group = 0, copies, raid_disks; + unsigned int group_size, last_group_start; + + for (i = 0; i < rs->raid_disks; i++) + if (!test_bit(FirstUse, &rs->dev[i].rdev.flags) && + ((!test_bit(In_sync, &rs->dev[i].rdev.flags) || + !rs->dev[i].rdev.sb_page))) + rebuild_cnt++; + + switch (rs->md.level) { + case 0: + break; + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + copies = raid10_md_layout_to_copies(rs->md.new_layout); + if (copies < 2) { + DMERR("Bogus raid10 data copies < 2!"); + return -EINVAL; + } + + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + raid_disks = min(rs->raid_disks, rs->md.raid_disks); + if (__is_raid10_near(rs->md.new_layout)) { + for (i = 0; i < raid_disks; i++) { + if (!(i % copies)) + rebuilds_per_group = 0; + if ((!rs->dev[i].rdev.sb_page || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + } + + /* + * When checking "far" and "offset" formats, we need to ensure + * that the device that holds its copy is not also dead or + * being rebuilt. (Note that "far" and "offset" formats only + * support two copies right now. These formats also only ever + * use the 'use_far_sets' variant.) + * + * This check is somewhat complicated by the need to account + * for arrays that are not a multiple of (far) copies. This + * results in the need to treat the last (potentially larger) + * set differently. + */ + group_size = (raid_disks / copies); + last_group_start = (raid_disks / group_size) - 1; + last_group_start *= group_size; + for (i = 0; i < raid_disks; i++) { + if (!(i % copies) && !(i > last_group_start)) + rebuilds_per_group = 0; + if ((!rs->dev[i].rdev.sb_page || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + default: + if (rebuild_cnt) + return -EINVAL; + } + + return 0; + +too_many: + return -EINVAL; +} + +/* + * Possible arguments are... + * [optional_args] + * + * Argument definitions + * The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array + * [rebuild ] Rebuild the drive indicated by the index + * [daemon_sleep ] Time between bitmap daemon work to + * clear bits + * [min_recovery_rate ] Throttle RAID initialization + * [max_recovery_rate ] Throttle RAID initialization + * [write_mostly ] Indicate a write mostly drive via index + * [max_write_behind ] See '-write-behind=' (man mdadm) + * [stripe_cache ] Stripe cache size for higher RAIDs + * [region_size ] Defines granularity of bitmap + * [journal_dev ] raid4/5/6 journaling deviice + * (i.e. write hole closing log) + * + * RAID10-only options: + * [raid10_copies <# copies>] Number of copies. (Default: 2) + * [raid10_format ] Layout algorithm. (Default: near) + */ +static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, + unsigned int num_raid_params) +{ + int value, raid10_format = ALGORITHM_RAID10_DEFAULT; + unsigned int raid10_copies = 2; + unsigned int i, write_mostly = 0; + unsigned int region_size = 0; + sector_t max_io_len; + const char *arg, *key; + struct raid_dev *rd; + struct raid_type *rt = rs->raid_type; + + arg = dm_shift_arg(as); + num_raid_params--; /* Account for chunk_size argument */ + + if (kstrtoint(arg, 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given for chunk_size"; + return -EINVAL; + } + + /* + * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. + */ + if (rt_is_raid1(rt)) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + + /* + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. + */ + for (i = 0; i < rs->raid_disks; i++) { + set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < num_raid_params; i++) { + key = dm_shift_arg(as); + if (!key) { + rs->ti->error = "Not enough raid parameters given"; + return -EINVAL; + } + + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) { + if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) { + rs->ti->error = "Only one 'nosync' argument allowed"; + return -EINVAL; + } + continue; + } + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) { + if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) { + rs->ti->error = "Only one 'sync' argument allowed"; + return -EINVAL; + } + continue; + } + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) { + if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) { + rs->ti->error = "Only one 'raid10_use_new_sets' argument allowed"; + return -EINVAL; + } + continue; + } + + arg = dm_shift_arg(as); + i++; /* Account for the argument pairs */ + if (!arg) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + /* + * Parameters that take a string value are checked here. + */ + /* "raid10_format {near|offset|far} */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) { + if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) { + rs->ti->error = "Only one 'raid10_format' argument pair allowed"; + return -EINVAL; + } + if (!rt_is_raid10(rt)) { + rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; + return -EINVAL; + } + raid10_format = raid10_name_to_format(arg); + if (raid10_format < 0) { + rs->ti->error = "Invalid 'raid10_format' value given"; + return raid10_format; + } + continue; + } + + /* "journal_dev " */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { + int r; + struct md_rdev *jdev; + + if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 set journaling device allowed"; + return -EINVAL; + } + if (!rt_is_raid456(rt)) { + rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type"; + return -EINVAL; + } + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->journal_dev.dev); + if (r) { + rs->ti->error = "raid4/5/6 journal device lookup failure"; + return r; + } + jdev = &rs->journal_dev.rdev; + md_rdev_init(jdev); + jdev->mddev = &rs->md; + jdev->bdev = rs->journal_dev.dev->bdev; + jdev->sectors = bdev_nr_sectors(jdev->bdev); + if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) { + rs->ti->error = "No space for raid4/5/6 journal"; + return -ENOSPC; + } + rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + set_bit(Journal, &jdev->flags); + continue; + } + + /* "journal_mode " ("journal_dev" mandatory!) */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) { + int r; + + if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'"; + return -EINVAL; + } + if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed"; + return -EINVAL; + } + r = dm_raid_journal_mode_to_md(arg); + if (r < 0) { + rs->ti->error = "Invalid 'journal_mode' argument"; + return r; + } + rs->journal_dev.mode = r; + continue; + } + + /* + * Parameters with number values from here on. + */ + if (kstrtoint(arg, 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) { + /* + * "rebuild" is being passed in by userspace to provide + * indexes of replaced devices and to set up additional + * devices on raid level takeover. + */ + if (!__within_range(value, 0, rs->raid_disks - 1)) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + + if (test_and_set_bit(value, (void *) rs->rebuild_disks)) { + rs->ti->error = "rebuild for this index already given"; + return -EINVAL; + } + + rd = rs->dev + value; + clear_bit(In_sync, &rd->rdev.flags); + clear_bit(Faulty, &rd->rdev.flags); + rd->rdev.recovery_offset = 0; + set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags); + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) { + if (!rt_is_raid1(rt)) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + + if (!__within_range(value, 0, rs->md.raid_disks - 1)) { + rs->ti->error = "Invalid write_mostly index given"; + return -EINVAL; + } + + write_mostly++; + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags); + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) { + if (!rt_is_raid1(rt)) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } + + if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) { + rs->ti->error = "Only one max_write_behind argument pair allowed"; + return -EINVAL; + } + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + if (value < 0 || value / 2 > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + + rs->md.bitmap_info.max_write_behind = value / 2; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) { + if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) { + rs->ti->error = "Only one daemon_sleep argument pair allowed"; + return -EINVAL; + } + if (value < 0) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) { + /* Userspace passes new data_offset after having extended the data image LV */ + if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) { + rs->ti->error = "Only one data_offset argument pair allowed"; + return -EINVAL; + } + /* Ensure sensible data offset */ + if (value < 0 || + (value && (value < MIN_FREE_RESHAPE_SPACE || value % to_sector(PAGE_SIZE)))) { + rs->ti->error = "Bogus data_offset value"; + return -EINVAL; + } + rs->data_offset = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) { + /* Define the +/-# of disks to add to/remove from the given raid set */ + if (test_and_set_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) { + rs->ti->error = "Only one delta_disks argument pair allowed"; + return -EINVAL; + } + /* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */ + if (!__within_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) { + rs->ti->error = "Too many delta_disk requested"; + return -EINVAL; + } + + rs->delta_disks = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) { + if (test_and_set_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) { + rs->ti->error = "Only one stripe_cache argument pair allowed"; + return -EINVAL; + } + + if (!rt_is_raid456(rt)) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + + if (value < 0) { + rs->ti->error = "Bogus stripe cache entries value"; + return -EINVAL; + } + rs->stripe_cache_entries = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) { + if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) { + rs->ti->error = "Only one min_recovery_rate argument pair allowed"; + return -EINVAL; + } + + if (value < 0) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) { + if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) { + rs->ti->error = "Only one max_recovery_rate argument pair allowed"; + return -EINVAL; + } + + if (value < 0) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) { + if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) { + rs->ti->error = "Only one region_size argument pair allowed"; + return -EINVAL; + } + + region_size = value; + rs->requested_bitmap_chunk_sectors = value; + } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) { + if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) { + rs->ti->error = "Only one raid10_copies argument pair allowed"; + return -EINVAL; + } + + if (!__within_range(value, 2, rs->md.raid_disks)) { + rs->ti->error = "Bad value for 'raid10_copies'"; + return -EINVAL; + } + + raid10_copies = value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameter"; + return -EINVAL; + } + } + + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) && + test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) { + rs->ti->error = "sync and nosync are mutually exclusive"; + return -EINVAL; + } + + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && + (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) || + test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))) { + rs->ti->error = "sync/nosync and rebuild are mutually exclusive"; + return -EINVAL; + } + + if (write_mostly >= rs->md.raid_disks) { + rs->ti->error = "Can't set all raid1 devices to write_mostly"; + return -EINVAL; + } + + if (rs->md.sync_speed_max && + rs->md.sync_speed_min > rs->md.sync_speed_max) { + rs->ti->error = "Bogus recovery rates"; + return -EINVAL; + } + + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + max_io_len = rs->md.chunk_sectors; + else + max_io_len = region_size; + + if (dm_set_target_max_io_len(rs->ti, max_io_len)) + return -EINVAL; + + if (rt_is_raid10(rt)) { + if (raid10_copies > rs->md.raid_disks) { + rs->ti->error = "Not enough devices to satisfy specification"; + return -EINVAL; + } + + rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies); + if (rs->md.new_layout < 0) { + rs->ti->error = "Error getting raid10 format"; + return rs->md.new_layout; + } + + rt = get_raid_type_by_ll(10, rs->md.new_layout); + if (!rt) { + rs->ti->error = "Failed to recognize new raid10 layout"; + return -EINVAL; + } + + if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT || + rt->algorithm == ALGORITHM_RAID10_NEAR) && + test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) { + rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible"; + return -EINVAL; + } + } + + rs->raid10_copies = raid10_copies; + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + /* Check, if any invalid ctr arguments have been passed in for the raid level */ + return rs_check_for_valid_flags(rs); +} + +/* Set raid4/5/6 cache size */ +static int rs_set_raid456_stripe_cache(struct raid_set *rs) +{ + int r; + struct r5conf *conf; + struct mddev *mddev = &rs->md; + uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2; + uint32_t nr_stripes = rs->stripe_cache_entries; + + if (!rt_is_raid456(rs->raid_type)) { + rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size"; + return -EINVAL; + } + + if (nr_stripes < min_stripes) { + DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size", + nr_stripes, min_stripes); + nr_stripes = min_stripes; + } + + conf = mddev->private; + if (!conf) { + rs->ti->error = "Cannot change stripe_cache size on inactive RAID set"; + return -EINVAL; + } + + /* Try setting number of stripes in raid456 stripe cache */ + if (conf->min_nr_stripes != nr_stripes) { + r = raid5_set_cache_size(mddev, nr_stripes); + if (r) { + rs->ti->error = "Failed to set raid4/5/6 stripe cache size"; + return r; + } + + DMINFO("%u stripe cache entries", nr_stripes); + } + + return 0; +} + +/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */ +static unsigned int mddev_data_stripes(struct raid_set *rs) +{ + return rs->md.raid_disks - rs->raid_type->parity_devs; +} + +/* Return # of data stripes of @rs (i.e. as of ctr) */ +static unsigned int rs_data_stripes(struct raid_set *rs) +{ + return rs->raid_disks - rs->raid_type->parity_devs; +} + +/* + * Retrieve rdev->sectors from any valid raid device of @rs + * to allow userpace to pass in arbitray "- -" device tupples. + */ +static sector_t __rdev_sectors(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->raid_disks; i++) { + struct md_rdev *rdev = &rs->dev[i].rdev; + + if (!test_bit(Journal, &rdev->flags) && + rdev->bdev && rdev->sectors) + return rdev->sectors; + } + + return 0; +} + +/* Check that calculated dev_sectors fits all component devices. */ +static int _check_data_dev_sectors(struct raid_set *rs) +{ + sector_t ds = ~0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { + ds = min(ds, bdev_nr_sectors(rdev->bdev)); + if (ds < rs->md.dev_sectors) { + rs->ti->error = "Component device(s) too small"; + return -EINVAL; + } + } + + return 0; +} + +/* Calculate the sectors per device and per array used for @rs */ +static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) +{ + int delta_disks; + unsigned int data_stripes; + sector_t array_sectors = sectors, dev_sectors = sectors; + struct mddev *mddev = &rs->md; + + if (use_mddev) { + delta_disks = mddev->delta_disks; + data_stripes = mddev_data_stripes(rs); + } else { + delta_disks = rs->delta_disks; + data_stripes = rs_data_stripes(rs); + } + + /* Special raid1 case w/o delta_disks support (yet) */ + if (rt_is_raid1(rs->raid_type)) + ; + else if (rt_is_raid10(rs->raid_type)) { + if (rs->raid10_copies < 2 || + delta_disks < 0) { + rs->ti->error = "Bogus raid10 data copies or delta disks"; + return -EINVAL; + } + + dev_sectors *= rs->raid10_copies; + if (sector_div(dev_sectors, data_stripes)) + goto bad; + + array_sectors = (data_stripes + delta_disks) * dev_sectors; + if (sector_div(array_sectors, rs->raid10_copies)) + goto bad; + + } else if (sector_div(dev_sectors, data_stripes)) + goto bad; + + else + /* Striped layouts */ + array_sectors = (data_stripes + delta_disks) * dev_sectors; + + mddev->array_sectors = array_sectors; + mddev->dev_sectors = dev_sectors; + rs_set_rdev_sectors(rs); + + return _check_data_dev_sectors(rs); +bad: + rs->ti->error = "Target length not divisible by number of data devices"; + return -EINVAL; +} + +/* Setup recovery on @rs */ +static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) +{ + /* raid0 does not recover */ + if (rs_is_raid0(rs)) + rs->md.recovery_cp = MaxSector; + /* + * A raid6 set has to be recovered either + * completely or for the grown part to + * ensure proper parity and Q-Syndrome + */ + else if (rs_is_raid6(rs)) + rs->md.recovery_cp = dev_sectors; + /* + * Other raid set types may skip recovery + * depending on the 'nosync' flag. + */ + else + rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) + ? MaxSector : dev_sectors; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + smp_rmb(); /* Make sure we access most actual mddev properties */ + if (!rs_is_reshaping(rs)) { + if (rs_is_raid10(rs)) + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + dm_table_event(rs->ti->table); +} + +/* + * Make sure a valid takover (level switch) is being requested on @rs + * + * Conversions of raid sets from one MD personality to another + * have to conform to restrictions which are enforced here. + */ +static int rs_check_takeover(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + unsigned int near_copies; + + if (rs->md.degraded) { + rs->ti->error = "Can't takeover degraded raid set"; + return -EPERM; + } + + if (rs_is_reshaping(rs)) { + rs->ti->error = "Can't takeover reshaping raid set"; + return -EPERM; + } + + switch (mddev->level) { + case 0: + /* raid0 -> raid1/5 with one disk */ + if ((mddev->new_level == 1 || mddev->new_level == 5) && + mddev->raid_disks == 1) + return 0; + + /* raid0 -> raid10 */ + if (mddev->new_level == 10 && + !(rs->raid_disks % mddev->raid_disks)) + return 0; + + /* raid0 with multiple disks -> raid4/5/6 */ + if (__within_range(mddev->new_level, 4, 6) && + mddev->new_layout == ALGORITHM_PARITY_N && + mddev->raid_disks > 1) + return 0; + + break; + + case 10: + /* Can't takeover raid10_offset! */ + if (__is_raid10_offset(mddev->layout)) + break; + + near_copies = __raid10_near_copies(mddev->layout); + + /* raid10* -> raid0 */ + if (mddev->new_level == 0) { + /* Can takeover raid10_near with raid disks divisable by data copies! */ + if (near_copies > 1 && + !(mddev->raid_disks % near_copies)) { + mddev->raid_disks /= near_copies; + mddev->delta_disks = mddev->raid_disks; + return 0; + } + + /* Can takeover raid10_far */ + if (near_copies == 1 && + __raid10_far_copies(mddev->layout) > 1) + return 0; + + break; + } + + /* raid10_{near,far} -> raid1 */ + if (mddev->new_level == 1 && + max(near_copies, __raid10_far_copies(mddev->layout)) == mddev->raid_disks) + return 0; + + /* raid10_{near,far} with 2 disks -> raid4/5 */ + if (__within_range(mddev->new_level, 4, 5) && + mddev->raid_disks == 2) + return 0; + break; + + case 1: + /* raid1 with 2 disks -> raid4/5 */ + if (__within_range(mddev->new_level, 4, 5) && + mddev->raid_disks == 2) { + mddev->degraded = 1; + return 0; + } + + /* raid1 -> raid0 */ + if (mddev->new_level == 0 && + mddev->raid_disks == 1) + return 0; + + /* raid1 -> raid10 */ + if (mddev->new_level == 10) + return 0; + break; + + case 4: + /* raid4 -> raid0 */ + if (mddev->new_level == 0) + return 0; + + /* raid4 -> raid1/5 with 2 disks */ + if ((mddev->new_level == 1 || mddev->new_level == 5) && + mddev->raid_disks == 2) + return 0; + + /* raid4 -> raid5/6 with parity N */ + if (__within_range(mddev->new_level, 5, 6) && + mddev->layout == ALGORITHM_PARITY_N) + return 0; + break; + + case 5: + /* raid5 with parity N -> raid0 */ + if (mddev->new_level == 0 && + mddev->layout == ALGORITHM_PARITY_N) + return 0; + + /* raid5 with parity N -> raid4 */ + if (mddev->new_level == 4 && + mddev->layout == ALGORITHM_PARITY_N) + return 0; + + /* raid5 with 2 disks -> raid1/4/10 */ + if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) && + mddev->raid_disks == 2) + return 0; + + /* raid5_* -> raid6_*_6 with Q-Syndrome N (e.g. raid5_ra -> raid6_ra_6 */ + if (mddev->new_level == 6 && + ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) || + __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6))) + return 0; + break; + + case 6: + /* raid6 with parity N -> raid0 */ + if (mddev->new_level == 0 && + mddev->layout == ALGORITHM_PARITY_N) + return 0; + + /* raid6 with parity N -> raid4 */ + if (mddev->new_level == 4 && + mddev->layout == ALGORITHM_PARITY_N) + return 0; + + /* raid6_*_n with Q-Syndrome N -> raid5_* */ + if (mddev->new_level == 5 && + ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) || + __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC))) + return 0; + break; + + default: + break; + } + + rs->ti->error = "takeover not possible"; + return -EINVAL; +} + +/* True if @rs requested to be taken over */ +static bool rs_takeover_requested(struct raid_set *rs) +{ + return rs->md.new_level != rs->md.level; +} + +/* True if layout is set to reshape. */ +static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev) +{ + return (use_mddev ? rs->md.delta_disks : rs->delta_disks) || + rs->md.new_layout != rs->md.layout || + rs->md.new_chunk_sectors != rs->md.chunk_sectors; +} + +/* True if @rs is requested to reshape by ctr */ +static bool rs_reshape_requested(struct raid_set *rs) +{ + bool change; + struct mddev *mddev = &rs->md; + + if (rs_takeover_requested(rs)) + return false; + + if (rs_is_raid0(rs)) + return false; + + change = rs_is_layout_change(rs, false); + + /* Historical case to support raid1 reshape without delta disks */ + if (rs_is_raid1(rs)) { + if (rs->delta_disks) + return !!rs->delta_disks; + + return !change && + mddev->raid_disks != rs->raid_disks; + } + + if (rs_is_raid10(rs)) + return change && + !__is_raid10_far(mddev->new_layout) && + rs->delta_disks >= 0; + + return change; +} + +/* Features */ +#define FEATURE_FLAG_SUPPORTS_V190 0x1 /* Supports extended superblock */ + +/* State flags for sb->flags */ +#define SB_FLAG_RESHAPE_ACTIVE 0x1 +#define SB_FLAG_RESHAPE_BACKWARDS 0x2 + +/* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 compat_features; /* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */ + + __le32 num_devices; /* Number of devices in this raid set. (Max 64) */ + __le32 array_position; /* The position of this drive in the raid set */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Pre 1.9.0 part of bit field of devices to */ + /* indicate failures (see extension below) */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial raid set + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * raid characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + /******************************************************************** + * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!! + * + * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist + */ + + __le32 flags; /* Flags defining array states for reshaping */ + + /* + * This offset tracks the progress of a raid + * set reshape in order to be able to restart it + */ + __le64 reshape_position; + + /* + * These define the properties of the array in case of an interrupted reshape + */ + __le32 new_level; + __le32 new_layout; + __le32 new_stripe_sectors; + __le32 delta_disks; + + __le64 array_sectors; /* Array size in sectors */ + + /* + * Sector offsets to data on devices (reshaping). + * Needed to support out of place reshaping, thus + * not writing over any stripes whilst converting + * them from old to new layout + */ + __le64 data_offset; + __le64 new_data_offset; + + __le64 sectors; /* Used device size in sectors */ + + /* + * Additonal Bit field of devices indicating failures to support + * up to 256 devices with the 1.9.0 on-disk metadata format + */ + __le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1]; + + __le32 incompat_features; /* Used to indicate any incompatible features */ + + /* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */ +} __packed; + +/* + * Check for reshape constraints on raid set @rs: + * + * - reshape function non-existent + * - degraded set + * - ongoing recovery + * - ongoing reshape + * + * Returns 0 if none or -EPERM if given constraint + * and error message reference in @errmsg + */ +static int rs_check_reshape(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + + if (!mddev->pers || !mddev->pers->check_reshape) + rs->ti->error = "Reshape not supported"; + else if (mddev->degraded) + rs->ti->error = "Can't reshape degraded raid set"; + else if (rs_is_recovering(rs)) + rs->ti->error = "Convert request on recovering raid set prohibited"; + else if (rs_is_reshaping(rs)) + rs->ti->error = "raid set already reshaping!"; + else if (!(rs_is_raid1(rs) || rs_is_raid10(rs) || rs_is_raid456(rs))) + rs->ti->error = "Reshaping only supported for raid1/4/5/6/10"; + else + return 0; + + return -EPERM; +} + +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded && !force_reload) + return 0; + + rdev->sb_loaded = 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) { + DMERR("Failed to read superblock of device at position %d", + rdev->raid_disk); + md_error(rdev->mddev, rdev); + set_bit(Faulty, &rdev->flags); + return -EIO; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices) +{ + failed_devices[0] = le64_to_cpu(sb->failed_devices); + memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices)); + + if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) { + int i = ARRAY_SIZE(sb->extended_failed_devices); + + while (i--) + failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]); + } +} + +static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices) +{ + int i = ARRAY_SIZE(sb->extended_failed_devices); + + sb->failed_devices = cpu_to_le64(failed_devices[0]); + while (i--) + sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]); +} + +/* + * Synchronize the superblock members with the raid set properties + * + * All superblock data is little endian. + */ +static void super_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + bool update_failed_devices = false; + unsigned int i; + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; + struct dm_raid_superblock *sb; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + + /* No metadata device, no superblock */ + if (!rdev->meta_bdev) + return; + + BUG_ON(!rdev->sb_page); + + sb = page_address(rdev->sb_page); + + sb_retrieve_failed_devices(sb, failed_devices); + + for (i = 0; i < rs->raid_disks; i++) + if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) { + update_failed_devices = true; + set_bit(i, (void *) failed_devices); + } + + if (update_failed_devices) + sb_update_failed_devices(sb, failed_devices); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); + + /******************************************************************** + * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!! + * + * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist + */ + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors); + + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + + smp_rmb(); /* Make sure we access most recent reshape position */ + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + if (le64_to_cpu(sb->reshape_position) != MaxSector) { + /* Flag ongoing reshape */ + sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE); + + if (mddev->delta_disks < 0 || mddev->reshape_backwards) + sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS); + } else { + /* Clear reshape flags */ + sb->flags &= ~(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS)); + } + + sb->array_sectors = cpu_to_le64(mddev->array_sectors); + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->new_data_offset = cpu_to_le64(rdev->new_data_offset); + sb->sectors = cpu_to_le64(rdev->sectors); + sb->incompat_features = cpu_to_le32(0); + + /* Zero out the rest of the payload after the size of the superblock */ + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) +{ + int r; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + r = read_disk_sb(rdev, rdev->sb_size, false); + if (r) + return r; + + sb = page_address(rdev->sb_page); + + /* + * Two cases that we want to write new superblocks and rebuild: + * 1) New device (no matching magic number) + * 2) Device specified for rebuild (!In_sync w/ offset == 0) + */ + if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || + (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); + + /* Force writing of superblocks to disk */ + set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) +{ + int role; + unsigned int d; + struct mddev *mddev = &rs->md; + uint64_t events_sb; + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0; + struct md_rdev *r; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + mddev->reshape_position = MaxSector; + + mddev->raid_disks = le32_to_cpu(sb->num_devices); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); + + /* + * Reshaping is supported, e.g. reshape_position is valid + * in superblock and superblock content is authoritative. + */ + if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) { + /* Superblock is authoritative wrt given raid set layout! */ + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->array_sectors = le64_to_cpu(sb->array_sectors); + + /* raid was reshaping and got interrupted */ + if (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_ACTIVE) { + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) { + DMERR("Reshape requested but raid set is still reshaping"); + return -EINVAL; + } + + if (mddev->delta_disks < 0 || + (!mddev->delta_disks && (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; + else + mddev->reshape_backwards = 0; + + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout); + } + + } else { + /* + * No takeover/reshaping, because we don't have the extended v1.9.0 metadata + */ + struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout); + struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); + + if (rs_takeover_requested(rs)) { + if (rt_cur && rt_new) + DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)", + rt_cur->name, rt_new->name); + else + DMERR("Takeover raid sets not yet supported by metadata. (raid level change)"); + return -EINVAL; + } else if (rs_reshape_requested(rs)) { + DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)"); + if (mddev->layout != mddev->new_layout) { + if (rt_cur && rt_new) + DMERR(" current layout %s vs new layout %s", + rt_cur->name, rt_new->name); + else + DMERR(" current layout 0x%X vs new layout 0x%X", + le32_to_cpu(sb->layout), mddev->new_layout); + } + if (mddev->chunk_sectors != mddev->new_chunk_sectors) + DMERR(" current stripe sectors %u vs new stripe sectors %u", + mddev->chunk_sectors, mddev->new_chunk_sectors); + if (rs->delta_disks) + DMERR(" current %u disks vs new %u disks", + mddev->raid_disks, mddev->raid_disks + rs->delta_disks); + if (rs_is_raid10(rs)) { + DMERR(" Old layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + DMERR(" New layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->new_layout), + raid10_md_layout_to_copies(mddev->new_layout)); + } + return -EINVAL; + } + + DMINFO("Discovered old metadata format; upgrading to extended metadata format"); + } + + if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The raid set is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old raid set + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + * 3) This is/are a new device(s) being added to an old + * raid set during takeover to a higher raid level + * to provide capacity for redundancy or during reshape + * to add capacity to grow the raid set. + */ + d = 0; + rdev_for_each(r, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + + if (test_bit(FirstUse, &r->flags)) + new_devs++; + + if (!test_bit(In_sync, &r->flags)) { + DMINFO("Device %d specified for rebuild; clearing superblock", + r->raid_disk); + rebuilds++; + + if (test_bit(FirstUse, &r->flags)) + rebuild_and_new++; + } + + d++; + } + + if (new_devs == rs->raid_disks || !rebuilds) { + /* Replace a broken device */ + if (new_devs == rs->raid_disks) { + DMINFO("Superblocks created for new raid set"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs != rebuilds && + new_devs != rs->delta_disks) { + DMERR("New device injected into existing raid set without " + "'delta_disks' or 'rebuild' parameter specified"); + return -EINVAL; + } + } else if (new_devs && new_devs != rebuilds) { + DMERR("%u 'rebuild' devices cannot be injected into" + " a raid set with %u other first-time devices", + rebuilds, new_devs); + return -EINVAL; + } else if (rebuilds) { + if (rebuild_and_new && rebuilds != rebuild_and_new) { + DMERR("new device%s provided without 'rebuild'", + new_devs > 1 ? "s" : ""); + return -EINVAL; + } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { + DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", + (unsigned long long) mddev->recovery_cp); + return -EINVAL; + } else if (rs_is_reshaping(rs)) { + DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", + (unsigned long long) mddev->reshape_position); + return -EINVAL; + } + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + sb_retrieve_failed_devices(sb, failed_devices); + rdev_for_each(r, mddev) { + if (test_bit(Journal, &rdev->flags) || + !r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices)); + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role < 0) + continue; + + if (role != r->raid_disk) { + if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) { + if (mddev->raid_disks % __raid10_near_copies(mddev->layout) || + rs->raid_disks % rs->raid10_copies) { + rs->ti->error = + "Cannot change raid10 near set to odd # of devices!"; + return -EINVAL; + } + + sb2->array_position = cpu_to_le32(r->raid_disk); + + } else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) && + !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) && + !rt_is_raid1(rs->raid_type)) { + rs->ti->error = "Cannot change device positions in raid set"; + return -EINVAL; + } + + DMINFO("raid device #%d now at position #%d", role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (test_bit(role, (void *) failed_devices)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(struct raid_set *rs, struct md_rdev *rdev) +{ + struct mddev *mddev = &rs->md; + struct dm_raid_superblock *sb; + + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0) + return 0; + + sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(rs, rdev)) + return -EINVAL; + + if (le32_to_cpu(sb->compat_features) && + le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) { + rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags"; + return -EINVAL; + } + + if (sb->incompat_features) { + rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet"; + return -EINVAL; + } + + /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */ + mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096); + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; + + if (!test_and_clear_bit(FirstUse, &rdev->flags)) { + /* + * Retrieve rdev size stored in superblock to be prepared for shrink. + * Check extended superblock members are present otherwise the size + * will not be set! + */ + if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) + rdev->sectors = le64_to_cpu(sb->sectors); + + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset == MaxSector) + set_bit(In_sync, &rdev->flags); + /* + * If no reshape in progress -> we're recovering single + * disk(s) and have to set the device(s) to out-of-sync + */ + else if (!rs_is_reshaping(rs)) + clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */ + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_and_clear_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + } + + /* Reshape support -> restore repective data offsets */ + rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = le64_to_cpu(sb->new_data_offset); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int r; + struct md_rdev *rdev, *freshest; + struct mddev *mddev = &rs->md; + + freshest = NULL; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + + if (!rdev->meta_bdev) + continue; + + /* Set superblock offset/size for metadata device. */ + rdev->sb_start = 0; + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } + + /* + * Skipping super_load due to CTR_FLAG_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * With reshaping capability added, we must ensure that + * the "sync" directive is disallowed during the reshape. + */ + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) + continue; + + r = super_load(rdev, freshest); + + switch (r) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + /* This is a failure to read the superblock from the metadata device. */ + /* + * We have to keep any raid0 data/metadata device pairs or + * the MD raid0 personality will fail to start the array. + */ + if (rs_is_raid0(rs)) + continue; + + /* + * We keep the dm_devs to be able to emit the device tuple + * properly on the table line in raid_status() (rather than + * mistakenly acting as if '- -' got passed into the constructor). + * + * The rdev has to stay on the same_set list to allow for + * the attempt to restore faulty devices on second resume. + */ + rdev->raid_disk = rdev->saved_raid_disk = -1; + break; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + rs->ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(rs, freshest)) + return -EINVAL; + + if (validate_raid_redundancy(rs)) { + rs->ti->error = "Insufficient redundancy to activate array"; + return -EINVAL; + } + + rdev_for_each(rdev, mddev) + if (!test_bit(Journal, &rdev->flags) && + rdev != freshest && + super_validate(rs, rdev)) + return -EINVAL; + return 0; +} + +/* + * Adjust data_offset and new_data_offset on all disk members of @rs + * for out of place reshaping if requested by constructor + * + * We need free space at the beginning of each raid disk for forward + * and at the end for backward reshapes which userspace has to provide + * via remapping/reordering of space. + */ +static int rs_adjust_data_offsets(struct raid_set *rs) +{ + sector_t data_offset = 0, new_data_offset = 0; + struct md_rdev *rdev; + + /* Constructor did not request data offset change */ + if (!test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) { + if (!rs_is_reshapable(rs)) + goto out; + + return 0; + } + + /* HM FIXME: get In_Sync raid_dev? */ + rdev = &rs->dev[0].rdev; + + if (rs->delta_disks < 0) { + /* + * Removing disks (reshaping backwards): + * + * - before reshape: data is at offset 0 and free space + * is at end of each component LV + * + * - after reshape: data is at offset rs->data_offset != 0 on each component LV + */ + data_offset = 0; + new_data_offset = rs->data_offset; + + } else if (rs->delta_disks > 0) { + /* + * Adding disks (reshaping forwards): + * + * - before reshape: data is at offset rs->data_offset != 0 and + * free space is at begin of each component LV + * + * - after reshape: data is at offset 0 on each component LV + */ + data_offset = rs->data_offset; + new_data_offset = 0; + + } else { + /* + * User space passes in 0 for data offset after having removed reshape space + * + * - or - (data offset != 0) + * + * Changing RAID layout or chunk size -> toggle offsets + * + * - before reshape: data is at offset rs->data_offset 0 and + * free space is at end of each component LV + * -or- + * data is at offset rs->data_offset != 0 and + * free space is at begin of each component LV + * + * - after reshape: data is at offset 0 if it was at offset != 0 + * or at offset != 0 if it was at offset 0 + * on each component LV + * + */ + data_offset = rs->data_offset ? rdev->data_offset : 0; + new_data_offset = data_offset ? 0 : rs->data_offset; + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } + + /* + * Make sure we got a minimum amount of free sectors per device + */ + if (rs->data_offset && + bdev_nr_sectors(rdev->bdev) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { + rs->ti->error = data_offset ? "No space for forward reshape" : + "No space for backward reshape"; + return -ENOSPC; + } +out: + /* + * Raise recovery_cp in case data_offset != 0 to + * avoid false recovery positives in the constructor. + */ + if (rs->md.recovery_cp < rs->md.dev_sectors) + rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ + rdev_for_each(rdev, &rs->md) { + if (!test_bit(Journal, &rdev->flags)) { + rdev->data_offset = data_offset; + rdev->new_data_offset = new_data_offset; + } + } + + return 0; +} + +/* Userpace reordered disks -> adjust raid_disk indexes in @rs */ +static void __reorder_raid_disk_indexes(struct raid_set *rs) +{ + int i = 0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) { + if (!test_bit(Journal, &rdev->flags)) { + rdev->raid_disk = i++; + rdev->saved_raid_disk = rdev->new_raid_disk = -1; + } + } +} + +/* + * Setup @rs for takeover by a different raid level + */ +static int rs_setup_takeover(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + struct md_rdev *rdev; + unsigned int d = mddev->raid_disks = rs->raid_disks; + sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset; + + if (rt_is_raid10(rs->raid_type)) { + if (rs_is_raid0(rs)) { + /* Userpace reordered disks -> adjust raid_disk indexes */ + __reorder_raid_disk_indexes(rs); + + /* raid0 -> raid10_far layout */ + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, + rs->raid10_copies); + } else if (rs_is_raid1(rs)) + /* raid1 -> raid10_near layout */ + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, + rs->raid_disks); + else + return -EINVAL; + + } + + clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + mddev->recovery_cp = MaxSector; + + while (d--) { + rdev = &rs->dev[d].rdev; + + if (test_bit(d, (void *) rs->rebuild_disks)) { + clear_bit(In_sync, &rdev->flags); + clear_bit(Faulty, &rdev->flags); + mddev->recovery_cp = rdev->recovery_offset = 0; + /* Bitmap has to be created when we do an "up" takeover */ + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } + + rdev->new_data_offset = new_data_offset; + } + + return 0; +} + +/* Prepare @rs for reshape */ +static int rs_prepare_reshape(struct raid_set *rs) +{ + bool reshape; + struct mddev *mddev = &rs->md; + + if (rs_is_raid10(rs)) { + if (rs->raid_disks != mddev->raid_disks && + __is_raid10_near(mddev->layout) && + rs->raid10_copies && + rs->raid10_copies != __raid10_near_copies(mddev->layout)) { + /* + * raid disk have to be multiple of data copies to allow this conversion, + * + * This is actually not a reshape it is a + * rebuild of any additional mirrors per group + */ + if (rs->raid_disks % rs->raid10_copies) { + rs->ti->error = "Can't reshape raid10 mirror groups"; + return -EINVAL; + } + + /* Userpace reordered disks to add/remove mirrors -> adjust raid_disk indexes */ + __reorder_raid_disk_indexes(rs); + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, + rs->raid10_copies); + mddev->new_layout = mddev->layout; + reshape = false; + } else + reshape = true; + + } else if (rs_is_raid456(rs)) + reshape = true; + + else if (rs_is_raid1(rs)) { + if (rs->delta_disks) { + /* Process raid1 via delta_disks */ + mddev->degraded = rs->delta_disks < 0 ? -rs->delta_disks : rs->delta_disks; + reshape = true; + } else { + /* Process raid1 without delta_disks */ + mddev->raid_disks = rs->raid_disks; + reshape = false; + } + } else { + rs->ti->error = "Called with bogus raid type"; + return -EINVAL; + } + + if (reshape) { + set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags); + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } else if (mddev->raid_disks < rs->raid_disks) + /* Create new superblocks and bitmaps, if any new disks */ + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + + return 0; +} + +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + +/* + * Reshape: + * - change raid layout + * - change chunk size + * - add disks + * - remove disks + */ +static int rs_setup_reshape(struct raid_set *rs) +{ + int r = 0; + unsigned int cur_raid_devs, d; + sector_t reshape_sectors = _get_reshape_sectors(rs); + struct mddev *mddev = &rs->md; + struct md_rdev *rdev; + + mddev->delta_disks = rs->delta_disks; + cur_raid_devs = mddev->raid_disks; + + /* Ignore impossible layout change whilst adding/removing disks */ + if (mddev->delta_disks && + mddev->layout != mddev->new_layout) { + DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks); + mddev->new_layout = mddev->layout; + } + + /* + * Adjust array size: + * + * - in case of adding disk(s), array size has + * to grow after the disk adding reshape, + * which'll hapen in the event handler; + * reshape will happen forward, so space has to + * be available at the beginning of each disk + * + * - in case of removing disk(s), array size + * has to shrink before starting the reshape, + * which'll happen here; + * reshape will happen backward, so space has to + * be available at the end of each disk + * + * - data_offset and new_data_offset are + * adjusted for aforementioned out of place + * reshaping based on userspace passing in + * the "data_offset " key/value + * pair via the constructor + */ + + /* Add disk(s) */ + if (rs->delta_disks > 0) { + /* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */ + for (d = cur_raid_devs; d < rs->raid_disks; d++) { + rdev = &rs->dev[d].rdev; + clear_bit(In_sync, &rdev->flags); + + /* + * save_raid_disk needs to be -1, or recovery_offset will be set to 0 + * by md, which'll store that erroneously in the superblock on reshape + */ + rdev->saved_raid_disk = -1; + rdev->raid_disk = d; + + rdev->sectors = mddev->dev_sectors; + rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector; + } + + mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */ + + /* Remove disk(s) */ + } else if (rs->delta_disks < 0) { + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true); + mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ + + /* Change layout and/or chunk size */ + } else { + /* + * Reshape layout (e.g. raid5_ls -> raid5_n) and/or chunk size: + * + * keeping number of disks and do layout change -> + * + * toggle reshape_backward depending on data_offset: + * + * - free space upfront -> reshape forward + * + * - free space at the end -> reshape backward + * + * + * This utilizes free reshape space avoiding the need + * for userspace to move (parts of) LV segments in + * case of layout/chunksize change (for disk + * adding/removing reshape space has to be at + * the proper address (see above with delta_disks): + * + * add disk(s) -> begin + * remove disk(s)-> end + */ + mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; + } + + /* + * Adjust device size for forward reshape + * because md_finish_reshape() reduces it. + */ + if (!mddev->reshape_backwards) + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors += reshape_sectors; + + return r; +} + +/* + * If the md resync thread has updated superblock with max reshape position + * at the end of a reshape but not (yet) reset the layout configuration + * changes -> reset the latter. + */ +static void rs_reset_inconclusive_reshape(struct raid_set *rs) +{ + if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) { + rs_set_cur(rs); + rs->md.delta_disks = 0; + rs->md.reshape_backwards = 0; + } +} + +/* + * Enable/disable discard support on RAID set depending on + * RAID level and discard properties of underlying RAID members. + */ +static void configure_discard_support(struct raid_set *rs) +{ + int i; + bool raid456; + struct dm_target *ti = rs->ti; + + /* + * XXX: RAID level 4,5,6 require zeroing for safety. + */ + raid456 = rs_is_raid456(rs); + + for (i = 0; i < rs->raid_disks; i++) { + if (!rs->dev[i].rdev.bdev || + !bdev_max_discard_sectors(rs->dev[i].rdev.bdev)) + return; + + if (raid456) { + if (!devices_handle_discard_safely) { + DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty."); + DMERR("Set dm-raid.devices_handle_discard_safely=Y to override."); + return; + } + } + } + + ti->num_discard_bios = 1; +} + +/* + * Construct a RAID0/1/10/4/5/6 mapping: + * Args: + * <#raid_params> {0,} \ + * <#raid_devs> [ ]{1,} + * + * varies by . See 'parse_raid_params' for + * details on possible . + * + * Userspace is free to initialize the metadata devices, hence the superblocks to + * enforce recreation based on the passed in table parameters. + * + */ +static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + bool resize = false; + struct raid_type *rt; + unsigned int num_raid_params, num_raid_devs; + sector_t sb_array_sectors, rdev_sectors, reshape_sectors; + struct raid_set *rs = NULL; + const char *arg; + struct rs_layout rs_layout; + struct dm_arg_set as = { argc, argv }, as_nrd; + struct dm_arg _args[] = { + { 0, as.argc, "Cannot understand number of raid parameters" }, + { 1, 254, "Cannot understand number of raid devices parameters" } + }; + + arg = dm_shift_arg(&as); + if (!arg) { + ti->error = "No arguments"; + return -EINVAL; + } + + rt = get_raid_type(arg); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + + /* Must have <#raid_params> */ + if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error)) + return -EINVAL; + + /* number of raid device tupples */ + as_nrd = as; + dm_consume_args(&as_nrd, num_raid_params); + _args[1].max = (as_nrd.argc - 1) / 2; + if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error)) + return -EINVAL; + + if (!__within_range(num_raid_devs, 1, MAX_RAID_DEVICES)) { + ti->error = "Invalid number of supplied raid devices"; + return -EINVAL; + } + + rs = raid_set_alloc(ti, rt, num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + r = parse_raid_params(rs, &as, num_raid_params); + if (r) + goto bad; + + r = parse_dev_params(rs, &as); + if (r) + goto bad; + + rs->md.sync_super = super_sync; + + /* + * Calculate ctr requested array and device sizes to allow + * for superblock analysis needing device sizes defined. + * + * Any existing superblock will overwrite the array and device sizes + */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */ + rs->array_sectors = rs->md.array_sectors; + rs->dev_sectors = rs->md.dev_sectors; + + /* + * Backup any new raid set level, layout, ... + * requested to be able to compare to superblock + * members for conversion decisions. + */ + rs_config_backup(rs, &rs_layout); + + r = analyse_superblocks(ti, rs); + if (r) + goto bad; + + /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */ + sb_array_sectors = rs->md.array_sectors; + rdev_sectors = __rdev_sectors(rs); + if (!rdev_sectors) { + ti->error = "Invalid rdev size"; + r = -EINVAL; + goto bad; + } + + + reshape_sectors = _get_reshape_sectors(rs); + if (rs->dev_sectors != rdev_sectors) { + resize = (rs->dev_sectors != rdev_sectors - reshape_sectors); + if (rs->dev_sectors > rdev_sectors - reshape_sectors) + set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + } + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->private = rs; + ti->num_flush_bios = 1; + ti->needs_bio_set_dev = true; + + /* Restore any requested new layout for conversion decision */ + rs_config_restore(rs, &rs_layout); + + /* + * Now that we have any superblock metadata available, + * check for new, recovering, reshaping, to be taken over, + * to be reshaped or an existing, unchanged raid set to + * run in sequence. + */ + if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) { + /* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */ + if (rs_is_raid6(rs) && + test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) { + ti->error = "'nosync' not allowed for new raid6 set"; + r = -EINVAL; + goto bad; + } + rs_setup_recovery(rs, 0); + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + rs_set_new(rs); + } else if (rs_is_recovering(rs)) { + /* A recovering raid set may be resized */ + goto size_check; + } else if (rs_is_reshaping(rs)) { + /* Have to reject size change request during reshape */ + if (resize) { + ti->error = "Can't resize a reshaping raid set"; + r = -EPERM; + goto bad; + } + /* skip setup rs */ + } else if (rs_takeover_requested(rs)) { + if (rs_is_reshaping(rs)) { + ti->error = "Can't takeover a reshaping raid set"; + r = -EPERM; + goto bad; + } + + /* We can't takeover a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't takeover a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + + /* + * If a takeover is needed, userspace sets any additional + * devices to rebuild and we can check for a valid request here. + * + * If acceptible, set the level to the new requested + * one, prohibit requesting recovery, allow the raid + * set to run and store superblocks during resume. + */ + r = rs_check_takeover(rs); + if (r) + goto bad; + + r = rs_setup_takeover(rs); + if (r) + goto bad; + + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + /* Takeover ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + rs_set_new(rs); + } else if (rs_reshape_requested(rs)) { + /* Only request grow on raid set size extensions, not on reshapes. */ + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + + /* + * No need to check for 'ongoing' takeover here, because takeover + * is an instant operation as oposed to an ongoing reshape. + */ + + /* We can't reshape a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't reshape a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + + /* Out-of-place space has to be available to allow for a reshape unless raid1! */ + if (reshape_sectors || rs_is_raid1(rs)) { + /* + * We can only prepare for a reshape here, because the + * raid set needs to run to provide the repective reshape + * check functions via its MD personality instance. + * + * So do the reshape check after md_run() succeeded. + */ + r = rs_prepare_reshape(rs); + if (r) + goto bad; + + /* Reshaping ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + } + rs_set_cur(rs); + } else { +size_check: + /* May not set recovery when a device rebuild is requested */ + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + rs_setup_recovery(rs, MaxSector); + } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + /* + * Set raid set to current size, i.e. size as of + * superblocks to grow to larger size in preresume. + */ + r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); + if (r) + goto bad; + + rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + } else { + /* This is no size change or it is shrinking, update size and record in superblocks */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + if (sb_array_sectors > rs->array_sectors) + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } + rs_set_cur(rs); + } + + /* If constructor requested it, change data and new_data offsets */ + r = rs_adjust_data_offsets(rs); + if (r) + goto bad; + + /* Catch any inconclusive reshape superblock content. */ + rs_reset_inconclusive_reshape(rs); + + /* Start raid set read-only and assumed clean to change in raid_resume() */ + rs->md.ro = 1; + rs->md.in_sync = 1; + + /* Keep array frozen until resume. */ + set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); + + /* Has to be held on running the array */ + mddev_lock_nointr(&rs->md); + r = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + if (r) { + ti->error = "Failed to run raid array"; + mddev_unlock(&rs->md); + goto bad; + } + + r = md_start(&rs->md); + if (r) { + ti->error = "Failed to start raid array"; + goto bad_unlock; + } + + /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */ + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); + if (r) { + ti->error = "Failed to set raid4/5/6 journal mode"; + goto bad_unlock; + } + } + + mddev_suspend(&rs->md); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); + + /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ + if (rs_is_raid456(rs)) { + r = rs_set_raid456_stripe_cache(rs); + if (r) + goto bad_unlock; + } + + /* Now do an early reshape check */ + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + r = rs_check_reshape(rs); + if (r) + goto bad_unlock; + + /* Restore new, ctr requested layout to perform check */ + rs_config_restore(rs, &rs_layout); + + if (rs->md.pers->start_reshape) { + r = rs->md.pers->check_reshape(&rs->md); + if (r) { + ti->error = "Reshape check failed"; + goto bad_unlock; + } + } + } + + /* Disable/enable discard support on raid set. */ + configure_discard_support(rs); + + mddev_unlock(&rs->md); + return 0; + +bad_unlock: + md_stop(&rs->md); + mddev_unlock(&rs->md); +bad: + raid_set_free(rs); + + return r; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_lock_nointr(&rs->md); + md_stop(&rs->md); + mddev_unlock(&rs->md); + raid_set_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + /* + * If we're reshaping to add disk(s)), ti->len and + * mddev->array_sectors will differ during the process + * (ti->len > mddev->array_sectors), so we have to requeue + * bios with addresses > mddev->array_sectors here or + * there will occur accesses past EOD of the component + * data images thus erroring the raid set. + */ + if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) + return DM_MAPIO_REQUEUE; + + md_handle_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +/* Return sync state string for @state */ +enum sync_state { st_frozen, st_reshape, st_resync, st_check, st_repair, st_recover, st_idle }; +static const char *sync_str(enum sync_state state) +{ + /* Has to be in above sync_state order! */ + static const char *sync_strs[] = { + "frozen", + "reshape", + "resync", + "check", + "repair", + "recover", + "idle" + }; + + return __within_range(state, 0, ARRAY_SIZE(sync_strs) - 1) ? sync_strs[state] : "undef"; +}; + +/* Return enum sync_state for @mddev derived from @recovery flags */ +static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery) +{ + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) + return st_frozen; + + /* The MD sync thread can be done with io or be interrupted but still be running */ + if (!test_bit(MD_RECOVERY_DONE, &recovery) && + (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) + return st_reshape; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return st_resync; + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return st_check; + return st_repair; + } + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return st_recover; + + if (mddev->reshape_position != MaxSector) + return st_reshape; + } + + return st_idle; +} + +/* + * Return status string for @rdev + * + * Status characters: + * + * 'D' = Dead/Failed raid set component or raid4/5/6 journal device + * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device + * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device + * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) + */ +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) +{ + if (!rdev->bdev) + return "-"; + else if (test_bit(Faulty, &rdev->flags)) + return "D"; + else if (test_bit(Journal, &rdev->flags)) + return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; + else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) || + (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags))) + return "a"; + else + return "A"; +} + +/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ +static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, + enum sync_state state, sector_t resync_max_sectors) +{ + sector_t r; + struct mddev *mddev = &rs->md; + + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + if (rs_is_raid0(rs)) { + r = resync_max_sectors; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else { + if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) + r = mddev->recovery_cp; + else + r = mddev->curr_resync_completed; + + if (state == st_idle && r >= resync_max_sectors) { + /* + * Sync complete. + */ + /* In case we have finished recovering, the array is in sync. */ + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else if (state == st_recover) + /* + * In case we are recovering, the array is not in sync + * and health chars should show the recovering legs. + * + * Already retrieved recovery offset from curr_resync_completed above. + */ + ; + + else if (state == st_resync || state == st_reshape) + /* + * If "resync/reshape" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + else if (state == st_check || state == st_repair) + /* + * If "check" or "repair" is occurring, the raid set has + * undergone an initial sync and the health characters + * should not be 'a' anymore. + */ + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + else if (test_bit(MD_RECOVERY_NEEDED, &recovery)) + /* + * We are idle and recovery is needed, prevent 'A' chars race + * caused by components still set to in-sync by constructor. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + else { + /* + * We are idle and the raid set may be doing an initial + * sync, or it may be rebuilding individual components. + * If all the devices are In_sync, then it is the raid set + * that is being initialized. + */ + struct md_rdev *rdev; + + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + rdev_for_each(rdev, mddev) + if (!test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) { + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + break; + } + } + } + + return min(r, resync_max_sectors); +} + +/* Helper to return @dev name or "-" if !@dev */ +static const char *__get_dev_name(struct dm_dev *dev) +{ + return dev ? dev->name : "-"; +} + +static void raid_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL; + int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; + unsigned long recovery; + unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned int sz = 0; + unsigned int rebuild_writemostly_count = 0; + sector_t progress, resync_max_sectors, resync_mismatches; + enum sync_state state; + struct raid_type *rt; + + switch (type) { + case STATUSTYPE_INFO: + /* *Should* always succeed */ + rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); + if (!rt) + return; + + DMEMIT("%s %d ", rt->name, mddev->raid_disks); + + /* Access most recent mddev properties for status output */ + smp_rmb(); + /* Get sensible max sectors even if raid set not yet started */ + resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? + mddev->resync_max_sectors : mddev->dev_sectors; + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + progress = rs_get_progress(rs, recovery, state, resync_max_sectors); + resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? + atomic64_read(&mddev->resync_mismatches) : 0; + + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ + for (i = 0; i < rs->raid_disks; i++) + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); + + /* + * In-sync/Reshape ratio: + * The in-sync ratio shows the progress of: + * - Initializing the raid set + * - Rebuilding a subset of devices of the raid set + * The user can distinguish between the two by referring + * to the status characters. + * + * The reshape ratio shows the progress of + * changing the raid layout or the number of + * disks of a raid set + */ + DMEMIT(" %llu/%llu", (unsigned long long) progress, + (unsigned long long) resync_max_sectors); + + /* + * v1.5.0+: + * + * Sync action: + * See Documentation/admin-guide/device-mapper/dm-raid.rst for + * information on each of these states. + */ + DMEMIT(" %s", sync_str(state)); + + /* + * v1.5.0+: + * + * resync_mismatches/mismatch_cnt + * This field shows the number of discrepancies found when + * performing a "check" of the raid set. + */ + DMEMIT(" %llu", (unsigned long long) resync_mismatches); + + /* + * v1.9.0+: + * + * data_offset (needed for out of space reshaping) + * This field shows the data offset into the data + * image LV where the first stripes data starts. + * + * We keep data_offset equal on all raid disks of the set, + * so retrieving it from the first raid disk is sufficient. + */ + DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); + + /* + * v1.10.0+: + */ + DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? + __raid_dev_status(rs, &rs->journal_dev.rdev) : "-"); + break; + + case STATUSTYPE_TABLE: + /* Report the table line string you would use to construct this raid set */ + + /* + * Count any rebuild or writemostly argument pairs and subtract the + * hweight count being added below of any rebuild and writemostly ctr flags. + */ + for (i = 0; i < rs->raid_disks; i++) { + rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) + + (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0); + } + rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) + + (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0); + /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */ + raid_param_cnt += rebuild_writemostly_count + + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; + /* Emit table line */ + /* This has to be in the documented order for userspace! */ + DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); + if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) + for (i = 0; i < rs->raid_disks; i++) + if (test_bit(i, (void *) rs->rebuild_disks)) + DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i); + if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), + mddev->bitmap_info.daemon_sleep); + if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), + mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), + mddev->sync_speed_max); + if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags)) + for (i = 0; i < rs->raid_disks; i++) + if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY), + rs->dev[i].rdev.raid_disk); + if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), + mddev->bitmap_info.max_write_behind); + if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), + max_nr_stripes); + if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), + (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); + if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), + raid10_md_layout_to_copies(mddev->layout)); + if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), + raid10_md_layout_to_format(mddev->layout)); + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), + max(rs->delta_disks, mddev->delta_disks)); + if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), + (unsigned long long) rs->data_offset); + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), + __get_dev_name(rs->journal_dev.dev)); + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE), + md_journal_mode_to_dm_raid(rs->journal_dev.mode)); + DMEMIT(" %d", rs->raid_disks); + for (i = 0; i < rs->raid_disks; i++) + DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), + __get_dev_name(rs->dev[i].data_dev)); + break; + + case STATUSTYPE_IMA: + rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); + if (!rt) + return; + + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",raid_type=%s,raid_disks=%d", rt->name, mddev->raid_disks); + + /* Access most recent mddev properties for status output */ + smp_rmb(); + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + DMEMIT(",raid_state=%s", sync_str(state)); + + for (i = 0; i < rs->raid_disks; i++) { + DMEMIT(",raid_device_%d_status=", i); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); + } + + if (rt_is_raid456(rt)) { + DMEMIT(",journal_dev_mode="); + switch (rs->journal_dev.mode) { + case R5C_JOURNAL_MODE_WRITE_THROUGH: + DMEMIT("%s", + _raid456_journal_mode[R5C_JOURNAL_MODE_WRITE_THROUGH].param); + break; + case R5C_JOURNAL_MODE_WRITE_BACK: + DMEMIT("%s", + _raid456_journal_mode[R5C_JOURNAL_MODE_WRITE_BACK].param); + break; + default: + DMEMIT("invalid"); + break; + } + } + DMEMIT(";"); + break; + } +} + +static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (!strcasecmp(argv[0], "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + md_reap_sync_thread(mddev); + } + } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) + return -EBUSY; + else if (!strcasecmp(argv[0], "resync")) + ; /* MD_RECOVERY_NEEDED set below */ + else if (!strcasecmp(argv[0], "recover")) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + else { + if (!strcasecmp(argv[0], "check")) { + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else if (!strcasecmp(argv[0], "repair")) { + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else + return -EINVAL; + } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + if (!mddev->suspended && mddev->sync_thread) + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!mddev->suspended && mddev->thread) + md_wakeup_thread(mddev->thread); + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned int i; + int r = 0; + + for (i = 0; !r && i < rs->raid_disks; i++) { + if (rs->dev[i].data_dev) { + r = fn(ti, rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, data); + } + } + + return r; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); + + blk_limits_io_min(limits, chunk_size_bytes); + blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Writes have to be stopped before suspending to avoid deadlocks. */ + if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) + md_stop_writes(&rs->md); + + mddev_lock_nointr(&rs->md); + mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } +} + +static void attempt_restore_of_faulty_devices(struct raid_set *rs) +{ + int i; + uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS]; + unsigned long flags; + bool cleared = false; + struct dm_raid_superblock *sb; + struct mddev *mddev = &rs->md; + struct md_rdev *r; + + /* RAID personalities have to provide hot add/remove methods or we need to bail out. */ + if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk) + return; + + memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); + + for (i = 0; i < rs->raid_disks; i++) { + r = &rs->dev[i].rdev; + /* HM FIXME: enhance journal device recovery processing */ + if (test_bit(Journal, &r->flags)) + continue; + + if (test_bit(Faulty, &r->flags) && + r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) { + DMINFO("Faulty %s device #%d has readable super block." + " Attempting to revive it.", + rs->raid_type->name, i); + + /* + * Faulty bit may be set, but sometimes the array can + * be suspended before the personalities can respond + * by removing the device from the array (i.e. calling + * 'hot_remove_disk'). If they haven't yet removed + * the failed device, its 'raid_disk' number will be + * '>= 0' - meaning we must call this function + * ourselves. + */ + flags = r->flags; + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */ + if (r->raid_disk >= 0) { + if (mddev->pers->hot_remove_disk(mddev, r)) { + /* Failed to revive this device, try next */ + r->flags = flags; + continue; + } + } else + r->raid_disk = r->saved_raid_disk = i; + + clear_bit(Faulty, &r->flags); + clear_bit(WriteErrorSeen, &r->flags); + + if (mddev->pers->hot_add_disk(mddev, r)) { + /* Failed to revive this device, try next */ + r->raid_disk = r->saved_raid_disk = -1; + r->flags = flags; + } else { + clear_bit(In_sync, &r->flags); + r->recovery_offset = 0; + set_bit(i, (void *) cleared_failed_devices); + cleared = true; + } + } + } + + /* If any failed devices could be cleared, update all sbs failed_devices bits */ + if (cleared) { + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; + + rdev_for_each(r, &rs->md) { + if (test_bit(Journal, &r->flags)) + continue; + + sb = page_address(r->sb_page); + sb_retrieve_failed_devices(sb, failed_devices); + + for (i = 0; i < DISKS_ARRAY_ELEMS; i++) + failed_devices[i] &= ~cleared_failed_devices[i]; + + sb_update_failed_devices(sb, failed_devices); + } + } +} + +static int __load_dirty_region_bitmap(struct raid_set *rs) +{ + int r = 0; + + /* Try loading the bitmap unless "raid0", which does not have one */ + if (!rs_is_raid0(rs) && + !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { + r = md_bitmap_load(&rs->md); + if (r) + DMERR("Failed to load bitmap"); + } + + return r; +} + +/* Enforce updating all superblocks */ +static void rs_update_sbs(struct raid_set *rs) +{ + struct mddev *mddev = &rs->md; + int ro = mddev->ro; + + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + mddev->ro = 0; + md_update_sb(mddev, 1); + mddev->ro = ro; +} + +/* + * Reshape changes raid algorithm of @rs to new one within personality + * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes + * disks from a raid set thus growing/shrinking it or resizes the set + * + * Call mddev_lock_nointr() before! + */ +static int rs_start_reshape(struct raid_set *rs) +{ + int r; + struct mddev *mddev = &rs->md; + struct md_personality *pers = mddev->pers; + + /* Don't allow the sync thread to work until the table gets reloaded. */ + set_bit(MD_RECOVERY_WAIT, &mddev->recovery); + + r = rs_setup_reshape(rs); + if (r) + return r; + + /* + * Check any reshape constraints enforced by the personalility + * + * May as well already kick the reshape off so that * pers->start_reshape() becomes optional. + */ + r = pers->check_reshape(mddev); + if (r) { + rs->ti->error = "pers->check_reshape() failed"; + return r; + } + + /* + * Personality may not provide start reshape method in which + * case check_reshape above has already covered everything + */ + if (pers->start_reshape) { + r = pers->start_reshape(mddev); + if (r) { + rs->ti->error = "pers->start_reshape() failed"; + return r; + } + } + + /* + * Now reshape got set up, update superblocks to + * reflect the fact so that a table reload will + * access proper superblock content in the ctr. + */ + rs_update_sbs(rs); + + return 0; +} + +static int raid_preresume(struct dm_target *ti) +{ + int r; + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + /* This is a resume after a suspend of the set -> it's already started. */ + if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) + return 0; + + /* + * The superblocks need to be updated on disk if the + * array is new or new devices got added (thus zeroed + * out by userspace) or __load_dirty_region_bitmap + * will overwrite them in core with old data or fail. + */ + if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) + rs_update_sbs(rs); + + /* Load the bitmap from disk unless raid0 */ + r = __load_dirty_region_bitmap(rs); + if (r) + return r; + + /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */ + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + mddev->array_sectors = rs->array_sectors; + mddev->dev_sectors = rs->dev_sectors; + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */ + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && + (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) || + (rs->requested_bitmap_chunk_sectors && + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { + int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; + + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); + if (r) + DMERR("Failed to resize bitmap"); + } + + /* Check for any resize/reshape on @rs and adjust/initiate */ + /* Be prepared for mddev_resume() in raid_resume() */ + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + mddev->resync_min = mddev->recovery_cp; + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) + mddev->resync_max_sectors = mddev->dev_sectors; + } + + /* Check for any reshape request unless new raid set */ + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + /* Initiate a reshape. */ + rs_set_rdev_sectors(rs); + mddev_lock_nointr(mddev); + r = rs_start_reshape(rs); + mddev_unlock(mddev); + if (r) + DMWARN("Failed to check/start reshape, continuing without change"); + r = 0; + } + + return r; +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (test_and_set_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) { + /* + * A secondary resume while the device is active. + * Take this opportunity to check whether any failed + * devices are reachable again. + */ + attempt_restore_of_faulty_devices(rs); + } + + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Only reduce raid set size before running a disk removing reshape. */ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + + mddev_lock_nointr(mddev); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + mddev->ro = 0; + mddev->in_sync = 0; + mddev_resume(mddev); + mddev_unlock(mddev); + } +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 15, 1}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .message = raid_message, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .postsuspend = raid_postsuspend, + .preresume = raid_preresume, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + DMINFO("Loading target version %u.%u.%u", + raid_target.version[0], + raid_target.version[1], + raid_target.version[2]); + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); + +MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target"); +MODULE_ALIAS("dm-raid0"); +MODULE_ALIAS("dm-raid1"); +MODULE_ALIAS("dm-raid10"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown "); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c new file mode 100644 index 000000000..c38e63706 --- /dev/null +++ b/drivers/md/dm-raid1.c @@ -0,0 +1,1511 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-bio-record.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "raid1" + +#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ + +#define MAX_NR_MIRRORS (DM_KCOPYD_MAX_REGIONS + 1) + +#define DM_RAID1_HANDLE_ERRORS 0x01 +#define DM_RAID1_KEEP_LOG 0x02 +#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) +#define keep_log(p) ((p)->features & DM_RAID1_KEEP_LOG) + +static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); + +/*----------------------------------------------------------------- + * Mirror set structures. + *---------------------------------------------------------------*/ +enum dm_raid1_error { + DM_RAID1_WRITE_ERROR, + DM_RAID1_FLUSH_ERROR, + DM_RAID1_SYNC_ERROR, + DM_RAID1_READ_ERROR +}; + +struct mirror { + struct mirror_set *ms; + atomic_t error_count; + unsigned long error_type; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + + uint64_t features; + + spinlock_t lock; /* protects the lists */ + struct bio_list reads; + struct bio_list writes; + struct bio_list failures; + struct bio_list holds; /* bios are waiting until suspend */ + + struct dm_region_hash *rh; + struct dm_kcopyd_client *kcopyd_client; + struct dm_io_client *io_client; + + /* recovery */ + region_t nr_regions; + int in_sync; + int log_failure; + int leg_failure; + atomic_t suspend; + + atomic_t default_mirror; /* Default mirror */ + + struct workqueue_struct *kmirrord_wq; + struct work_struct kmirrord_work; + struct timer_list timer; + unsigned long timer_pending; + + struct work_struct trigger_event; + + unsigned int nr_mirrors; + struct mirror mirror[]; +}; + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle, + "A percentage of time allocated for raid resynchronization"); + +static void wakeup_mirrord(void *context) +{ + struct mirror_set *ms = context; + + queue_work(ms->kmirrord_wq, &ms->kmirrord_work); +} + +static void delayed_wake_fn(struct timer_list *t) +{ + struct mirror_set *ms = from_timer(ms, t, timer); + + clear_bit(0, &ms->timer_pending); + wakeup_mirrord(ms); +} + +static void delayed_wake(struct mirror_set *ms) +{ + if (test_and_set_bit(0, &ms->timer_pending)) + return; + + ms->timer.expires = jiffies + HZ / 5; + add_timer(&ms->timer); +} + +static void wakeup_all_recovery_waiters(void *context) +{ + wake_up_all(&_kmirrord_recovery_stopped); +} + +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) +{ + unsigned long flags; + int should_wake = 0; + struct bio_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock_irqsave(&ms->lock, flags); + should_wake = !(bl->head); + bio_list_add(bl, bio); + spin_unlock_irqrestore(&ms->lock, flags); + + if (should_wake) + wakeup_mirrord(ms); +} + +static void dispatch_bios(void *context, struct bio_list *bio_list) +{ + struct mirror_set *ms = context; + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + queue_bio(ms, bio, WRITE); +} + +struct dm_raid1_bio_record { + struct mirror *m; + /* if details->bi_bdev == NULL, details were not saved */ + struct dm_bio_details details; + region_t write_region; +}; + +/* + * Every mirror should look like this one. + */ +#define DEFAULT_MIRROR 0 + +/* + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh + * doesn't get submitted to the lower levels of block layer. + */ +static struct mirror *bio_get_m(struct bio *bio) +{ + return (struct mirror *) bio->bi_next; +} + +static void bio_set_m(struct bio *bio, struct mirror *m) +{ + bio->bi_next = (struct bio *) m; +} + +static struct mirror *get_default_mirror(struct mirror_set *ms) +{ + return &ms->mirror[atomic_read(&ms->default_mirror)]; +} + +static void set_default_mirror(struct mirror *m) +{ + struct mirror_set *ms = m->ms; + struct mirror *m0 = &(ms->mirror[0]); + + atomic_set(&ms->default_mirror, m - m0); +} + +static struct mirror *get_valid_mirror(struct mirror_set *ms) +{ + struct mirror *m; + + for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) + if (!atomic_read(&m->error_count)) + return m; + + return NULL; +} + +/* fail_mirror + * @m: mirror device to fail + * @error_type: one of the enum's, DM_RAID1_*_ERROR + * + * If errors are being handled, record the type of + * error encountered for this device. If this type + * of error has already been recorded, we can return; + * otherwise, we must signal userspace by triggering + * an event. Additionally, if the device is the + * primary device, we must choose a new primary, but + * only if the mirror is in-sync. + * + * This function must not block. + */ +static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) +{ + struct mirror_set *ms = m->ms; + struct mirror *new; + + ms->leg_failure = 1; + + /* + * error_count is used for nothing more than a + * simple way to tell if a device has encountered + * errors. + */ + atomic_inc(&m->error_count); + + if (test_and_set_bit(error_type, &m->error_type)) + return; + + if (!errors_handled(ms)) + return; + + if (m != get_default_mirror(ms)) + goto out; + + if (!ms->in_sync && !keep_log(ms)) { + /* + * Better to issue requests to same failing device + * than to risk returning corrupt data. + */ + DMERR("Primary mirror (%s) failed while out-of-sync: Reads may fail.", + m->dev->name); + goto out; + } + + new = get_valid_mirror(ms); + if (new) + set_default_mirror(new); + else + DMWARN("All sides of mirror have failed."); + +out: + schedule_work(&ms->trigger_event); +} + +static int mirror_flush(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + unsigned long error_bits; + + unsigned int i; + struct dm_io_region io[MAX_NR_MIRRORS]; + struct mirror *m; + struct dm_io_request io_req = { + .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = ms->io_client, + }; + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { + io[i].bdev = m->dev->bdev; + io[i].sector = 0; + io[i].count = 0; + } + + error_bits = -1; + dm_io(&io_req, ms->nr_mirrors, io, &error_bits); + if (unlikely(error_bits != 0)) { + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error_bits)) + fail_mirror(ms->mirror + i, + DM_RAID1_FLUSH_ERROR); + return -EIO; + } + + return 0; +} + +/*----------------------------------------------------------------- + * Recovery. + * + * When a mirror is first activated we may find that some regions + * are in the no-sync state. We have to recover these by + * recopying from the default mirror to all the others. + *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned long write_err, + void *context) +{ + struct dm_region *reg = context; + struct mirror_set *ms = dm_rh_region_context(reg); + int m, bit = 0; + + if (read_err) { + /* Read error means the failure of default mirror. */ + DMERR_LIMIT("Unable to read primary mirror during recovery"); + fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); + } + + if (write_err) { + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", + write_err); + /* + * Bits correspond to devices (excluding default mirror). + * The default mirror cannot change during recovery. + */ + for (m = 0; m < ms->nr_mirrors; m++) { + if (&ms->mirror[m] == get_default_mirror(ms)) + continue; + if (test_bit(bit, &write_err)) + fail_mirror(ms->mirror + m, + DM_RAID1_SYNC_ERROR); + bit++; + } + } + + dm_rh_recovery_end(reg, !(read_err || write_err)); +} + +static void recover(struct mirror_set *ms, struct dm_region *reg) +{ + unsigned int i; + struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; + struct mirror *m; + unsigned long flags = 0; + region_t key = dm_rh_get_region_key(reg); + sector_t region_size = dm_rh_get_region_size(ms->rh); + + /* fill in the source */ + m = get_default_mirror(ms); + from.bdev = m->dev->bdev; + from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + if (key == (ms->nr_regions - 1)) { + /* + * The final region may be smaller than + * region_size. + */ + from.count = ms->ti->len & (region_size - 1); + if (!from.count) + from.count = region_size; + } else + from.count = region_size; + + /* fill in the destinations */ + for (i = 0, dest = to; i < ms->nr_mirrors; i++) { + if (&ms->mirror[i] == get_default_mirror(ms)) + continue; + + m = ms->mirror + i; + dest->bdev = m->dev->bdev; + dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + dest->count = from.count; + dest++; + } + + /* hand to kcopyd */ + if (!errors_handled(ms)) + flags |= BIT(DM_KCOPYD_IGNORE_ERROR); + + dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, + flags, recovery_complete, reg); +} + +static void reset_ms_flags(struct mirror_set *ms) +{ + unsigned int m; + + ms->leg_failure = 0; + for (m = 0; m < ms->nr_mirrors; m++) { + atomic_set(&(ms->mirror[m].error_count), 0); + ms->mirror[m].error_type = 0; + } +} + +static void do_recovery(struct mirror_set *ms) +{ + struct dm_region *reg; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + /* + * Start quiescing some regions. + */ + dm_rh_recovery_prepare(ms->rh); + + /* + * Copy any already quiesced regions. + */ + while ((reg = dm_rh_recovery_start(ms->rh))) + recover(ms, reg); + + /* + * Update the in sync flag. + */ + if (!ms->in_sync && + (log->type->get_sync_count(log) == ms->nr_regions)) { + /* the sync is complete */ + dm_table_event(ms->ti->table); + ms->in_sync = 1; + reset_ms_flags(ms); + } +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ + struct mirror *m = get_default_mirror(ms); + + do { + if (likely(!atomic_read(&m->error_count))) + return m; + + if (m-- == ms->mirror) + m += ms->nr_mirrors; + } while (m != get_default_mirror(ms)); + + return NULL; +} + +static int default_ok(struct mirror *m) +{ + struct mirror *default_mirror = get_default_mirror(m->ms); + + return !atomic_read(&default_mirror->error_count); +} + +static int mirror_available(struct mirror_set *ms, struct bio *bio) +{ + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->in_sync(log, region, 0)) + return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0; + + return 0; +} + +/* + * remap a buffer to a particular mirror. + */ +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + if (unlikely(!bio->bi_iter.bi_size)) + return 0; + return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector); +} + +static void map_bio(struct mirror *m, struct bio *bio) +{ + bio_set_dev(bio, m->dev->bdev); + bio->bi_iter.bi_sector = map_sector(m, bio); +} + +static void map_region(struct dm_io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio_sectors(bio); +} + +static void hold_bio(struct mirror_set *ms, struct bio *bio) +{ + /* + * Lock is required to avoid race condition during suspend + * process. + */ + spin_lock_irq(&ms->lock); + + if (atomic_read(&ms->suspend)) { + spin_unlock_irq(&ms->lock); + + /* + * If device is suspended, complete the bio. + */ + if (dm_noflush_suspending(ms->ti)) + bio->bi_status = BLK_STS_DM_REQUEUE; + else + bio->bi_status = BLK_STS_IOERR; + + bio_endio(bio); + return; + } + + /* + * Hold bio until the suspend is complete. + */ + bio_list_add(&ms->holds, bio); + spin_unlock_irq(&ms->lock); +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (likely(!error)) { + bio_endio(bio); + return; + } + + fail_mirror(m, DM_RAID1_READ_ERROR); + + if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { + DMWARN_LIMIT("Read failure on mirror device %s. Trying alternative device.", + m->dev->name); + queue_bio(m->ms, bio, bio_data_dir(bio)); + return; + } + + DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", + m->dev->name); + bio_io_error(bio); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct dm_io_region io; + struct dm_io_request io_req = { + .bi_opf = REQ_OP_READ, + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, + .notify.fn = read_callback, + .notify.context = bio, + .client = m->ms->io_client, + }; + + map_region(&io, m, bio); + bio_set_m(bio, m); + BUG_ON(dm_io(&io_req, 1, &io, NULL)); +} + +static inline int region_in_sync(struct mirror_set *ms, region_t region, + int may_block) +{ + int state = dm_rh_get_state(ms->rh, region, may_block); + return state == DM_RH_CLEAN || state == DM_RH_DIRTY; +} + +static void do_reads(struct mirror_set *ms, struct bio_list *reads) +{ + region_t region; + struct bio *bio; + struct mirror *m; + + while ((bio = bio_list_pop(reads))) { + region = dm_rh_bio_to_region(ms->rh, bio); + m = get_default_mirror(ms); + + /* + * We can only read balance if the region is in sync. + */ + if (likely(region_in_sync(ms, region, 1))) + m = choose_mirror(ms, bio->bi_iter.bi_sector); + else if (m && atomic_read(&m->error_count)) + m = NULL; + + if (likely(m)) + read_async_bio(m, bio); + else + bio_io_error(bio); + } +} + +/*----------------------------------------------------------------- + * Writes. + * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: increment pending, use kcopyd to write to *all* mirrors + * RECOVERING: delay the io until recovery completes + * NOSYNC: increment pending, just write to the default mirror + *---------------------------------------------------------------*/ + + +static void write_callback(unsigned long error, void *context) +{ + unsigned int i; + struct bio *bio = (struct bio *) context; + struct mirror_set *ms; + int should_wake = 0; + unsigned long flags; + + ms = bio_get_m(bio)->ms; + bio_set_m(bio, NULL); + + /* + * NOTE: We don't decrement the pending count here, + * instead it is done by the targets endio function. + * This way we handle both writes to SYNC and NOSYNC + * regions with the same code. + */ + if (likely(!error)) { + bio_endio(bio); + return; + } + + /* + * If the bio is discard, return an error, but do not + * degrade the array. + */ + if (bio_op(bio) == REQ_OP_DISCARD) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return; + } + + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); + + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. + */ + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wakeup_mirrord(ms); +} + +static void do_write(struct mirror_set *ms, struct bio *bio) +{ + unsigned int i; + struct dm_io_region io[MAX_NR_MIRRORS], *dest = io; + struct mirror *m; + blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH); + struct dm_io_request io_req = { + .bi_opf = REQ_OP_WRITE | op_flags, + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, + .notify.fn = write_callback, + .notify.context = bio, + .client = ms->io_client, + }; + + if (bio_op(bio) == REQ_OP_DISCARD) { + io_req.bi_opf = REQ_OP_DISCARD | op_flags; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); + + /* + * Use default mirror because we only need it to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, get_default_mirror(ms)); + + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); +} + +static void do_writes(struct mirror_set *ms, struct bio_list *writes) +{ + int state; + struct bio *bio; + struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; + + if (!writes->head) + return; + + /* + * Classify each write. + */ + bio_list_init(&sync); + bio_list_init(&nosync); + bio_list_init(&recover); + bio_list_init(&requeue); + + while ((bio = bio_list_pop(writes))) { + if ((bio->bi_opf & REQ_PREFLUSH) || + (bio_op(bio) == REQ_OP_DISCARD)) { + bio_list_add(&sync, bio); + continue; + } + + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); + switch (state) { + case DM_RH_CLEAN: + case DM_RH_DIRTY: + this_list = &sync; + break; + + case DM_RH_NOSYNC: + this_list = &nosync; + break; + + case DM_RH_RECOVERING: + this_list = &recover; + break; + } + + bio_list_add(this_list, bio); + } + + /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + delayed_wake(ms); + } + + /* + * Increment the pending counts for any regions that will + * be written to (writes to recover regions are going to + * be delayed). + */ + dm_rh_inc_pending(ms->rh, &sync); + dm_rh_inc_pending(ms->rh, &nosync); + + /* + * If the flush fails on a previous call and succeeds here, + * we must not reset the log_failure variable. We need + * userspace interaction to do that. + */ + ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; + + /* + * Dispatch io. + */ + if (unlikely(ms->log_failure) && errors_handled(ms)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, &sync); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else + while ((bio = bio_list_pop(&sync))) + do_write(ms, bio); + + while ((bio = bio_list_pop(&recover))) + dm_rh_delay(ms->rh, bio); + + while ((bio = bio_list_pop(&nosync))) { + if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) { + spin_lock_irq(&ms->lock); + bio_list_add(&ms->failures, bio); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else { + map_bio(get_default_mirror(ms), bio); + submit_bio_noacct(bio); + } + } +} + +static void do_failures(struct mirror_set *ms, struct bio_list *failures) +{ + struct bio *bio; + + if (likely(!failures->head)) + return; + + /* + * If the log has failed, unattempted writes are being + * put on the holds list. We can't issue those writes + * until a log has been marked, so we must store them. + * + * If a 'noflush' suspend is in progress, we can requeue + * the I/O's to the core. This give userspace a chance + * to reconfigure the mirror, at which point the core + * will reissue the writes. If the 'noflush' flag is + * not set, we have no choice but to return errors. + * + * Some writes on the failures list may have been + * submitted before the log failure and represent a + * failure to write to one of the devices. It is ok + * for us to treat them the same and requeue them + * as well. + */ + while ((bio = bio_list_pop(failures))) { + if (!ms->log_failure) { + ms->in_sync = 0; + dm_rh_mark_nosync(ms->rh, bio); + } + + /* + * If all the legs are dead, fail the I/O. + * If the device has failed and keep_log is enabled, + * fail the I/O. + * + * If we have been told to handle errors, and keep_log + * isn't enabled, hold the bio and wait for userspace to + * deal with the problem. + * + * Otherwise pretend that the I/O succeeded. (This would + * be wrong if the failed leg returned after reboot and + * got replicated back to the good legs.) + */ + if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure))) + bio_io_error(bio); + else if (errors_handled(ms) && !keep_log(ms)) + hold_bio(ms, bio); + else + bio_endio(bio); + } +} + +static void trigger_event(struct work_struct *work) +{ + struct mirror_set *ms = + container_of(work, struct mirror_set, trigger_event); + + dm_table_event(ms->ti->table); +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static void do_mirror(struct work_struct *work) +{ + struct mirror_set *ms = container_of(work, struct mirror_set, + kmirrord_work); + struct bio_list reads, writes, failures; + unsigned long flags; + + spin_lock_irqsave(&ms->lock, flags); + reads = ms->reads; + writes = ms->writes; + failures = ms->failures; + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + spin_unlock_irqrestore(&ms->lock, flags); + + dm_rh_update_states(ms->rh, errors_handled(ms)); + do_recovery(ms); + do_reads(ms, &reads); + do_writes(ms, &writes); + do_failures(ms, &failures); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, + uint32_t region_size, + struct dm_target *ti, + struct dm_dirty_log *dl) +{ + struct mirror_set *ms = + kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL); + + if (!ms) { + ti->error = "Cannot allocate mirror context"; + return NULL; + } + + spin_lock_init(&ms->lock); + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + bio_list_init(&ms->holds); + + ms->ti = ti; + ms->nr_mirrors = nr_mirrors; + ms->nr_regions = dm_sector_div_up(ti->len, region_size); + ms->in_sync = 0; + ms->log_failure = 0; + ms->leg_failure = 0; + atomic_set(&ms->suspend, 0); + atomic_set(&ms->default_mirror, DEFAULT_MIRROR); + + ms->io_client = dm_io_client_create(); + if (IS_ERR(ms->io_client)) { + ti->error = "Error creating dm_io client"; + kfree(ms); + return NULL; + } + + ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, + wakeup_all_recovery_waiters, + ms->ti->begin, MAX_RECOVERY, + dl, region_size, ms->nr_regions); + if (IS_ERR(ms->rh)) { + ti->error = "Error creating dirty region hash"; + dm_io_client_destroy(ms->io_client); + kfree(ms); + return NULL; + } + + return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, + unsigned int m) +{ + while (m--) + dm_put_device(ti, ms->mirror[m].dev); + + dm_io_client_destroy(ms->io_client); + dm_region_hash_destroy(ms->rh); + kfree(ms); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, + unsigned int mirror, char **argv) +{ + unsigned long long offset; + char dummy; + int ret; + + if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1 || + offset != (sector_t)offset) { + ti->error = "Invalid offset"; + return -EINVAL; + } + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev); + if (ret) { + ti->error = "Device lookup failure"; + return ret; + } + + ms->mirror[mirror].ms = ms; + atomic_set(&(ms->mirror[mirror].error_count), 0); + ms->mirror[mirror].error_type = 0; + ms->mirror[mirror].offset = offset; + + return 0; +} + +/* + * Create dirty log: log_type #log_params + */ +static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, + unsigned int argc, char **argv, + unsigned int *args_used) +{ + unsigned int param_count; + struct dm_dirty_log *dl; + char dummy; + + if (argc < 2) { + ti->error = "Insufficient mirror log arguments"; + return NULL; + } + + if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { + ti->error = "Invalid mirror log argument count"; + return NULL; + } + + *args_used = 2 + param_count; + + if (argc < *args_used) { + ti->error = "Insufficient mirror log arguments"; + return NULL; + } + + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, + argv + 2); + if (!dl) { + ti->error = "Error creating mirror dirty log"; + return NULL; + } + + return dl; +} + +static int parse_features(struct mirror_set *ms, unsigned int argc, char **argv, + unsigned int *args_used) +{ + unsigned int num_features; + struct dm_target *ti = ms->ti; + char dummy; + int i; + + *args_used = 0; + + if (!argc) + return 0; + + if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { + ti->error = "Invalid number of features"; + return -EINVAL; + } + + argc--; + argv++; + (*args_used)++; + + if (num_features > argc) { + ti->error = "Not enough arguments to support feature count"; + return -EINVAL; + } + + for (i = 0; i < num_features; i++) { + if (!strcmp("handle_errors", argv[0])) + ms->features |= DM_RAID1_HANDLE_ERRORS; + else if (!strcmp("keep_log", argv[0])) + ms->features |= DM_RAID1_KEEP_LOG; + else { + ti->error = "Unrecognised feature requested"; + return -EINVAL; + } + + argc--; + argv++; + (*args_used)++; + } + if (!errors_handled(ms) && keep_log(ms)) { + ti->error = "keep_log feature requires the handle_errors feature"; + return -EINVAL; + } + + return 0; +} + +/* + * Construct a mirror mapping: + * + * log_type #log_params + * #mirrors [mirror_path offset]{2,} + * [#features ] + * + * log_type is "core" or "disk" + * #log_params is between 1 and 3 + * + * If present, supported features are "handle_errors" and "keep_log". + */ +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned int nr_mirrors, m, args_used; + struct mirror_set *ms; + struct dm_dirty_log *dl; + char dummy; + + dl = create_dirty_log(ti, argc, argv, &args_used); + if (!dl) + return -EINVAL; + + argv += args_used; + argc -= args_used; + + if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || + nr_mirrors < 2 || nr_mirrors > MAX_NR_MIRRORS) { + ti->error = "Invalid number of mirrors"; + dm_dirty_log_destroy(dl); + return -EINVAL; + } + + argv++, argc--; + + if (argc < nr_mirrors * 2) { + ti->error = "Too few mirror arguments"; + dm_dirty_log_destroy(dl); + return -EINVAL; + } + + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); + if (!ms) { + dm_dirty_log_destroy(dl); + return -ENOMEM; + } + + /* Get the mirror parameter sets */ + for (m = 0; m < nr_mirrors; m++) { + r = get_mirror(ms, ti, m, argv); + if (r) { + free_context(ms, ti, m); + return r; + } + argv += 2; + argc -= 2; + } + + ti->private = ms; + + r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh)); + if (r) + goto err_free_context; + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); + + ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); + if (!ms->kmirrord_wq) { + DMERR("couldn't start kmirrord"); + r = -ENOMEM; + goto err_free_context; + } + INIT_WORK(&ms->kmirrord_work, do_mirror); + timer_setup(&ms->timer, delayed_wake_fn, 0); + ms->timer_pending = 0; + INIT_WORK(&ms->trigger_event, trigger_event); + + r = parse_features(ms, argc, argv, &args_used); + if (r) + goto err_destroy_wq; + + argv += args_used; + argc -= args_used; + + /* + * Any read-balancing addition depends on the + * DM_RAID1_HANDLE_ERRORS flag being present. + * This is because the decision to balance depends + * on the sync state of a region. If the above + * flag is not present, we ignore errors; and + * the sync state may be inaccurate. + */ + + if (argc) { + ti->error = "Too many mirror arguments"; + r = -EINVAL; + goto err_destroy_wq; + } + + ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(ms->kcopyd_client)) { + r = PTR_ERR(ms->kcopyd_client); + goto err_destroy_wq; + } + + wakeup_mirrord(ms); + return 0; + +err_destroy_wq: + destroy_workqueue(ms->kmirrord_wq); +err_free_context: + free_context(ms, ti, ms->nr_mirrors); + return r; +} + +static void mirror_dtr(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + + del_timer_sync(&ms->timer); + flush_workqueue(ms->kmirrord_wq); + flush_work(&ms->trigger_event); + dm_kcopyd_client_destroy(ms->kcopyd_client); + destroy_workqueue(ms->kmirrord_wq); + free_context(ms, ti, ms->nr_mirrors); +} + +/* + * Mirror mapping function + */ +static int mirror_map(struct dm_target *ti, struct bio *bio) +{ + int r, rw = bio_data_dir(bio); + struct mirror *m; + struct mirror_set *ms = ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + struct dm_raid1_bio_record *bio_record = + dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); + + bio_record->details.bi_bdev = NULL; + + if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio); + queue_bio(ms, bio, rw); + return DM_MAPIO_SUBMITTED; + } + + r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); + if (r < 0 && r != -EWOULDBLOCK) + return DM_MAPIO_KILL; + + /* + * If region is not in-sync queue the bio. + */ + if (!r || (r == -EWOULDBLOCK)) { + if (bio->bi_opf & REQ_RAHEAD) + return DM_MAPIO_KILL; + + queue_bio(ms, bio, rw); + return DM_MAPIO_SUBMITTED; + } + + /* + * The region is in-sync and we can perform reads directly. + * Store enough information so we can retry if it fails. + */ + m = choose_mirror(ms, bio->bi_iter.bi_sector); + if (unlikely(!m)) + return DM_MAPIO_KILL; + + dm_bio_record(&bio_record->details, bio); + bio_record->m = m; + + map_bio(m, bio); + + return DM_MAPIO_REMAPPED; +} + +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) +{ + int rw = bio_data_dir(bio); + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; + struct dm_raid1_bio_record *bio_record = + dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); + + /* + * We need to dec pending if this was a write. + */ + if (rw == WRITE) { + if (!(bio->bi_opf & REQ_PREFLUSH) && + bio_op(bio) != REQ_OP_DISCARD) + dm_rh_dec(ms->rh, bio_record->write_region); + return DM_ENDIO_DONE; + } + + if (*error == BLK_STS_NOTSUPP) + goto out; + + if (bio->bi_opf & REQ_RAHEAD) + goto out; + + if (unlikely(*error)) { + if (!bio_record->details.bi_bdev) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed."); + return DM_ENDIO_DONE; + } + + m = bio_record->m; + + DMERR("Mirror read failed from %s. Trying alternative device.", + m->dev->name); + + fail_mirror(m, DM_RAID1_READ_ERROR); + + /* + * A failed read is requeued for another attempt using an intact + * mirror. + */ + if (default_ok(m) || mirror_available(ms, bio)) { + bd = &bio_record->details; + + dm_bio_restore(bd, bio); + bio_record->details.bi_bdev = NULL; + bio->bi_status = 0; + + queue_bio(ms, bio, rw); + return DM_ENDIO_INCOMPLETE; + } + DMERR("All replicated volumes dead, failing I/O"); + } + +out: + bio_record->details.bi_bdev = NULL; + + return DM_ENDIO_DONE; +} + +static void mirror_presuspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + struct bio_list holds; + struct bio *bio; + + atomic_set(&ms->suspend, 1); + + /* + * Process bios in the hold list to start recovery waiting + * for bios in the hold list. After the process, no bio has + * a chance to be added in the hold list because ms->suspend + * is set. + */ + spin_lock_irq(&ms->lock); + holds = ms->holds; + bio_list_init(&ms->holds); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&holds))) + hold_bio(ms, bio); + + /* + * We must finish up all the work that we've + * generated (i.e. recovery work). + */ + dm_rh_stop_recovery(ms->rh); + + wait_event(_kmirrord_recovery_stopped, + !dm_rh_recovery_in_flight(ms->rh)); + + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + + /* + * Now that recovery is complete/stopped and the + * delayed bios are queued, we need to wait for + * the worker thread to complete. This way, + * we know that all of our I/O has been pushed. + */ + flush_workqueue(ms->kmirrord_wq); +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + if (log->type->postsuspend && log->type->postsuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log postsuspend failed"); +} + +static void mirror_resume(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + atomic_set(&ms->suspend, 0); + if (log->type->resume && log->type->resume(log)) + /* FIXME: need better error handling */ + DMWARN("log resume failed"); + dm_rh_start_recovery(ms->rh); +} + +/* + * device_status_char + * @m: mirror device/leg we want the status of + * + * We return one character representing the most severe error + * we have encountered. + * A => Alive - No failures + * D => Dead - A write failure occurred leaving mirror out-of-sync + * S => Sync - A sychronization failure occurred, mirror out-of-sync + * R => Read - A read failure occurred, mirror data unaffected + * + * Returns: + */ +static char device_status_char(struct mirror *m) +{ + if (!atomic_read(&(m->error_count))) + return 'A'; + + return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : + (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : + (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; +} + + +static void mirror_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + unsigned int m, sz = 0; + int num_feature_args = 0; + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + char buffer[MAX_NR_MIRRORS + 1]; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) { + DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = device_status_char(&(ms->mirror[m])); + } + buffer[m] = '\0'; + + DMEMIT("%llu/%llu 1 %s ", + (unsigned long long)log->type->get_sync_count(log), + (unsigned long long)ms->nr_regions, buffer); + + sz += log->type->status(log, type, result+sz, maxlen-sz); + + break; + + case STATUSTYPE_TABLE: + sz = log->type->status(log, type, result, maxlen); + + DMEMIT("%d", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) + DMEMIT(" %s %llu", ms->mirror[m].dev->name, + (unsigned long long)ms->mirror[m].offset); + + num_feature_args += !!errors_handled(ms); + num_feature_args += !!keep_log(ms); + if (num_feature_args) { + DMEMIT(" %d", num_feature_args); + if (errors_handled(ms)) + DMEMIT(" handle_errors"); + if (keep_log(ms)) + DMEMIT(" keep_log"); + } + + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",nr_mirrors=%d", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) { + DMEMIT(",mirror_device_%d=%s", m, ms->mirror[m].dev->name); + DMEMIT(",mirror_device_%d_status=%c", + m, device_status_char(&(ms->mirror[m]))); + } + + DMEMIT(",handle_errors=%c", errors_handled(ms) ? 'y' : 'n'); + DMEMIT(",keep_log=%c", keep_log(ms) ? 'y' : 'n'); + + DMEMIT(",log_type_status="); + sz += log->type->status(log, type, result+sz, maxlen-sz); + DMEMIT(";"); + break; + } +} + +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned int i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, ti->len, data); + + return ret; +} + +static struct target_type mirror_target = { + .name = "mirror", + .version = {1, 14, 0}, + .module = THIS_MODULE, + .ctr = mirror_ctr, + .dtr = mirror_dtr, + .map = mirror_map, + .end_io = mirror_end_io, + .presuspend = mirror_presuspend, + .postsuspend = mirror_postsuspend, + .resume = mirror_resume, + .status = mirror_status, + .iterate_devices = mirror_iterate_devices, +}; + +static int __init dm_mirror_init(void) +{ + int r; + + r = dm_register_target(&mirror_target); + if (r < 0) { + DMERR("Failed to register mirror target"); + goto bad_target; + } + + return 0; + +bad_target: + return r; +} + +static void __exit dm_mirror_exit(void) +{ + dm_unregister_target(&mirror_target); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c new file mode 100644 index 000000000..adbdb4b67 --- /dev/null +++ b/drivers/md/dm-region-hash.c @@ -0,0 +1,724 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "dm.h" + +#define DM_MSG_PREFIX "region hash" + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. + * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. dm_rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. dm_rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'delayed_bios' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct dm_region_hash { + uint32_t region_size; + unsigned int region_shift; + + /* holds persistent region state */ + struct dm_dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + unsigned int mask; + unsigned int nr_buckets; + unsigned int prime; + unsigned int shift; + struct list_head *buckets; + + /* + * If there was a flush failure no regions can be marked clean. + */ + int flush_failure; + + unsigned int max_recovery; /* Max # of regions to recover in parallel */ + + spinlock_t region_lock; + atomic_t recovery_in_flight; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; + struct list_head failed_recovered_regions; + struct semaphore recovery_count; + + mempool_t region_pool; + + void *context; + sector_t target_begin; + + /* Callback function to schedule bios writes */ + void (*dispatch_bios)(void *context, struct bio_list *bios); + + /* Callback function to wakeup callers worker thread. */ + void (*wakeup_workers)(void *context); + + /* Callback function to wakeup callers recovery waiters. */ + void (*wakeup_all_recovery_waiters)(void *context); +}; + +struct dm_region { + struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) +{ + return sector >> rh->region_shift; +} + +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) +{ + return region << rh->region_shift; +} +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); + +region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector - + rh->target_begin); +} +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); + +void *dm_rh_region_context(struct dm_region *reg) +{ + return reg->rh->context; +} +EXPORT_SYMBOL_GPL(dm_rh_region_context); + +region_t dm_rh_get_region_key(struct dm_region *reg) +{ + return reg->key; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_key); + +sector_t dm_rh_get_region_size(struct dm_region_hash *rh) +{ + return rh->region_size; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_size); + +/* + * FIXME: shall we pass in a structure instead of all these args to + * dm_region_hash_create()???? + */ +#define RH_HASH_MULT 2654435387U +#define RH_HASH_SHIFT 12 + +#define MIN_REGIONS 64 +struct dm_region_hash *dm_region_hash_create( + void *context, void (*dispatch_bios)(void *context, + struct bio_list *bios), + void (*wakeup_workers)(void *context), + void (*wakeup_all_recovery_waiters)(void *context), + sector_t target_begin, unsigned int max_recovery, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions) +{ + struct dm_region_hash *rh; + unsigned int nr_buckets, max_buckets; + size_t i; + int ret; + + /* + * Calculate a suitable number of buckets for our hash + * table. + */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh = kzalloc(sizeof(*rh), GFP_KERNEL); + if (!rh) { + DMERR("unable to allocate region hash memory"); + return ERR_PTR(-ENOMEM); + } + + rh->context = context; + rh->dispatch_bios = dispatch_bios; + rh->wakeup_workers = wakeup_workers; + rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; + rh->target_begin = target_begin; + rh->max_recovery = max_recovery; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = __ffs(region_size); + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->shift = RH_HASH_SHIFT; + rh->prime = RH_HASH_MULT; + + rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets))); + if (!rh->buckets) { + DMERR("unable to allocate region hash bucket memory"); + kfree(rh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + INIT_LIST_HEAD(&rh->failed_recovered_regions); + rh->flush_failure = 0; + + ret = mempool_init_kmalloc_pool(&rh->region_pool, MIN_REGIONS, + sizeof(struct dm_region)); + if (ret) { + vfree(rh->buckets); + kfree(rh); + rh = ERR_PTR(-ENOMEM); + } + + return rh; +} +EXPORT_SYMBOL_GPL(dm_region_hash_create); + +void dm_region_hash_destroy(struct dm_region_hash *rh) +{ + unsigned int h; + struct dm_region *reg, *nreg; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_entry_safe(reg, nreg, rh->buckets + h, + hash_list) { + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, &rh->region_pool); + } + } + + if (rh->log) + dm_dirty_log_destroy(rh->log); + + mempool_exit(&rh->region_pool); + vfree(rh->buckets); + kfree(rh); +} +EXPORT_SYMBOL_GPL(dm_region_hash_destroy); + +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) +{ + return rh->log; +} +EXPORT_SYMBOL_GPL(dm_rh_dirty_log); + +static unsigned int rh_hash(struct dm_region_hash *rh, region_t region) +{ + return (unsigned int) ((region * rh->prime) >> rh->shift) & rh->mask; +} + +static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + struct list_head *bucket = rh->buckets + rh_hash(rh, region); + + list_for_each_entry(reg, bucket, hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) +{ + list_add(®->hash_list, rh->buckets + rh_hash(rh, reg->key)); +} + +static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg, *nreg; + + nreg = mempool_alloc(&rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + DM_RH_CLEAN : DM_RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + INIT_LIST_HEAD(&nreg->list); + atomic_set(&nreg->pending, 0); + bio_list_init(&nreg->delayed_bios); + + write_lock_irq(&rh->hash_lock); + reg = __rh_lookup(rh, region); + if (reg) + /* We lost the race. */ + mempool_free(nreg, &rh->region_pool); + else { + __rh_insert(rh, nreg); + if (nreg->state == DM_RH_CLEAN) { + spin_lock(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock(&rh->region_lock); + } + + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + + return reg; +} + +static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) { + read_unlock(&rh->hash_lock); + reg = __rh_alloc(rh, region); + read_lock(&rh->hash_lock); + } + + return reg; +} + +int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) +{ + int r; + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. -EWOULDBLOCK) gets + * taken as a DM_RH_NOSYNC + */ + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; +} +EXPORT_SYMBOL_GPL(dm_rh_get_state); + +static void complete_resync_work(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ + rh->dispatch_bios(rh->context, ®->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); + up(&rh->recovery_count); +} + +/* dm_rh_mark_nosync + * @ms + * @bio + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state DM_RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) +{ + unsigned long flags; + struct dm_dirty_log *log = rh->log; + struct dm_region *reg; + region_t region = dm_rh_bio_to_region(rh, bio); + int recovering = 0; + + if (bio->bi_opf & REQ_PREFLUSH) { + rh->flush_failure = 1; + return; + } + + if (bio_op(bio) == REQ_OP_DISCARD) + return; + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == DM_RH_RECOVERING); + reg->state = DM_RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (recovering) + complete_resync_work(reg, 0); +} +EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); + +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) +{ + struct dm_region *reg, *next; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + LIST_HEAD(failed_recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice_init(&rh->clean_regions, &clean); + + list_for_each_entry(reg, &clean, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice_init(&rh->recovered_regions, &recovered); + + list_for_each_entry(reg, &recovered, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->failed_recovered_regions)) { + list_splice_init(&rh->failed_recovered_regions, + &failed_recovered); + + list_for_each_entry(reg, &failed_recovered, list) + list_del(®->hash_list); + } + + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_entry_safe(reg, next, &recovered, list) { + rh->log->type->clear_region(rh->log, reg->key); + complete_resync_work(reg, 1); + mempool_free(reg, &rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &failed_recovered, list) { + complete_resync_work(reg, errors_handled ? 0 : 1); + mempool_free(reg, &rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + mempool_free(reg, &rh->region_pool); + } + + rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_update_states); + +static void rh_inc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == DM_RH_CLEAN) { + reg->state = DM_RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + + + read_unlock(&rh->hash_lock); +} + +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) +{ + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) { + if (bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) + continue; + rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + } +} +EXPORT_SYMBOL_GPL(dm_rh_inc_pending); + +void dm_rh_dec(struct dm_region_hash *rh, region_t region) +{ + unsigned long flags; + struct dm_region *reg; + int should_wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irqsave(&rh->region_lock, flags); + if (atomic_dec_and_test(®->pending)) { + /* + * There is no pending I/O for this region. + * We can move the region to corresponding list for next action. + * At this point, the region is not yet connected to any list. + * + * If the state is DM_RH_NOSYNC, the region should be kept off + * from clean list. + * The hash entry for DM_RH_NOSYNC will remain in memory + * until the region is recovered or the map is reloaded. + */ + + /* do nothing for DM_RH_NOSYNC */ + if (unlikely(rh->flush_failure)) { + /* + * If a write flush failed some time ago, we + * don't know whether or not this write made it + * to the disk, so we must resync the device. + */ + reg->state = DM_RH_NOSYNC; + } else if (reg->state == DM_RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else if (reg->state == DM_RH_DIRTY) { + reg->state = DM_RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + should_wake = 1; + } + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (should_wake) + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_dec); + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct dm_region_hash *rh) +{ + int r; + region_t region; + struct dm_region *reg; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. + */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(®->pending)) + list_del_init(®->list); + else + list_move(®->list, &rh->quiesced_regions); + + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +void dm_rh_recovery_prepare(struct dm_region_hash *rh) +{ + /* Extra reference to avoid race with dm_rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); + up(&rh->recovery_count); + break; + } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); + +/* + * Returns any quiesced regions. + */ +struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) +{ + struct dm_region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct dm_region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_start); + +void dm_rh_recovery_end(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + if (success) + list_add(®->list, ®->rh->recovered_regions); + else + list_add(®->list, ®->rh->failed_recovered_regions); + + spin_unlock_irq(&rh->region_lock); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_end); + +/* Return recovery in flight count. */ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh) +{ + return atomic_read(&rh->recovery_in_flight); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); + +int dm_rh_flush(struct dm_region_hash *rh) +{ + return rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_flush); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} +EXPORT_SYMBOL_GPL(dm_rh_delay); + +void dm_rh_stop_recovery(struct dm_region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < rh->max_recovery; i++) + down(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); + +void dm_rh_start_recovery(struct dm_region_hash *rh) +{ + int i; + + for (i = 0; i < rh->max_recovery; i++) + up(&rh->recovery_count); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_start_recovery); + +MODULE_DESCRIPTION(DM_NAME " region hash"); +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c new file mode 100644 index 000000000..80f46e01b --- /dev/null +++ b/drivers/md/dm-rq.c @@ -0,0 +1,596 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-core.h" +#include "dm-rq.h" + +#include + +#define DM_MSG_PREFIX "core-rq" + +/* + * One of these is allocated per request. + */ +struct dm_rq_target_io { + struct mapped_device *md; + struct dm_target *ti; + struct request *orig, *clone; + struct kthread_work work; + blk_status_t error; + union map_info info; + struct dm_stats_aux stats_aux; + unsigned long duration_jiffies; + unsigned int n_sectors; + unsigned int completed; +}; + +#define DM_MQ_NR_HW_QUEUES 1 +#define DM_MQ_QUEUE_DEPTH 2048 +static unsigned int dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; +static unsigned int dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; + +/* + * Request-based DM's mempools' reserved IOs set by the user. + */ +#define RESERVED_REQUEST_BASED_IOS 256 +static unsigned int reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; + +unsigned int dm_get_reserved_rq_based_ios(void) +{ + return __dm_get_module_param(&reserved_rq_based_ios, + RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS); +} + +static unsigned int dm_get_blk_mq_nr_hw_queues(void) +{ + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); +} + +static unsigned int dm_get_blk_mq_queue_depth(void) +{ + return __dm_get_module_param(&dm_mq_queue_depth, + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); +} + +int dm_request_based(struct mapped_device *md) +{ + return queue_is_mq(md->queue); +} + +void dm_start_queue(struct request_queue *q) +{ + blk_mq_unquiesce_queue(q); + blk_mq_kick_requeue_list(q); +} + +void dm_stop_queue(struct request_queue *q) +{ + blk_mq_quiesce_queue(q); +} + +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone) +{ + struct dm_rq_clone_bio_info *info = + container_of(clone, struct dm_rq_clone_bio_info, clone); + struct dm_rq_target_io *tio = info->tio; + unsigned int nr_bytes = info->orig->bi_iter.bi_size; + blk_status_t error = clone->bi_status; + bool is_last = !clone->bi_next; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once error occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notice the error to the upper layer yet. + * The error handling decision is made by the target driver, + * when the request is completed. + */ + tio->error = error; + goto exit; + } + + /* + * I/O for the bio successfully completed. + * Notice the data completion to the upper layer. + */ + tio->completed += nr_bytes; + + /* + * Update the original request. + * Do not use blk_mq_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + if (is_last) + exit: + blk_update_request(tio->orig, BLK_STS_OK, tio->completed); +} + +static struct dm_rq_target_io *tio_from_request(struct request *rq) +{ + return blk_mq_rq_to_pdu(rq); +} + +static void rq_end_stats(struct mapped_device *md, struct request *orig) +{ + if (unlikely(dm_stats_used(&md->stats))) { + struct dm_rq_target_io *tio = tio_from_request(orig); + tio->duration_jiffies = jiffies - tio->duration_jiffies; + dm_stats_account_io(&md->stats, rq_data_dir(orig), + blk_rq_pos(orig), tio->n_sectors, true, + tio->duration_jiffies, &tio->stats_aux); + } +} + +/* + * Don't touch any member of the md after calling this function because + * the md may be freed in dm_put() at the end of this function. + * Or do dm_get() before calling this function and dm_put() later. + */ +static void rq_completed(struct mapped_device *md) +{ + /* + * dm_put() must be at the end of this function. See the comment above + */ + dm_put(md); +} + +/* + * Complete the clone and the original request. + * Must be called without clone's queue lock held, + * see end_clone_request() for more details. + */ +static void dm_end_request(struct request *clone, blk_status_t error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone, NULL); + + rq_end_stats(md, rq); + blk_mq_end_request(rq, error); + rq_completed(md); +} + +static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) +{ + blk_mq_delay_kick_requeue_list(q, msecs); +} + +void dm_mq_kick_requeue_list(struct mapped_device *md) +{ + __dm_mq_kick_requeue_list(md->queue, 0); +} +EXPORT_SYMBOL(dm_mq_kick_requeue_list); + +static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs) +{ + blk_mq_requeue_request(rq, false); + __dm_mq_kick_requeue_list(rq->q, msecs); +} + +static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue) +{ + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + unsigned long delay_ms = delay_requeue ? 100 : 0; + + rq_end_stats(md, rq); + if (tio->clone) { + blk_rq_unprep_clone(tio->clone); + tio->ti->type->release_clone_rq(tio->clone, NULL); + } + + dm_mq_delay_requeue_request(rq, delay_ms); + rq_completed(md); +} + +static void dm_done(struct request *clone, blk_status_t error, bool mapped) +{ + int r = DM_ENDIO_DONE; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = NULL; + + if (tio->ti) { + rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + } + + if (unlikely(error == BLK_STS_TARGET)) { + if (req_op(clone) == REQ_OP_DISCARD && + !clone->q->limits.max_discard_sectors) + disable_discard(tio->md); + else if (req_op(clone) == REQ_OP_WRITE_ZEROES && + !clone->q->limits.max_write_zeroes_sectors) + disable_write_zeroes(tio->md); + } + + switch (r) { + case DM_ENDIO_DONE: + /* The target wants to complete the I/O */ + dm_end_request(clone, error); + break; + case DM_ENDIO_INCOMPLETE: + /* The target will handle the I/O */ + return; + case DM_ENDIO_REQUEUE: + /* The target wants to requeue the I/O */ + dm_requeue_original_request(tio, false); + break; + case DM_ENDIO_DELAY_REQUEUE: + /* The target wants to requeue the I/O after a delay */ + dm_requeue_original_request(tio, true); + break; + default: + DMCRIT("unimplemented target endio return value: %d", r); + BUG(); + } +} + +/* + * Request completion handler for request-based dm + */ +static void dm_softirq_done(struct request *rq) +{ + bool mapped = true; + struct dm_rq_target_io *tio = tio_from_request(rq); + struct request *clone = tio->clone; + + if (!clone) { + struct mapped_device *md = tio->md; + + rq_end_stats(md, rq); + blk_mq_end_request(rq, tio->error); + rq_completed(md); + return; + } + + if (rq->rq_flags & RQF_FAILED) + mapped = false; + + dm_done(clone, tio->error, mapped); +} + +/* + * Complete the clone and the original request with the error status + * through softirq context. + */ +static void dm_complete_request(struct request *rq, blk_status_t error) +{ + struct dm_rq_target_io *tio = tio_from_request(rq); + + tio->error = error; + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); +} + +/* + * Complete the not-mapped clone and the original request with the error status + * through softirq context. + * Target's rq_end_io() function isn't called. + * This may be used when the target's clone_and_map_rq() function fails. + */ +static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) +{ + rq->rq_flags |= RQF_FAILED; + dm_complete_request(rq, error); +} + +static enum rq_end_io_ret end_clone_request(struct request *clone, + blk_status_t error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + dm_complete_request(tio->orig, error); + return RQ_END_IO_NONE; +} + +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) +{ + struct dm_rq_target_io *tio = data; + struct dm_rq_clone_bio_info *info = + container_of(bio, struct dm_rq_clone_bio_info, clone); + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio, gfp_t gfp_mask) +{ + int r; + + r = blk_rq_prep_clone(clone, rq, &tio->md->mempools->bs, gfp_mask, + dm_rq_bio_constructor, tio); + if (r) + return r; + + clone->end_io = end_clone_request; + clone->end_io_data = tio; + + tio->clone = clone; + + return 0; +} + +static void init_tio(struct dm_rq_target_io *tio, struct request *rq, + struct mapped_device *md) +{ + tio->md = md; + tio->ti = NULL; + tio->clone = NULL; + tio->orig = rq; + tio->error = 0; + tio->completed = 0; + /* + * Avoid initializing info for blk-mq; it passes + * target-specific data through info.ptr + * (see: dm_mq_init_request) + */ + if (!md->init_tio_pdu) + memset(&tio->info, 0, sizeof(tio->info)); +} + +/* + * Returns: + * DM_MAPIO_* : the request has been processed as indicated + * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued + * < 0 : the request was completed due to failure + */ +static int map_request(struct dm_rq_target_io *tio) +{ + int r; + struct dm_target *ti = tio->ti; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + struct request *clone = NULL; + blk_status_t ret; + + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* The target has taken the I/O to submit by itself later */ + break; + case DM_MAPIO_REMAPPED: + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone, &tio->info); + return DM_MAPIO_REQUEUE; + } + + /* The target has remapped the I/O so dispatch it */ + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), + blk_rq_pos(rq)); + ret = blk_insert_cloned_request(clone); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_rq_unprep_clone(clone); + blk_mq_cleanup_rq(clone); + tio->ti->type->release_clone_rq(clone, &tio->info); + tio->clone = NULL; + return DM_MAPIO_REQUEUE; + default: + /* must complete clone in terms of original request */ + dm_complete_request(rq, ret); + } + break; + case DM_MAPIO_REQUEUE: + /* The target wants to requeue the I/O */ + break; + case DM_MAPIO_DELAY_REQUEUE: + /* The target wants to requeue the I/O after a delay */ + dm_requeue_original_request(tio, true); + break; + case DM_MAPIO_KILL: + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(rq, BLK_STS_IOERR); + break; + default: + DMCRIT("unimplemented target map return value: %d", r); + BUG(); + } + + return r; +} + +/* DEPRECATED: previously used for request-based merge heuristic in dm_request_fn() */ +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) +{ + return sprintf(buf, "%u\n", 0); +} + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count) +{ + return count; +} + +static void dm_start_request(struct mapped_device *md, struct request *orig) +{ + blk_mq_start_request(orig); + + if (unlikely(dm_stats_used(&md->stats))) { + struct dm_rq_target_io *tio = tio_from_request(orig); + tio->duration_jiffies = jiffies; + tio->n_sectors = blk_rq_sectors(orig); + dm_stats_account_io(&md->stats, rq_data_dir(orig), + blk_rq_pos(orig), tio->n_sectors, false, 0, + &tio->stats_aux); + } + + /* + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. + */ + dm_get(md); +} + +static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct mapped_device *md = set->driver_data; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + + return 0; +} + +static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + struct mapped_device *md = tio->md; + struct dm_target *ti = md->immutable_target; + + /* + * blk-mq's unquiesce may come from outside events, such as + * elevator switch, updating nr_requests or others, and request may + * come during suspend, so simply ask for blk-mq to requeue it. + */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) + return BLK_STS_RESOURCE; + + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map; + + map = dm_get_live_table(md, &srcu_idx); + if (unlikely(!map)) { + dm_put_live_table(md, srcu_idx); + return BLK_STS_RESOURCE; + } + ti = dm_table_find_target(map, 0); + dm_put_live_table(md, srcu_idx); + } + + if (ti->type->busy && ti->type->busy(ti)) + return BLK_STS_RESOURCE; + + dm_start_request(md, rq); + + /* Init tio using md established in .init_request */ + init_tio(tio, rq, md); + + /* + * Establish tio->ti before calling map_request(). + */ + tio->ti = ti; + + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md); + return BLK_STS_RESOURCE; + } + + return BLK_STS_OK; +} + +static const struct blk_mq_ops dm_mq_ops = { + .queue_rq = dm_mq_queue_rq, + .complete = dm_softirq_done, + .init_request = dm_mq_init_request, +}; + +int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) +{ + struct dm_target *immutable_tgt; + int err; + + md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); + if (!md->tag_set) + return -ENOMEM; + + md->tag_set->ops = &dm_mq_ops; + md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); + md->tag_set->numa_node = md->numa_node_id; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); + md->tag_set->driver_data = md; + + md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); + immutable_tgt = dm_table_get_immutable_target(t); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->tag_set->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + + err = blk_mq_alloc_tag_set(md->tag_set); + if (err) + goto out_kfree_tag_set; + + err = blk_mq_init_allocated_queue(md->tag_set, md->queue); + if (err) + goto out_tag_set; + return 0; + +out_tag_set: + blk_mq_free_tag_set(md->tag_set); +out_kfree_tag_set: + kfree(md->tag_set); + md->tag_set = NULL; + + return err; +} + +void dm_mq_cleanup_mapped_device(struct mapped_device *md) +{ + if (md->tag_set) { + blk_mq_free_tag_set(md->tag_set); + kfree(md->tag_set); + md->tag_set = NULL; + } +} + +module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); + +/* Unused, but preserved for userspace compatibility */ +static bool use_blk_mq = true; +module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); + +module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); + +module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h new file mode 100644 index 000000000..2c97ad145 --- /dev/null +++ b/drivers/md/dm-rq.h @@ -0,0 +1,47 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_RQ_INTERNAL_H +#define DM_RQ_INTERNAL_H + +#include +#include + +#include "dm-stats.h" + +struct mapped_device; + +/* + * For request-based dm - the bio clones we allocate are embedded in these + * structs. + * + * We allocate these with bio_alloc_bioset, using the front_pad parameter when + * the bioset is created - this means the bio has to come at the end of the + * struct. + */ +struct dm_rq_clone_bio_info { + struct bio *orig; + struct dm_rq_target_io *tio; + struct bio clone; +}; + +int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t); +void dm_mq_cleanup_mapped_device(struct mapped_device *md); + +void dm_start_queue(struct request_queue *q); +void dm_stop_queue(struct request_queue *q); + +void dm_mq_kick_requeue_list(struct mapped_device *md); + +unsigned int dm_get_reserved_rq_based_ios(void); + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf); +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count); + +#endif diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c new file mode 100644 index 000000000..80b95746a --- /dev/null +++ b/drivers/md/dm-snap-persistent.c @@ -0,0 +1,972 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-exception-store.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "persistent snapshot" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */ + +#define DM_PREFETCH_CHUNKS 12 + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +#define NUM_SNAPSHOT_HDR_CHUNKS 1 + +struct disk_header { + __le32 magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + __le32 valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + __le32 version; + + /* In sectors */ + __le32 chunk_size; +} __packed; + +struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_exception_store *store; + int version; + int valid; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + + /* + * An area of zeros used to clear the next area. + */ + void *zero_area; + + /* + * An area used for header. The header can be written + * concurrently with metadata (when invalidating the snapshot), + * so it needs a separate buffer. + */ + void *header_area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + chunk_t current_area; + + /* + * The next free chunk for an exception. + * + * When creating exceptions, all the chunks here and above are + * free. It holds the next chunk to be allocated. On rare + * occasions (e.g. after a system crash) holes can be left in + * the exception store because chunks can be committed out of + * order. + * + * When merging exceptions, it does not necessarily mean all the + * chunks here and above are free. It holds the value it would + * have held if all chunks had been committed in order of + * allocation. Consequently the value may occasionally be + * slightly too low, but since it's only used for 'status' and + * it can never reach its minimum value too early this doesn't + * matter. + */ + + chunk_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; + struct dm_io_client *io_client; + + struct workqueue_struct *metadata_wq; +}; + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t len; + + len = ps->store->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + goto err_area; + + ps->zero_area = vzalloc(len); + if (!ps->zero_area) + goto err_zero_area; + + ps->header_area = vmalloc(len); + if (!ps->header_area) + goto err_header_area; + + return 0; + +err_header_area: + vfree(ps->zero_area); + +err_zero_area: + vfree(ps->area); + +err_area: + return r; +} + +static void free_area(struct pstore *ps) +{ + vfree(ps->area); + ps->area = NULL; + vfree(ps->zero_area); + ps->zero_area = NULL; + vfree(ps->header_area); + ps->header_area = NULL; +} + +struct mdata_req { + struct dm_io_region *where; + struct dm_io_request *io_req; + struct work_struct work; + int result; +}; + +static void do_metadata(struct work_struct *work) +{ + struct mdata_req *req = container_of(work, struct mdata_req, work); + + req->result = dm_io(req->io_req, 1, req->where, NULL); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, blk_opf_t opf, + int metadata) +{ + struct dm_io_region where = { + .bdev = dm_snap_cow(ps->store->snap)->bdev, + .sector = ps->store->chunk_size * chunk, + .count = ps->store->chunk_size, + }; + struct dm_io_request io_req = { + .bi_opf = opf, + .mem.type = DM_IO_VMA, + .mem.ptr.vma = area, + .client = ps->io_client, + .notify.fn = NULL, + }; + struct mdata_req req; + + if (!metadata) + return dm_io(&io_req, 1, &where, NULL); + + req.where = &where; + req.io_req = &io_req; + + /* + * Issue the synchronous I/O from a different thread + * to avoid submit_bio_noacct recursion. + */ + INIT_WORK_ONSTACK(&req.work, do_metadata); + queue_work(ps->metadata_wq, &req.work); + flush_workqueue(ps->metadata_wq); + destroy_work_on_stack(&req.work); + + return req.result; +} + +/* + * Convert a metadata area index to a chunk index. + */ +static chunk_t area_location(struct pstore *ps, chunk_t area) +{ + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); +} + +static void skip_metadata(struct pstore *ps) +{ + uint32_t stride = ps->exceptions_per_area + 1; + chunk_t next_free = ps->next_free; + if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS) + ps->next_free++; +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. + */ +static int area_io(struct pstore *ps, blk_opf_t opf) +{ + chunk_t chunk = area_location(ps, ps->current_area); + + return chunk_io(ps, ps->area, chunk, opf, 0); +} + +static void zero_memory_area(struct pstore *ps) +{ + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); +} + +static int zero_disk_area(struct pstore *ps, chunk_t area) +{ + return chunk_io(ps, ps->zero_area, area_location(ps, area), + REQ_OP_WRITE, 0); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + unsigned int chunk_size; + int chunk_size_supplied = 1; + char *chunk_err; + + /* + * Use default chunk size (or logical_block_size, if larger) + * if none supplied + */ + if (!ps->store->chunk_size) { + ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> + bdev) >> 9); + ps->store->chunk_mask = ps->store->chunk_size - 1; + ps->store->chunk_shift = __ffs(ps->store->chunk_size); + chunk_size_supplied = 0; + } + + ps->io_client = dm_io_client_create(); + if (IS_ERR(ps->io_client)) + return PTR_ERR(ps->io_client); + + r = alloc_area(ps); + if (r) + return r; + + r = chunk_io(ps, ps->header_area, 0, REQ_OP_READ, 1); + if (r) + goto bad; + + dh = ps->header_area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + return 0; + } + + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); + r = -ENXIO; + goto bad; + } + + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (ps->store->chunk_size == chunk_size) + return 0; + + if (chunk_size_supplied) + DMWARN("chunk size %u in device metadata overrides table chunk size of %u.", + chunk_size, ps->store->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + free_area(ps); + + r = dm_exception_store_set_chunk_size(ps->store, chunk_size, + &chunk_err); + if (r) { + DMERR("invalid on-disk chunk size %u: %s.", + chunk_size, chunk_err); + return r; + } + + r = alloc_area(ps); + return r; + +bad: + free_area(ps); + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT); + + dh = ps->header_area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->store->chunk_size); + + return chunk_io(ps, ps->header_area, 0, REQ_OP_WRITE, 1); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. + */ +static struct disk_exception *get_exception(struct pstore *ps, void *ps_area, + uint32_t index) +{ + BUG_ON(index >= ps->exceptions_per_area); + + return ((struct disk_exception *) ps_area) + index; +} + +static void read_exception(struct pstore *ps, void *ps_area, + uint32_t index, struct core_exception *result) +{ + struct disk_exception *de = get_exception(ps, ps_area, index); + + /* copy it */ + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); +} + +static void write_exception(struct pstore *ps, + uint32_t index, struct core_exception *e) +{ + struct disk_exception *de = get_exception(ps, ps->area, index); + + /* copy it */ + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); +} + +static void clear_exception(struct pstore *ps, uint32_t index) +{ + struct disk_exception *de = get_exception(ps, ps->area, index); + + /* clear it */ + de->old_chunk = 0; + de->new_chunk = 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, void *ps_area, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context, + int *full) +{ + int r; + unsigned int i; + struct core_exception e; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + read_exception(ps, ps_area, i, &e); + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (e.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = callback(callback_context, e.old_chunk, e.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps, + int (*callback)(void *callback_context, chunk_t old, + chunk_t new), + void *callback_context) +{ + int r, full = 1; + struct dm_bufio_client *client; + chunk_t prefetch_area = 0; + + client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, + ps->store->chunk_size << SECTOR_SHIFT, + 1, 0, NULL, NULL, 0); + + if (IS_ERR(client)) + return PTR_ERR(client); + + /* + * Setup for one current buffer + desired readahead buffers. + */ + dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS); + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. + */ + for (ps->current_area = 0; full; ps->current_area++) { + struct dm_buffer *bp; + void *area; + chunk_t chunk; + + if (unlikely(prefetch_area < ps->current_area)) + prefetch_area = ps->current_area; + + if (DM_PREFETCH_CHUNKS) do { + chunk_t pf_chunk = area_location(ps, prefetch_area); + if (unlikely(pf_chunk >= dm_bufio_get_device_size(client))) + break; + dm_bufio_prefetch(client, pf_chunk, 1); + prefetch_area++; + if (unlikely(!prefetch_area)) + break; + } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS); + + chunk = area_location(ps, ps->current_area); + + area = dm_bufio_read(client, chunk, &bp); + if (IS_ERR(area)) { + r = PTR_ERR(area); + goto ret_destroy_bufio; + } + + r = insert_exceptions(ps, area, callback, callback_context, + &full); + + if (!full) + memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); + + dm_bufio_release(bp); + + dm_bufio_forget(client, chunk); + + if (unlikely(r)) + goto ret_destroy_bufio; + } + + ps->current_area--; + + skip_metadata(ps); + + r = 0; + +ret_destroy_bufio: + dm_bufio_client_destroy(client); + + return r; +} + +static struct pstore *get_info(struct dm_exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + struct pstore *ps = get_info(store); + + *sectors_allocated = ps->next_free * store->chunk_size; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* + * First chunk is the fixed header. + * Then there are (ps->current_area + 1) metadata chunks, each one + * separated from the next by ps->exceptions_per_area data chunks. + */ + *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * + store->chunk_size; +} + +static void persistent_dtr(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + destroy_workqueue(ps->metadata_wq); + + /* Created in read_header */ + if (ps->io_client) + dm_io_client_destroy(ps->io_client); + free_area(ps); + + /* Allocated in persistent_read_metadata */ + kvfree(ps->callbacks); + + kfree(ps); +} + +static int persistent_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + int r, new_snapshot; + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Now we know correct chunk_size, complete the initialisation. + */ + ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->callbacks = kvcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks), GFP_KERNEL); + if (!ps->callbacks) + return -ENOMEM; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + ps->current_area = 0; + zero_memory_area(ps); + r = zero_disk_area(ps, 0); + if (r) + DMWARN("zero_disk_area(0) failed"); + return r; + } + /* + * Sanity checks. + */ + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Metadata are valid, but snapshot is invalidated + */ + if (!ps->valid) + return 1; + + /* + * Read the metadata. + */ + r = read_exceptions(ps, callback, callback_context); + + return r; +} + +static int persistent_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct pstore *ps = get_info(store); + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. + */ + ps->next_free++; + skip_metadata(ps); + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, int valid, + void (*callback) (void *, int success), + void *callback_context) +{ + unsigned int i; + struct pstore *ps = get_info(store); + struct core_exception ce; + struct commit_callback *cb; + + if (!valid) + ps->valid = 0; + + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are exceptions in flight and we have not yet + * filled this metadata area there's nothing more to do. + */ + if (!atomic_dec_and_test(&ps->pending_count) && + (ps->current_committed != ps->exceptions_per_area)) + return; + + /* + * If we completely filled the current area, then wipe the next one. + */ + if ((ps->current_committed == ps->exceptions_per_area) && + zero_disk_area(ps, ps->current_area + 1)) + ps->valid = 0; + + /* + * Commit exceptions to disk. + */ + if (ps->valid && area_io(ps, REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | + REQ_SYNC)) + ps->valid = 0; + + /* + * Advance to the next area if this one is full. + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + ps->current_area++; + zero_memory_area(ps); + } + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, ps->valid); + } + + ps->callback_count = 0; +} + +static int persistent_prepare_merge(struct dm_exception_store *store, + chunk_t *last_old_chunk, + chunk_t *last_new_chunk) +{ + struct pstore *ps = get_info(store); + struct core_exception ce; + int nr_consecutive; + int r; + + /* + * When current area is empty, move back to preceding area. + */ + if (!ps->current_committed) { + /* + * Have we finished? + */ + if (!ps->current_area) + return 0; + + ps->current_area--; + r = area_io(ps, REQ_OP_READ); + if (r < 0) + return r; + ps->current_committed = ps->exceptions_per_area; + } + + read_exception(ps, ps->area, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; + + /* + * Find number of consecutive chunks within the current area, + * working backwards. + */ + for (nr_consecutive = 1; nr_consecutive < ps->current_committed; + nr_consecutive++) { + read_exception(ps, ps->area, + ps->current_committed - 1 - nr_consecutive, &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) + break; + } + + return nr_consecutive; +} + +static int persistent_commit_merge(struct dm_exception_store *store, + int nr_merged) +{ + int r, i; + struct pstore *ps = get_info(store); + + BUG_ON(nr_merged > ps->current_committed); + + for (i = 0; i < nr_merged; i++) + clear_exception(ps, ps->current_committed - 1 - i); + + r = area_io(ps, REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); + if (r < 0) + return r; + + ps->current_committed -= nr_merged; + + /* + * At this stage, only persistent_usage() uses ps->next_free, so + * we make no attempt to keep ps->next_free strictly accurate + * as exceptions may have been committed out-of-order originally. + * Once a snapshot has become merging, we set it to the value it + * would have held had all the exceptions been committed in order. + * + * ps->current_area does not get reduced by prepare_merge() until + * after commit_merge() has removed the nr_merged previous exceptions. + */ + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; + + return 0; +} + +static void persistent_drop_snapshot(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +static int persistent_ctr(struct dm_exception_store *store, char *options) +{ + struct pstore *ps; + int r; + + /* allocate the pstore */ + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + ps->store = store; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; + ps->zero_area = NULL; + ps->header_area = NULL; + ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ + ps->current_committed = 0; + + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = NULL; + + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); + if (!ps->metadata_wq) { + DMERR("couldn't start header metadata update thread"); + r = -ENOMEM; + goto err_workqueue; + } + + if (options) { + char overflow = toupper(options[0]); + if (overflow == 'O') + store->userspace_supports_overflow = true; + else { + DMERR("Unsupported persistent store option: %s", options); + r = -EINVAL; + goto err_options; + } + } + + store->context = ps; + + return 0; + +err_options: + destroy_workqueue(ps->metadata_wq); +err_workqueue: + kfree(ps); + + return r; +} + +static unsigned int persistent_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned int maxlen) +{ + unsigned int sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" %s %llu", store->userspace_supports_overflow ? "PO" : "P", + (unsigned long long)store->chunk_size); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return sz; +} + +static struct dm_exception_store_type _persistent_type = { + .name = "persistent", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = persistent_usage, + .status = persistent_status, +}; + +static struct dm_exception_store_type _persistent_compat_type = { + .name = "P", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = persistent_usage, + .status = persistent_status, +}; + +int dm_persistent_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_persistent_type); + if (r) { + DMERR("Unable to register persistent exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_persistent_compat_type); + if (r) { + DMERR("Unable to register old-style persistent exception store type"); + dm_exception_store_type_unregister(&_persistent_type); + return r; + } + + return r; +} + +void dm_persistent_snapshot_exit(void) +{ + dm_exception_store_type_unregister(&_persistent_type); + dm_exception_store_type_unregister(&_persistent_compat_type); +} diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c new file mode 100644 index 000000000..11de107f5 --- /dev/null +++ b/drivers/md/dm-snap-transient.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-exception-store.h" + +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "transient snapshot" + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. + *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +static void transient_dtr(struct dm_exception_store *store) +{ + kfree(store->context); +} + +static int transient_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + return 0; +} + +static int transient_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct transient_c *tc = store->context; + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + if (size < (tc->next_free + store->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store, tc->next_free); + tc->next_free += store->chunk_size; + + return 0; +} + +static void transient_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, int valid, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, valid); +} + +static void transient_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + *sectors_allocated = ((struct transient_c *) store->context)->next_free; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + *metadata_sectors = 0; +} + +static int transient_ctr(struct dm_exception_store *store, char *options) +{ + struct transient_c *tc; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} + +static unsigned int transient_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned int maxlen) +{ + unsigned int sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" N %llu", (unsigned long long)store->chunk_size); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + + return sz; +} + +static struct dm_exception_store_type _transient_type = { + .name = "transient", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +static struct dm_exception_store_type _transient_compat_type = { + .name = "N", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +int dm_transient_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_transient_type); + if (r) { + DMWARN("Unable to register transient exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_transient_compat_type); + if (r) { + DMWARN("Unable to register old-style transient exception store type"); + dm_exception_store_type_unregister(&_transient_type); + return r; + } + + return r; +} + +void dm_transient_snapshot_exit(void) +{ + dm_exception_store_type_unregister(&_transient_type); + dm_exception_store_type_unregister(&_transient_compat_type); +} diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c new file mode 100644 index 000000000..b748901a4 --- /dev/null +++ b/drivers/md/dm-snap.c @@ -0,0 +1,2866 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm.h" + +#include "dm-exception-store.h" + +#define DM_MSG_PREFIX "snapshots" + +static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; + +#define dm_target_is_snapshot_merge(ti) \ + ((ti)->type->name == dm_snapshot_merge_target_name) + +/* + * The size of the mempool used to track chunks in use. + */ +#define MIN_IOS 256 + +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) + +struct dm_exception_table { + uint32_t hash_mask; + unsigned int hash_shift; + struct hlist_bl_head *table; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + + struct dm_dev *origin; + struct dm_dev *cow; + + struct dm_target *ti; + + /* List of snapshots per Origin */ + struct list_head list; + + /* + * You can't use a snapshot if this is 0 (e.g. if full). + * A snapshot-merge target never clears this. + */ + int valid; + + /* + * The snapshot overflowed because of a write to the snapshot device. + * We don't have to invalidate the snapshot in this case, but we need + * to prevent further writes. + */ + int snapshot_overflowed; + + /* Origin writes don't trigger exceptions until this is set */ + int active; + + atomic_t pending_exceptions_count; + + spinlock_t pe_allocation_lock; + + /* Protected by "pe_allocation_lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; + + /* + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. + */ + struct rb_root out_of_order_tree; + + mempool_t pending_pool; + + struct dm_exception_table pending; + struct dm_exception_table complete; + + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + + /* Chunks with outstanding reads */ + spinlock_t tracked_chunk_lock; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + + /* The on disk metadata handler */ + struct dm_exception_store *store; + + unsigned int in_progress; + struct wait_queue_head in_progress_wait; + + struct dm_kcopyd_client *kcopyd_client; + + /* Wait for events based on state_bits */ + unsigned long state_bits; + + /* Range of chunks currently being merged. */ + chunk_t first_merging_chunk; + int num_merging_chunks; + + /* + * The merge operation failed if this flag is set. + * Failure modes are handled as follows: + * - I/O error reading the header + * => don't load the target; abort. + * - Header does not have "valid" flag set + * => use the origin; forget about the snapshot. + * - I/O error when reading exceptions + * => don't load the target; abort. + * (We can't use the intermediate origin state.) + * - I/O error while merging + * => stop merging; set merge_failed; process I/O normally. + */ + bool merge_failed:1; + + bool discard_zeroes_cow:1; + bool discard_passdown_origin:1; + + /* + * Incoming bios that overlap with chunks being merged must wait + * for them to be committed. + */ + struct bio_list bios_queued_during_merge; +}; + +/* + * state_bits: + * RUNNING_MERGE - Merge operation is in progress. + * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; + * cleared afterwards. + */ +#define RUNNING_MERGE 0 +#define SHUTDOWN_MERGE 1 + +/* + * Maximum number of chunks being copied on write. + * + * The value was decided experimentally as a trade-off between memory + * consumption, stalling the kernel's workqueues and maintaining a high enough + * throughput. + */ +#define DEFAULT_COW_THRESHOLD 2048 + +static unsigned int cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); +MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, + "A percentage of time allocated for copy on write"); + +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + +struct dm_dev *dm_snap_cow(struct dm_snapshot *s) +{ + return s->cow; +} +EXPORT_SYMBOL(dm_snap_cow); + +static sector_t chunk_to_sector(struct dm_exception_store *store, + chunk_t chunk) +{ + return chunk << store->chunk_shift; +} + +static int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ + /* + * There is only ever one instance of a particular block + * device so we can compare pointers safely. + */ + return lhs == rhs; +} + +struct dm_snap_pending_exception { + struct dm_exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a bio list + */ + struct bio_list origin_bios; + struct bio_list snapshot_bios; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. + */ + int started; + + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct rb_node out_of_order_node; + + /* + * For writing a complete chunk, bypassing the copy. + */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static struct kmem_cache *exception_cache; +static struct kmem_cache *pending_cache; + +struct dm_snap_tracked_chunk { + struct hlist_node node; + chunk_t chunk; +}; + +static void init_tracked_chunk(struct bio *bio) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + INIT_HLIST_NODE(&c->node); +} + +static bool is_bio_tracked(struct bio *bio) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + return !hlist_unhashed(&c->node); +} + +static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + + c->chunk = chunk; + + spin_lock_irq(&s->tracked_chunk_lock); + hlist_add_head(&c->node, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); + spin_unlock_irq(&s->tracked_chunk_lock); +} + +static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + unsigned long flags; + + spin_lock_irqsave(&s->tracked_chunk_lock, flags); + hlist_del(&c->node); + spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); +} + +static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c; + int found = 0; + + spin_lock_irq(&s->tracked_chunk_lock); + + hlist_for_each_entry(c, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { + if (c->chunk == chunk) { + found = 1; + break; + } + } + + spin_unlock_irq(&s->tracked_chunk_lock); + + return found; +} + +/* + * This conflicting I/O is extremely improbable in the caller, + * so msleep(1) is sufficient and there is no need for a wait queue. + */ +static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) +{ + while (__chunk_is_tracked(s, chunk)) + msleep(1); +} + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + struct block_device *bdev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * This structure is allocated for each origin target + */ +struct dm_origin { + struct dm_dev *dev; + struct dm_target *ti; + unsigned int split_boundary; + struct list_head hash_list; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct list_head *_dm_origins; +static struct rw_semaphore _origins_lock; + +static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); +static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); +static uint64_t _pending_exceptions_done_count; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("unable to allocate memory for _origins"); + return -ENOMEM; + } + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + + _dm_origins = kmalloc_array(ORIGIN_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!_dm_origins) { + DMERR("unable to allocate memory for _dm_origins"); + kfree(_origins); + return -ENOMEM; + } + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_dm_origins + i); + + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); + kfree(_dm_origins); +} + +static unsigned int origin_hash(struct block_device *bdev) +{ + return bdev->bd_dev & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(struct block_device *origin) +{ + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each_entry(o, ol, hash_list) + if (bdev_equal(o->bdev, origin)) + return o; + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->bdev)]; + list_add_tail(&o->hash_list, sl); +} + +static struct dm_origin *__lookup_dm_origin(struct block_device *origin) +{ + struct list_head *ol; + struct dm_origin *o; + + ol = &_dm_origins[origin_hash(origin)]; + list_for_each_entry(o, ol, hash_list) + if (bdev_equal(o->dev->bdev, origin)) + return o; + + return NULL; +} + +static void __insert_dm_origin(struct dm_origin *o) +{ + struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)]; + list_add_tail(&o->hash_list, sl); +} + +static void __remove_dm_origin(struct dm_origin *o) +{ + list_del(&o->hash_list); +} + +/* + * _origins_lock must be held when calling this function. + * Returns number of snapshots registered using the supplied cow device, plus: + * snap_src - a snapshot suitable for use as a source of exception handover + * snap_dest - a snapshot capable of receiving exception handover. + * snap_merge - an existing snapshot-merge target linked to the same origin. + * There can be at most one snapshot-merge target. The parameter is optional. + * + * Possible return values and states of snap_src and snap_dest. + * 0: NULL, NULL - first new snapshot + * 1: snap_src, NULL - normal snapshot + * 2: snap_src, snap_dest - waiting for handover + * 2: snap_src, NULL - handed over, waiting for old to be deleted + * 1: NULL, snap_dest - source got destroyed without handover + */ +static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, + struct dm_snapshot **snap_src, + struct dm_snapshot **snap_dest, + struct dm_snapshot **snap_merge) +{ + struct dm_snapshot *s; + struct origin *o; + int count = 0; + int active; + + o = __lookup_origin(snap->origin->bdev); + if (!o) + goto out; + + list_for_each_entry(s, &o->snapshots, list) { + if (dm_target_is_snapshot_merge(s->ti) && snap_merge) + *snap_merge = s; + if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) + continue; + + down_read(&s->lock); + active = s->active; + up_read(&s->lock); + + if (active) { + if (snap_src) + *snap_src = s; + } else if (snap_dest) + *snap_dest = s; + + count++; + } + +out: + return count; +} + +/* + * On success, returns 1 if this snapshot is a handover destination, + * otherwise returns 0. + */ +static int __validate_exception_handover(struct dm_snapshot *snap) +{ + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + struct dm_snapshot *snap_merge = NULL; + + /* Does snapshot need exceptions handed over to it? */ + if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, + &snap_merge) == 2) || + snap_dest) { + snap->ti->error = "Snapshot cow pairing for exception table handover failed"; + return -EINVAL; + } + + /* + * If no snap_src was found, snap cannot become a handover + * destination. + */ + if (!snap_src) + return 0; + + /* + * Non-snapshot-merge handover? + */ + if (!dm_target_is_snapshot_merge(snap->ti)) + return 1; + + /* + * Do not allow more than one merging snapshot. + */ + if (snap_merge) { + snap->ti->error = "A snapshot is already merging."; + return -EINVAL; + } + + if (!snap_src->store->type->prepare_merge || + !snap_src->store->type->commit_merge) { + snap->ti->error = "Snapshot exception store does not support snapshot-merge."; + return -EINVAL; + } + + return 1; +} + +static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) +{ + struct dm_snapshot *l; + + /* Sort the list according to chunk size, largest-first smallest-last */ + list_for_each_entry(l, &o->snapshots, list) + if (l->store->chunk_size < s->store->chunk_size) + break; + list_add_tail(&s->list, &l->list); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. + * + * Also validate snapshot exception store handovers. + * On success, returns 1 if this registration is a handover destination, + * otherwise returns 0. + */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o, *new_o = NULL; + struct block_device *bdev = snap->origin->bdev; + int r = 0; + + new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); + if (!new_o) + return -ENOMEM; + + down_write(&_origins_lock); + + r = __validate_exception_handover(snap); + if (r < 0) { + kfree(new_o); + goto out; + } + + o = __lookup_origin(bdev); + if (o) + kfree(new_o); + else { + /* New origin */ + o = new_o; + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->bdev = bdev; + + __insert_origin(o); + } + + __insert_snapshot(o, snap); + +out: + up_write(&_origins_lock); + + return r; +} + +/* + * Move snapshot to correct place in list according to chunk size. + */ +static void reregister_snapshot(struct dm_snapshot *s) +{ + struct block_device *bdev = s->origin->bdev; + + down_write(&_origins_lock); + + list_del(&s->list); + __insert_snapshot(__lookup_origin(bdev), s); + + up_write(&_origins_lock); +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->bdev); + + list_del(&s->list); + if (o && list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. + * The lowest hash_shift bits of the chunk number are ignored, allowing + * some consecutive chunks to be grouped together. + */ +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); + +/* Lock to protect access to the completed and pending exception hash tables. */ +struct dm_exception_table_lock { + struct hlist_bl_head *complete_slot; + struct hlist_bl_head *pending_slot; +}; + +static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, + struct dm_exception_table_lock *lock) +{ + struct dm_exception_table *complete = &s->complete; + struct dm_exception_table *pending = &s->pending; + + lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; +} + +static void dm_exception_table_lock(struct dm_exception_table_lock *lock) +{ + hlist_bl_lock(lock->complete_slot); + hlist_bl_lock(lock->pending_slot); +} + +static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) +{ + hlist_bl_unlock(lock->pending_slot); + hlist_bl_unlock(lock->complete_slot); +} + +static int dm_exception_table_init(struct dm_exception_table *et, + uint32_t size, unsigned int hash_shift) +{ + unsigned int i; + + et->hash_shift = hash_shift; + et->hash_mask = size - 1; + et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_HLIST_BL_HEAD(et->table + i); + + return 0; +} + +static void dm_exception_table_exit(struct dm_exception_table *et, + struct kmem_cache *mem) +{ + struct hlist_bl_head *slot; + struct dm_exception *ex; + struct hlist_bl_node *pos, *n; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) + kmem_cache_free(mem, ex); + } + + kvfree(et->table); +} + +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) +{ + return (chunk >> et->hash_shift) & et->hash_mask; +} + +static void dm_remove_exception(struct dm_exception *e) +{ + hlist_bl_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, + chunk_t chunk) +{ + struct hlist_bl_head *slot; + struct hlist_bl_node *pos; + struct dm_exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + hlist_bl_for_each_entry(e, pos, slot, hash_list) + if (chunk >= e->old_chunk && + chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) + return e; + + return NULL; +} + +static struct dm_exception *alloc_completed_exception(gfp_t gfp) +{ + struct dm_exception *e; + + e = kmem_cache_alloc(exception_cache, gfp); + if (!e && gfp == GFP_NOIO) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static void free_completed_exception(struct dm_exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) +{ + struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool, + GFP_NOIO); + + atomic_inc(&s->pending_exceptions_count); + pe->snap = s; + + return pe; +} + +static void free_pending_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + mempool_free(pe, &s->pending_pool); + smp_mb__before_atomic(); + atomic_dec(&s->pending_exceptions_count); +} + +static void dm_insert_exception(struct dm_exception_table *eh, + struct dm_exception *new_e) +{ + struct hlist_bl_head *l; + struct hlist_bl_node *pos; + struct dm_exception *e = NULL; + + l = &eh->table[exception_hash(eh, new_e->old_chunk)]; + + /* Add immediately if this table doesn't support consecutive chunks */ + if (!eh->hash_shift) + goto out; + + /* List is ordered by old_chunk */ + hlist_bl_for_each_entry(e, pos, l, hash_list) { + /* Insert after an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk + + dm_consecutive_chunk_count(e) + 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) + + dm_consecutive_chunk_count(e) + 1)) { + dm_consecutive_chunk_count_inc(e); + free_completed_exception(new_e); + return; + } + + /* Insert before an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk - 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { + dm_consecutive_chunk_count_inc(e); + e->old_chunk--; + e->new_chunk--; + free_completed_exception(new_e); + return; + } + + if (new_e->old_chunk < e->old_chunk) + break; + } + +out: + if (!e) { + /* + * Either the table doesn't support consecutive chunks or slot + * l is empty. + */ + hlist_bl_add_head(&new_e->hash_list, l); + } else if (new_e->old_chunk < e->old_chunk) { + /* Add before an existing exception */ + hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + } else { + /* Add to l's tail: e is the last exception in this slot */ + hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + } +} + +/* + * Callback used by the exception stores to load exceptions when + * initialising. + */ +static int dm_add_exception(void *context, chunk_t old, chunk_t new) +{ + struct dm_exception_table_lock lock; + struct dm_snapshot *s = context; + struct dm_exception *e; + + e = alloc_completed_exception(GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + + /* Consecutive_count is implicitly initialised to zero */ + e->new_chunk = new; + + /* + * Although there is no need to lock access to the exception tables + * here, if we don't then hlist_bl_add_head(), called by + * dm_insert_exception(), will complain about accessing the + * corresponding list without locking it first. + */ + dm_exception_table_lock_init(s, old, &lock); + + dm_exception_table_lock(&lock); + dm_insert_exception(&s->complete, e); + dm_exception_table_unlock(&lock); + + return 0; +} + +/* + * Return a minimum chunk size of all snapshots that have the specified origin. + * Return zero if the origin has no snapshots. + */ +static uint32_t __minimum_chunk_size(struct origin *o) +{ + struct dm_snapshot *snap; + unsigned int chunk_size = rounddown_pow_of_two(UINT_MAX); + + if (o) + list_for_each_entry(snap, &o->snapshots, list) + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); + + return (uint32_t) chunk_size; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + /* use a fixed size of 2MB */ + unsigned long mem = 2 * 1024 * 1024; + mem /= sizeof(struct hlist_bl_head); + + return mem; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... + */ + cow_dev_size = get_dev_size(s->cow->bdev); + max_buckets = calc_max_buckets(); + + hash_size = cow_dev_size >> s->store->chunk_shift; + hash_size = min(hash_size, max_buckets); + + if (hash_size < 64) + hash_size = 64; + hash_size = rounddown_pow_of_two(hash_size); + if (dm_exception_table_init(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (hash_size < 64) + hash_size = 64; + + if (dm_exception_table_init(&s->pending, hash_size, 0)) { + dm_exception_table_exit(&s->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +static void merge_shutdown(struct dm_snapshot *s) +{ + clear_bit_unlock(RUNNING_MERGE, &s->state_bits); + smp_mb__after_atomic(); + wake_up_bit(&s->state_bits, RUNNING_MERGE); +} + +static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) +{ + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + + return bio_list_get(&s->bios_queued_during_merge); +} + +/* + * Remove one chunk from the index of completed exceptions. + */ +static int __remove_single_exception_chunk(struct dm_snapshot *s, + chunk_t old_chunk) +{ + struct dm_exception *e; + + e = dm_lookup_exception(&s->complete, old_chunk); + if (!e) { + DMERR("Corruption detected: exception for block %llu is on disk but not in memory", + (unsigned long long)old_chunk); + return -EINVAL; + } + + /* + * If this is the only chunk using this exception, remove exception. + */ + if (!dm_consecutive_chunk_count(e)) { + dm_remove_exception(e); + free_completed_exception(e); + return 0; + } + + /* + * The chunk may be either at the beginning or the end of a + * group of consecutive chunks - never in the middle. We are + * removing chunks in the opposite order to that in which they + * were added, so this should always be true. + * Decrement the consecutive chunk counter and adjust the + * starting point if necessary. + */ + if (old_chunk == e->old_chunk) { + e->old_chunk++; + e->new_chunk++; + } else if (old_chunk != e->old_chunk + + dm_consecutive_chunk_count(e)) { + DMERR("Attempt to merge block %llu from the middle of a chunk range [%llu - %llu]", + (unsigned long long)old_chunk, + (unsigned long long)e->old_chunk, + (unsigned long long) + e->old_chunk + dm_consecutive_chunk_count(e)); + return -EINVAL; + } + + dm_consecutive_chunk_count_dec(e); + + return 0; +} + +static void flush_bios(struct bio *bio); + +static int remove_single_exception_chunk(struct dm_snapshot *s) +{ + struct bio *b = NULL; + int r; + chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; + + down_write(&s->lock); + + /* + * Process chunks (and associated exceptions) in reverse order + * so that dm_consecutive_chunk_count_dec() accounting works. + */ + do { + r = __remove_single_exception_chunk(s, old_chunk); + if (r) + goto out; + } while (old_chunk-- > s->first_merging_chunk); + + b = __release_queued_bios_after_merge(s); + +out: + up_write(&s->lock); + if (b) + flush_bios(b); + + return r; +} + +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned int chunk_size); + +static void merge_callback(int read_err, unsigned long write_err, + void *context); + +static uint64_t read_pending_exceptions_done_count(void) +{ + uint64_t pending_exceptions_done; + + spin_lock(&_pending_exceptions_done_spinlock); + pending_exceptions_done = _pending_exceptions_done_count; + spin_unlock(&_pending_exceptions_done_spinlock); + + return pending_exceptions_done; +} + +static void increment_pending_exceptions_done_count(void) +{ + spin_lock(&_pending_exceptions_done_spinlock); + _pending_exceptions_done_count++; + spin_unlock(&_pending_exceptions_done_spinlock); + + wake_up_all(&_pending_exceptions_done); +} + +static void snapshot_merge_next_chunks(struct dm_snapshot *s) +{ + int i, linear_chunks; + chunk_t old_chunk, new_chunk; + struct dm_io_region src, dest; + sector_t io_size; + uint64_t previous_count; + + BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); + if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) + goto shut; + + /* + * valid flag never changes during merge, so no lock required. + */ + if (!s->valid) { + DMERR("Snapshot is invalid: can't merge"); + goto shut; + } + + linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, + &new_chunk); + if (linear_chunks <= 0) { + if (linear_chunks < 0) { + DMERR("Read error in exception store: shutting down merge"); + down_write(&s->lock); + s->merge_failed = true; + up_write(&s->lock); + } + goto shut; + } + + /* Adjust old_chunk and new_chunk to reflect start of linear region */ + old_chunk = old_chunk + 1 - linear_chunks; + new_chunk = new_chunk + 1 - linear_chunks; + + /* + * Use one (potentially large) I/O to copy all 'linear_chunks' + * from the exception store to the origin + */ + io_size = linear_chunks * s->store->chunk_size; + + dest.bdev = s->origin->bdev; + dest.sector = chunk_to_sector(s->store, old_chunk); + dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); + + src.bdev = s->cow->bdev; + src.sector = chunk_to_sector(s->store, new_chunk); + src.count = dest.count; + + /* + * Reallocate any exceptions needed in other snapshots then + * wait for the pending exceptions to complete. + * Each time any pending exception (globally on the system) + * completes we are woken and repeat the process to find out + * if we can proceed. While this may not seem a particularly + * efficient algorithm, it is not expected to have any + * significant impact on performance. + */ + previous_count = read_pending_exceptions_done_count(); + while (origin_write_extent(s, dest.sector, io_size)) { + wait_event(_pending_exceptions_done, + (read_pending_exceptions_done_count() != + previous_count)); + /* Retry after the wait, until all exceptions are done. */ + previous_count = read_pending_exceptions_done_count(); + } + + down_write(&s->lock); + s->first_merging_chunk = old_chunk; + s->num_merging_chunks = linear_chunks; + up_write(&s->lock); + + /* Wait until writes to all 'linear_chunks' drain */ + for (i = 0; i < linear_chunks; i++) + __check_for_conflicting_io(s, old_chunk + i); + + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); + return; + +shut: + merge_shutdown(s); +} + +static void error_bios(struct bio *bio); + +static void merge_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snapshot *s = context; + struct bio *b = NULL; + + if (read_err || write_err) { + if (read_err) + DMERR("Read error: shutting down merge."); + else + DMERR("Write error: shutting down merge."); + goto shut; + } + + if (blkdev_issue_flush(s->origin->bdev) < 0) { + DMERR("Flush after merge failed: shutting down merge"); + goto shut; + } + + if (s->store->type->commit_merge(s->store, + s->num_merging_chunks) < 0) { + DMERR("Write error in exception store: shutting down merge"); + goto shut; + } + + if (remove_single_exception_chunk(s) < 0) + goto shut; + + snapshot_merge_next_chunks(s); + + return; + +shut: + down_write(&s->lock); + s->merge_failed = true; + b = __release_queued_bios_after_merge(s); + up_write(&s->lock); + error_bios(b); + + merge_shutdown(s); +} + +static void start_merge(struct dm_snapshot *s) +{ + if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) + snapshot_merge_next_chunks(s); +} + +/* + * Stop the merging process and wait until it finishes. + */ +static void stop_merge(struct dm_snapshot *s) +{ + set_bit(SHUTDOWN_MERGE, &s->state_bits); + wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE); + clear_bit(SHUTDOWN_MERGE, &s->state_bits); +} + +static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s, + struct dm_target *ti) +{ + int r; + unsigned int argc; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, 2, "Invalid number of feature arguments"}, + }; + + /* + * No feature arguments supplied. + */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + while (argc && !r) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "discard_zeroes_cow")) + s->discard_zeroes_cow = true; + + else if (!strcasecmp(arg_name, "discard_passdown_origin")) + s->discard_passdown_origin = true; + + else { + ti->error = "Unrecognised feature requested"; + r = -EINVAL; + break; + } + } + + if (!s->discard_zeroes_cow && s->discard_passdown_origin) { + /* + * TODO: really these are disjoint.. but ti->num_discard_bios + * and dm_bio_get_target_bio_nr() require rigid constraints. + */ + ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow"; + r = -EINVAL; + } + + return r; +} + +/* + * Construct a snapshot mapping: + * [<# feature args> []*] + */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_snapshot *s; + struct dm_arg_set as; + int i; + int r = -EINVAL; + char *origin_path, *cow_path; + dev_t origin_dev, cow_dev; + unsigned int args_used, num_flush_bios = 1; + fmode_t origin_mode = FMODE_READ; + + if (argc < 4) { + ti->error = "requires 4 or more arguments"; + r = -EINVAL; + goto bad; + } + + if (dm_target_is_snapshot_merge(ti)) { + num_flush_bios = 2; + origin_mode = FMODE_WRITE; + } + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + ti->error = "Cannot allocate private snapshot structure"; + r = -ENOMEM; + goto bad; + } + + as.argc = argc; + as.argv = argv; + dm_consume_args(&as, 4); + r = parse_snapshot_features(&as, s, ti); + if (r) + goto bad_features; + + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_origin; + } + origin_dev = s->origin->bdev->bd_dev; + + cow_path = argv[0]; + argv++; + argc--; + + cow_dev = dm_get_dev_t(cow_path); + if (cow_dev && cow_dev == origin_dev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_cow; + } + + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); + if (r) { + ti->error = "Cannot get COW device"; + goto bad_cow; + } + + r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad_store; + } + + argv += args_used; + argc -= args_used; + + s->ti = ti; + s->valid = 1; + s->snapshot_overflowed = 0; + s->active = 0; + atomic_set(&s->pending_exceptions_count, 0); + spin_lock_init(&s->pe_allocation_lock); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + s->out_of_order_tree = RB_ROOT; + init_rwsem(&s->lock); + INIT_LIST_HEAD(&s->list); + spin_lock_init(&s->pe_lock); + s->state_bits = 0; + s->merge_failed = false; + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + bio_list_init(&s->bios_queued_during_merge); + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad_hash_tables; + } + + init_waitqueue_head(&s->in_progress_wait); + + s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(s->kcopyd_client)) { + r = PTR_ERR(s->kcopyd_client); + ti->error = "Could not create kcopyd client"; + goto bad_kcopyd; + } + + r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache); + if (r) { + ti->error = "Could not allocate mempool for pending exceptions"; + goto bad_pending_pool; + } + + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); + + spin_lock_init(&s->tracked_chunk_lock); + + ti->private = s; + ti->num_flush_bios = num_flush_bios; + if (s->discard_zeroes_cow) + ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1); + ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); + + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ + r = register_snapshot(s); + if (r == -ENOMEM) { + ti->error = "Snapshot origin struct allocation failed"; + goto bad_load_and_register; + } else if (r < 0) { + /* invalid handover, register_snapshot has set ti->error */ + goto bad_load_and_register; + } + + /* + * Metadata must only be loaded into one table at once, so skip this + * if metadata will be handed over during resume. + * Chunk size will be set during the handover - set it to zero to + * ensure it's ignored. + */ + if (r > 0) { + s->store->chunk_size = 0; + return 0; + } + + r = s->store->type->read_metadata(s->store, dm_add_exception, + (void *)s); + if (r < 0) { + ti->error = "Failed to read snapshot metadata"; + goto bad_read_metadata; + } else if (r > 0) { + s->valid = 0; + DMWARN("Snapshot is marked invalid."); + } + + if (!s->store->chunk_size) { + ti->error = "Chunk size not set"; + r = -EINVAL; + goto bad_read_metadata; + } + + r = dm_set_target_max_io_len(ti, s->store->chunk_size); + if (r) + goto bad_read_metadata; + + return 0; + +bad_read_metadata: + unregister_snapshot(s); +bad_load_and_register: + mempool_exit(&s->pending_pool); +bad_pending_pool: + dm_kcopyd_client_destroy(s->kcopyd_client); +bad_kcopyd: + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); +bad_hash_tables: + dm_exception_store_destroy(s->store); +bad_store: + dm_put_device(ti, s->cow); +bad_cow: + dm_put_device(ti, s->origin); +bad_origin: +bad_features: + kfree(s); +bad: + return r; +} + +static void __free_exceptions(struct dm_snapshot *s) +{ + dm_kcopyd_client_destroy(s->kcopyd_client); + s->kcopyd_client = NULL; + + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); +} + +static void __handover_exceptions(struct dm_snapshot *snap_src, + struct dm_snapshot *snap_dest) +{ + union { + struct dm_exception_table table_swap; + struct dm_exception_store *store_swap; + } u; + + /* + * Swap all snapshot context information between the two instances. + */ + u.table_swap = snap_dest->complete; + snap_dest->complete = snap_src->complete; + snap_src->complete = u.table_swap; + + u.store_swap = snap_dest->store; + snap_dest->store = snap_src->store; + snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow; + snap_src->store = u.store_swap; + + snap_dest->store->snap = snap_dest; + snap_src->store->snap = snap_src; + + snap_dest->ti->max_io_len = snap_dest->store->chunk_size; + snap_dest->valid = snap_src->valid; + snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed; + + /* + * Set source invalid to ensure it receives no further I/O. + */ + snap_src->valid = 0; +} + +static void snapshot_dtr(struct dm_target *ti) +{ +#ifdef CONFIG_DM_DEBUG + int i; +#endif + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + /* Check whether exception handover must be cancelled */ + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest && (s == snap_src)) { + down_write(&snap_dest->lock); + snap_dest->valid = 0; + up_write(&snap_dest->lock); + DMERR("Cancelling snapshot handover."); + } + up_read(&_origins_lock); + + if (dm_target_is_snapshot_merge(ti)) + stop_merge(s); + + /* Prevent further origin writes from using this snapshot. */ + /* After this returns there can be no new kcopyd jobs. */ + unregister_snapshot(s); + + while (atomic_read(&s->pending_exceptions_count)) + msleep(1); + /* + * Ensure instructions in mempool_exit aren't reordered + * before atomic_read. + */ + smp_mb(); + +#ifdef CONFIG_DM_DEBUG + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); +#endif + + __free_exceptions(s); + + mempool_exit(&s->pending_pool); + + dm_exception_store_destroy(s->store); + + dm_put_device(ti, s->cow); + + dm_put_device(ti, s->origin); + + WARN_ON(s->in_progress); + + kfree(s); +} + +static void account_start_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + s->in_progress++; + spin_unlock(&s->in_progress_wait.lock); +} + +static void account_end_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + BUG_ON(!s->in_progress); + s->in_progress--; + if (likely(s->in_progress <= cow_threshold) && + unlikely(waitqueue_active(&s->in_progress_wait))) + wake_up_locked(&s->in_progress_wait); + spin_unlock(&s->in_progress_wait.lock); +} + +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) +{ + if (unlikely(s->in_progress > cow_threshold)) { + spin_lock(&s->in_progress_wait.lock); + if (likely(s->in_progress > cow_threshold)) { + /* + * NOTE: this throttle doesn't account for whether + * the caller is servicing an IO that will trigger a COW + * so excess throttling may result for chunks not required + * to be COW'd. But if cow_threshold was reached, extra + * throttling is unlikely to negatively impact performance. + */ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&s->in_progress_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&s->in_progress_wait.lock); + if (unlock_origins) + up_read(&_origins_lock); + io_schedule(); + remove_wait_queue(&s->in_progress_wait, &wait); + return false; + } + spin_unlock(&s->in_progress_wait.lock); + } + return true; +} + +/* + * Flush a list of buffers. + */ +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + submit_bio_noacct(bio); + bio = n; + } +} + +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); + +/* + * Flush a list of buffers. + */ +static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) +{ + struct bio *n; + int r; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + r = do_origin(s->origin, bio, false); + if (r == DM_MAPIO_REMAPPED) + submit_bio_noacct(bio); + bio = n; + } +} + +/* + * Error a list of buffers. + */ +static void error_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + bio_io_error(bio); + bio = n; + } +} + +static void __invalidate_snapshot(struct dm_snapshot *s, int err) +{ + if (!s->valid) + return; + + if (err == -EIO) + DMERR("Invalidating snapshot: Error reading/writing."); + else if (err == -ENOMEM) + DMERR("Invalidating snapshot: Unable to allocate exception."); + + if (s->store->type->drop_snapshot) + s->store->type->drop_snapshot(s->store); + + s->valid = 0; + + dm_table_event(s->ti->table); +} + +static void invalidate_snapshot(struct dm_snapshot *s, int err) +{ + down_write(&s->lock); + __invalidate_snapshot(s, err); + up_write(&s->lock); +} + +static void pending_complete(void *context, int success) +{ + struct dm_snap_pending_exception *pe = context; + struct dm_exception *e; + struct dm_snapshot *s = pe->snap; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; + struct dm_exception_table_lock lock; + int error = 0; + + dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); + + if (!success) { + /* Read/write error - snapshot is unusable */ + invalidate_snapshot(s, -EIO); + error = 1; + + dm_exception_table_lock(&lock); + goto out; + } + + e = alloc_completed_exception(GFP_NOIO); + if (!e) { + invalidate_snapshot(s, -ENOMEM); + error = 1; + + dm_exception_table_lock(&lock); + goto out; + } + *e = pe->e; + + down_read(&s->lock); + dm_exception_table_lock(&lock); + if (!s->valid) { + up_read(&s->lock); + free_completed_exception(e); + error = 1; + + goto out; + } + + /* + * Add a proper exception. After inserting the completed exception all + * subsequent snapshot reads to this chunk will be redirected to the + * COW device. This ensures that we do not starve. Moreover, as long + * as the pending exception exists, neither origin writes nor snapshot + * merging can overwrite the chunk in origin. + */ + dm_insert_exception(&s->complete, e); + up_read(&s->lock); + + /* Wait for conflicting reads to drain */ + if (__chunk_is_tracked(s, pe->e.old_chunk)) { + dm_exception_table_unlock(&lock); + __check_for_conflicting_io(s, pe->e.old_chunk); + dm_exception_table_lock(&lock); + } + +out: + /* Remove the in-flight exception from the list */ + dm_remove_exception(&pe->e); + + dm_exception_table_unlock(&lock); + + snapshot_bios = bio_list_get(&pe->snapshot_bios); + origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) + full_bio->bi_end_io = pe->full_bio_end_io; + increment_pending_exceptions_done_count(); + + /* Submit any pending write bios */ + if (error) { + if (full_bio) + bio_io_error(full_bio); + error_bios(snapshot_bios); + } else { + if (full_bio) + bio_endio(full_bio); + flush_bios(snapshot_bios); + } + + retry_origin_bios(s, origin_bios); + + free_pending_exception(pe); +} + +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error, + pending_complete, pe); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snap_pending_exception *pe = context; + struct dm_snapshot *s = pe->snap; + + pe->copy_error = read_err || write_err; + + if (pe->exception_sequence == s->exception_complete_sequence) { + struct rb_node *next; + + s->exception_complete_sequence++; + complete_exception(pe); + + next = rb_first(&s->out_of_order_tree); + while (next) { + pe = rb_entry(next, struct dm_snap_pending_exception, + out_of_order_node); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + next = rb_next(next); + s->exception_complete_sequence++; + rb_erase(&pe->out_of_order_node, &s->out_of_order_tree); + complete_exception(pe); + cond_resched(); + } + } else { + struct rb_node *parent = NULL; + struct rb_node **p = &s->out_of_order_tree.rb_node; + struct dm_snap_pending_exception *pe2; + + while (*p) { + pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node); + parent = *p; + + BUG_ON(pe->exception_sequence == pe2->exception_sequence); + if (pe->exception_sequence < pe2->exception_sequence) + p = &((*p)->rb_left); + else + p = &((*p)->rb_right); + } + + rb_link_node(&pe->out_of_order_node, parent, p); + rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); + } + account_end_copy(s); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static void start_copy(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct dm_io_region src, dest; + struct block_device *bdev = s->origin->bdev; + sector_t dev_size; + + dev_size = get_dev_size(bdev); + + src.bdev = bdev; + src.sector = chunk_to_sector(s->store, pe->e.old_chunk); + src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); + + dest.bdev = s->cow->bdev; + dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + account_start_copy(s); + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + + account_start_copy(s); + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + submit_bio_noacct(bio); +} + +static struct dm_snap_pending_exception * +__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); + + if (!e) + return NULL; + + return container_of(e, struct dm_snap_pending_exception, e); +} + +/* + * Inserts a pending exception into the pending table. + * + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. + */ +static struct dm_snap_pending_exception * +__insert_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + pe->e.old_chunk = chunk; + bio_list_init(&pe->origin_bios); + bio_list_init(&pe->snapshot_bios); + pe->started = 0; + pe->full_bio = NULL; + + spin_lock(&s->pe_allocation_lock); + if (s->store->type->prepare_exception(s->store, &pe->e)) { + spin_unlock(&s->pe_allocation_lock); + free_pending_exception(pe); + return NULL; + } + + pe->exception_sequence = s->exception_start_sequence++; + spin_unlock(&s->pe_allocation_lock); + + dm_insert_exception(&s->pending, &pe->e); + + return pe; +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. + */ +static struct dm_snap_pending_exception * +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + struct dm_snap_pending_exception *pe2; + + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { + free_pending_exception(pe); + return pe2; + } + + return __insert_pending_exception(s, pe, chunk); +} + +static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, + struct bio *bio, chunk_t chunk) +{ + bio_set_dev(bio, s->cow->bdev); + bio->bi_iter.bi_sector = + chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_iter.bi_sector & s->store->chunk_mask); +} + +static void zero_callback(int read_err, unsigned long write_err, void *context) +{ + struct bio *bio = context; + struct dm_snapshot *s = bio->bi_private; + + account_end_copy(s); + bio->bi_status = write_err ? BLK_STS_IOERR : 0; + bio_endio(bio); +} + +static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, + struct bio *bio, chunk_t chunk) +{ + struct dm_io_region dest; + + dest.bdev = s->cow->bdev; + dest.sector = bio->bi_iter.bi_sector; + dest.count = s->store->chunk_size; + + account_start_copy(s); + WARN_ON_ONCE(bio->bi_private); + bio->bi_private = s; + dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); +} + +static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio) +{ + return bio->bi_iter.bi_size == + (s->store->chunk_size << SECTOR_SHIFT); +} + +static int snapshot_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + struct dm_snap_pending_exception *pe = NULL; + struct dm_exception_table_lock lock; + + init_tracked_chunk(bio); + + if (bio->bi_opf & REQ_PREFLUSH) { + bio_set_dev(bio, s->cow->bdev); + return DM_MAPIO_REMAPPED; + } + + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + dm_exception_table_lock_init(s, chunk, &lock); + + /* Full snapshots are not usable */ + /* To get here the table must be live so s->active is always set. */ + if (!s->valid) + return DM_MAPIO_KILL; + + if (bio_data_dir(bio) == WRITE) { + while (unlikely(!wait_for_in_progress(s, false))) + ; /* wait_for_in_progress() has slept */ + } + + down_read(&s->lock); + dm_exception_table_lock(&lock); + + if (!s->valid || (unlikely(s->snapshot_overflowed) && + bio_data_dir(bio) == WRITE)) { + r = DM_MAPIO_KILL; + goto out_unlock; + } + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) { + /* + * passdown discard to origin (without triggering + * snapshot exceptions via do_origin; doing so would + * defeat the goal of freeing space in origin that is + * implied by the "discard_passdown_origin" feature) + */ + bio_set_dev(bio, s->origin->bdev); + track_chunk(s, bio, chunk); + goto out_unlock; + } + /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */ + } + + /* If the block is already remapped - use that, else remap it */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio, chunk); + if (unlikely(bio_op(bio) == REQ_OP_DISCARD) && + io_overlaps_chunk(s, bio)) { + dm_exception_table_unlock(&lock); + up_read(&s->lock); + zero_exception(s, e, bio, chunk); + r = DM_MAPIO_SUBMITTED; /* discard is not issued */ + goto out; + } + goto out_unlock; + } + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + /* + * If no exception exists, complete discard immediately + * otherwise it'll trigger copy-out. + */ + bio_endio(bio); + r = DM_MAPIO_SUBMITTED; + goto out_unlock; + } + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writable. + */ + if (bio_data_dir(bio) == WRITE) { + pe = __lookup_pending_exception(s, chunk); + if (!pe) { + dm_exception_table_unlock(&lock); + pe = alloc_pending_exception(s); + dm_exception_table_lock(&lock); + + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + free_pending_exception(pe); + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + pe = __find_pending_exception(s, pe, chunk); + if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&s->lock); + + down_write(&s->lock); + + if (s->store->userspace_supports_overflow) { + if (s->valid && !s->snapshot_overflowed) { + s->snapshot_overflowed = 1; + DMERR("Snapshot overflowed: Unable to allocate exception."); + } + } else + __invalidate_snapshot(s, -ENOMEM); + up_write(&s->lock); + + r = DM_MAPIO_KILL; + goto out; + } + } + + remap_exception(s, &pe->e, bio, chunk); + + r = DM_MAPIO_SUBMITTED; + + if (!pe->started && io_overlaps_chunk(s, bio)) { + pe->started = 1; + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + + if (!pe->started) { + /* this is protected by the exception table lock */ + pe->started = 1; + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + + start_copy(pe); + goto out; + } + } else { + bio_set_dev(bio, s->origin->bdev); + track_chunk(s, bio, chunk); + } + +out_unlock: + dm_exception_table_unlock(&lock); + up_read(&s->lock); +out: + return r; +} + +/* + * A snapshot-merge target behaves like a combination of a snapshot + * target and a snapshot-origin target. It only generates new + * exceptions in other snapshots and not in the one that is being + * merged. + * + * For each chunk, if there is an existing exception, it is used to + * redirect I/O to the cow device. Otherwise I/O is sent to the origin, + * which in turn might generate exceptions in other snapshots. + * If merging is currently taking place on the chunk in question, the + * I/O is deferred by adding it to s->bios_queued_during_merge. + */ +static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + + init_tracked_chunk(bio); + + if (bio->bi_opf & REQ_PREFLUSH) { + if (!dm_bio_get_target_bio_nr(bio)) + bio_set_dev(bio, s->origin->bdev); + else + bio_set_dev(bio, s->cow->bdev); + return DM_MAPIO_REMAPPED; + } + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + /* Once merging, discards no longer effect change */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + + down_write(&s->lock); + + /* Full merging snapshots are redirected to the origin */ + if (!s->valid) + goto redirect_to_origin; + + /* If the block is already remapped - use that */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + /* Queue writes overlapping with chunks being merged */ + if (bio_data_dir(bio) == WRITE && + chunk >= s->first_merging_chunk && + chunk < (s->first_merging_chunk + + s->num_merging_chunks)) { + bio_set_dev(bio, s->origin->bdev); + bio_list_add(&s->bios_queued_during_merge, bio); + r = DM_MAPIO_SUBMITTED; + goto out_unlock; + } + + remap_exception(s, e, bio, chunk); + + if (bio_data_dir(bio) == WRITE) + track_chunk(s, bio, chunk); + goto out_unlock; + } + +redirect_to_origin: + bio_set_dev(bio, s->origin->bdev); + + if (bio_data_dir(bio) == WRITE) { + up_write(&s->lock); + return do_origin(s->origin, bio, false); + } + +out_unlock: + up_write(&s->lock); + + return r; +} + +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) +{ + struct dm_snapshot *s = ti->private; + + if (is_bio_tracked(bio)) + stop_tracking_chunk(s, bio); + + return DM_ENDIO_DONE; +} + +static void snapshot_merge_presuspend(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + stop_merge(s); +} + +static int snapshot_preresume(struct dm_target *ti) +{ + int r = 0; + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_read(&snap_src->lock); + if (s == snap_src) { + DMERR("Unable to resume snapshot source until handover completes."); + r = -EINVAL; + } else if (!dm_suspended(snap_src->ti)) { + DMERR("Unable to perform snapshot handover until source is suspended."); + r = -EINVAL; + } + up_read(&snap_src->lock); + } + up_read(&_origins_lock); + + return r; +} + +static void snapshot_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL; + struct dm_origin *o; + struct mapped_device *origin_md = NULL; + bool must_restart_merging = false; + + down_read(&_origins_lock); + + o = __lookup_dm_origin(s->origin->bdev); + if (o) + origin_md = dm_table_get_md(o->ti->table); + if (!origin_md) { + (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging); + if (snap_merging) + origin_md = dm_table_get_md(snap_merging->ti->table); + } + if (origin_md == dm_table_get_md(ti->table)) + origin_md = NULL; + if (origin_md) { + if (dm_hold(origin_md)) + origin_md = NULL; + } + + up_read(&_origins_lock); + + if (origin_md) { + dm_internal_suspend_fast(origin_md); + if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) { + must_restart_merging = true; + stop_merge(snap_merging); + } + } + + down_read(&_origins_lock); + + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + __handover_exceptions(snap_src, snap_dest); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); + } + + up_read(&_origins_lock); + + if (origin_md) { + if (must_restart_merging) + start_merge(snap_merging); + dm_internal_resume_fast(origin_md); + dm_put(origin_md); + } + + /* Now we have correct chunk size, reregister */ + reregister_snapshot(s); + + down_write(&s->lock); + s->active = 1; + up_write(&s->lock); +} + +static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) +{ + uint32_t min_chunksize; + + down_read(&_origins_lock); + min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); + up_read(&_origins_lock); + + return min_chunksize; +} + +static void snapshot_merge_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + /* + * Handover exceptions from existing snapshot. + */ + snapshot_resume(ti); + + /* + * snapshot-merge acts as an origin, so set ti->max_io_len + */ + ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); + + start_merge(s); +} + +static void snapshot_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct dm_snapshot *snap = ti->private; + unsigned int num_features; + + switch (type) { + case STATUSTYPE_INFO: + + down_write(&snap->lock); + + if (!snap->valid) + DMEMIT("Invalid"); + else if (snap->merge_failed) + DMEMIT("Merge failed"); + else if (snap->snapshot_overflowed) + DMEMIT("Overflow"); + else { + if (snap->store->type->usage) { + sector_t total_sectors, sectors_allocated, + metadata_sectors; + snap->store->type->usage(snap->store, + &total_sectors, + §ors_allocated, + &metadata_sectors); + DMEMIT("%llu/%llu %llu", + (unsigned long long)sectors_allocated, + (unsigned long long)total_sectors, + (unsigned long long)metadata_sectors); + } + else + DMEMIT("Unknown"); + } + + up_write(&snap->lock); + + break; + + case STATUSTYPE_TABLE: + /* + * kdevname returns a static pointer so we need + * to make private copies if the output is to + * make sense. + */ + DMEMIT("%s %s", snap->origin->name, snap->cow->name); + sz += snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); + num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin; + if (num_features) { + DMEMIT(" %u", num_features); + if (snap->discard_zeroes_cow) + DMEMIT(" discard_zeroes_cow"); + if (snap->discard_passdown_origin) + DMEMIT(" discard_passdown_origin"); + } + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",snap_origin_name=%s", snap->origin->name); + DMEMIT(",snap_cow_name=%s", snap->cow->name); + DMEMIT(",snap_valid=%c", snap->valid ? 'y' : 'n'); + DMEMIT(",snap_merge_failed=%c", snap->merge_failed ? 'y' : 'n'); + DMEMIT(",snapshot_overflowed=%c", snap->snapshot_overflowed ? 'y' : 'n'); + DMEMIT(";"); + break; + } +} + +static int snapshot_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); + + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); + + return r; +} + +static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_snapshot *snap = ti->private; + + if (snap->discard_zeroes_cow) { + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + + (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) + snap = snap_src; + + /* All discards are split on chunk_size boundary */ + limits->discard_granularity = snap->store->chunk_size; + limits->max_discard_sectors = snap->store->chunk_size; + + up_read(&_origins_lock); + } +} + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ + +/* + * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any + * supplied bio was ignored. The caller may submit it immediately. + * (No remapping actually occurs as the origin is always a direct linear + * map.) + * + * If further exceptions are required, DM_MAPIO_SUBMITTED is returned + * and any supplied bio is added to a list to be submitted once all + * the necessary exceptions exist. + */ +static int __origin_write(struct list_head *snapshots, sector_t sector, + struct bio *bio) +{ + int r = DM_MAPIO_REMAPPED; + struct dm_snapshot *snap; + struct dm_exception *e; + struct dm_snap_pending_exception *pe, *pe2; + struct dm_snap_pending_exception *pe_to_start_now = NULL; + struct dm_snap_pending_exception *pe_to_start_last = NULL; + struct dm_exception_table_lock lock; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each_entry(snap, snapshots, list) { + /* + * Don't make new exceptions in a merging snapshot + * because it has effectively been deleted + */ + if (dm_target_is_snapshot_merge(snap->ti)) + continue; + + /* Nothing to do if writing beyond end of snapshot */ + if (sector >= dm_table_get_size(snap->ti->table)) + continue; + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap->store, sector); + dm_exception_table_lock_init(snap, chunk, &lock); + + down_read(&snap->lock); + dm_exception_table_lock(&lock); + + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + goto next_snapshot; + + pe = __lookup_pending_exception(snap, chunk); + if (!pe) { + /* + * Check exception table to see if block is already + * remapped in this snapshot and trigger an exception + * if not. + */ + e = dm_lookup_exception(&snap->complete, chunk); + if (e) + goto next_snapshot; + + dm_exception_table_unlock(&lock); + pe = alloc_pending_exception(snap); + dm_exception_table_lock(&lock); + + pe2 = __lookup_pending_exception(snap, chunk); + + if (!pe2) { + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __insert_pending_exception(snap, pe, chunk); + if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&snap->lock); + + invalidate_snapshot(snap, -ENOMEM); + continue; + } + } else { + free_pending_exception(pe); + pe = pe2; + } + } + + r = DM_MAPIO_SUBMITTED; + + /* + * If an origin bio was supplied, queue it to wait for the + * completion of this exception, and start this one last, + * at the end of the function. + */ + if (bio) { + bio_list_add(&pe->origin_bios, bio); + bio = NULL; + + if (!pe->started) { + pe->started = 1; + pe_to_start_last = pe; + } + } + + if (!pe->started) { + pe->started = 1; + pe_to_start_now = pe; + } + +next_snapshot: + dm_exception_table_unlock(&lock); + up_read(&snap->lock); + + if (pe_to_start_now) { + start_copy(pe_to_start_now); + pe_to_start_now = NULL; + } + } + + /* + * Submit the exception against which the bio is queued last, + * to give the other exceptions a head start. + */ + if (pe_to_start_last) + start_copy(pe_to_start_last); + + return r; +} + +/* + * Called on a write from the origin driver. + */ +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) +{ + struct origin *o; + int r = DM_MAPIO_REMAPPED; + +again: + down_read(&_origins_lock); + o = __lookup_origin(origin->bdev); + if (o) { + if (limit) { + struct dm_snapshot *s; + list_for_each_entry(s, &o->snapshots, list) + if (unlikely(!wait_for_in_progress(s, true))) + goto again; + } + + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); + } + up_read(&_origins_lock); + + return r; +} + +/* + * Trigger exceptions in all non-merging snapshots. + * + * The chunk size of the merging snapshot may be larger than the chunk + * size of some other snapshot so we may need to reallocate multiple + * chunks in other snapshots. + * + * We scan all the overlapping exceptions in the other snapshots. + * Returns 1 if anything was reallocated and must be waited for, + * otherwise returns 0. + * + * size must be a multiple of merging_snap's chunk_size. + */ +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned int size) +{ + int must_wait = 0; + sector_t n; + struct origin *o; + + /* + * The origin's __minimum_chunk_size() got stored in max_io_len + * by snapshot_merge_resume(). + */ + down_read(&_origins_lock); + o = __lookup_origin(merging_snap->origin->bdev); + for (n = 0; n < size; n += merging_snap->ti->max_io_len) + if (__origin_write(&o->snapshots, sector + n, NULL) == + DM_MAPIO_SUBMITTED) + must_wait = 1; + up_read(&_origins_lock); + + return must_wait; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. + */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_origin *o; + + if (argc != 1) { + ti->error = "origin: incorrect number of arguments"; + return -EINVAL; + } + + o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL); + if (!o) { + ti->error = "Cannot allocate private origin structure"; + r = -ENOMEM; + goto bad_alloc; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev); + if (r) { + ti->error = "Cannot get target device"; + goto bad_open; + } + + o->ti = ti; + ti->private = o; + ti->num_flush_bios = 1; + + return 0; + +bad_open: + kfree(o); +bad_alloc: + return r; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_origin *o = ti->private; + + dm_put_device(ti, o->dev); + kfree(o); +} + +static int origin_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_origin *o = ti->private; + unsigned int available_sectors; + + bio_set_dev(bio, o->dev->bdev); + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + return DM_MAPIO_REMAPPED; + + if (bio_data_dir(bio) != WRITE) + return DM_MAPIO_REMAPPED; + + available_sectors = o->split_boundary - + ((unsigned int)bio->bi_iter.bi_sector & (o->split_boundary - 1)); + + if (bio_sectors(bio) > available_sectors) + dm_accept_partial_bio(bio, available_sectors); + + /* Only tell snapshots if this is a write */ + return do_origin(o->dev, bio, true); +} + +/* + * Set the target "max_io_len" field to the minimum of all the snapshots' + * chunk sizes. + */ +static void origin_resume(struct dm_target *ti) +{ + struct dm_origin *o = ti->private; + + o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev); + + down_write(&_origins_lock); + __insert_dm_origin(o); + up_write(&_origins_lock); +} + +static void origin_postsuspend(struct dm_target *ti) +{ + struct dm_origin *o = ti->private; + + down_write(&_origins_lock); + __remove_dm_origin(o); + up_write(&_origins_lock); +} + +static void origin_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dm_origin *o = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s", o->dev->name); + break; + case STATUSTYPE_IMA: + result[0] = '\0'; + break; + } +} + +static int origin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_origin *o = ti->private; + + return fn(ti, o->dev, 0, ti->len, data); +} + +static struct target_type origin_target = { + .name = "snapshot-origin", + .version = {1, 9, 0}, + .module = THIS_MODULE, + .ctr = origin_ctr, + .dtr = origin_dtr, + .map = origin_map, + .resume = origin_resume, + .postsuspend = origin_postsuspend, + .status = origin_status, + .iterate_devices = origin_iterate_devices, +}; + +static struct target_type snapshot_target = { + .name = "snapshot", + .version = {1, 16, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_map, + .end_io = snapshot_end_io, + .preresume = snapshot_preresume, + .resume = snapshot_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, + .io_hints = snapshot_io_hints, +}; + +static struct target_type merge_target = { + .name = dm_snapshot_merge_target_name, + .version = {1, 5, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_merge_map, + .end_io = snapshot_end_io, + .presuspend = snapshot_merge_presuspend, + .preresume = snapshot_preresume, + .resume = snapshot_merge_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, + .io_hints = snapshot_io_hints, +}; + +static int __init dm_snapshot_init(void) +{ + int r; + + r = dm_exception_store_init(); + if (r) { + DMERR("Failed to initialize exception stores"); + return r; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad_origin_hash; + } + + exception_cache = KMEM_CACHE(dm_exception, 0); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad_exception_cache; + } + + pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad_pending_cache; + } + + r = dm_register_target(&snapshot_target); + if (r < 0) { + DMERR("snapshot target register failed %d", r); + goto bad_register_snapshot_target; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Origin target register failed %d", r); + goto bad_register_origin_target; + } + + r = dm_register_target(&merge_target); + if (r < 0) { + DMERR("Merge target register failed %d", r); + goto bad_register_merge_target; + } + + return 0; + +bad_register_merge_target: + dm_unregister_target(&origin_target); +bad_register_origin_target: + dm_unregister_target(&snapshot_target); +bad_register_snapshot_target: + kmem_cache_destroy(pending_cache); +bad_pending_cache: + kmem_cache_destroy(exception_cache); +bad_exception_cache: + exit_origin_hash(); +bad_origin_hash: + dm_exception_store_exit(); + + return r; +} + +static void __exit dm_snapshot_exit(void) +{ + dm_unregister_target(&snapshot_target); + dm_unregister_target(&origin_target); + dm_unregister_target(&merge_target); + + exit_origin_hash(); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); + + dm_exception_store_exit(); +} + +/* Module hooks */ +module_init(dm_snapshot_init); +module_exit(dm_snapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " snapshot target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("dm-snapshot-origin"); +MODULE_ALIAS("dm-snapshot-merge"); diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c new file mode 100644 index 000000000..7eeb3c2a2 --- /dev/null +++ b/drivers/md/dm-stats.c @@ -0,0 +1,1249 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-core.h" +#include "dm-stats.h" + +#define DM_MSG_PREFIX "stats" + +static int dm_stat_need_rcu_barrier; + +/* + * Using 64-bit values to avoid overflow (which is a + * problem that block/genhd.c's IO accounting has). + */ +struct dm_stat_percpu { + unsigned long long sectors[2]; + unsigned long long ios[2]; + unsigned long long merges[2]; + unsigned long long ticks[2]; + unsigned long long io_ticks[2]; + unsigned long long io_ticks_total; + unsigned long long time_in_queue; + unsigned long long *histogram; +}; + +struct dm_stat_shared { + atomic_t in_flight[2]; + unsigned long long stamp; + struct dm_stat_percpu tmp; +}; + +struct dm_stat { + struct list_head list_entry; + int id; + unsigned int stat_flags; + size_t n_entries; + sector_t start; + sector_t end; + sector_t step; + unsigned int n_histogram_entries; + unsigned long long *histogram_boundaries; + const char *program_id; + const char *aux_data; + struct rcu_head rcu_head; + size_t shared_alloc_size; + size_t percpu_alloc_size; + size_t histogram_alloc_size; + struct dm_stat_percpu *stat_percpu[NR_CPUS]; + struct dm_stat_shared stat_shared[]; +}; + +#define STAT_PRECISE_TIMESTAMPS 1 + +struct dm_stats_last_position { + sector_t last_sector; + unsigned int last_rw; +}; + +/* + * A typo on the command line could possibly make the kernel run out of memory + * and crash. To prevent the crash we account all used memory. We fail if we + * exhaust 1/4 of all memory or 1/2 of vmalloc space. + */ +#define DM_STATS_MEMORY_FACTOR 4 +#define DM_STATS_VMALLOC_FACTOR 2 + +static DEFINE_SPINLOCK(shared_memory_lock); + +static unsigned long shared_memory_amount; + +static bool __check_shared_memory(size_t alloc_size) +{ + size_t a; + + a = shared_memory_amount + alloc_size; + if (a < shared_memory_amount) + return false; + if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) + return false; +#ifdef CONFIG_MMU + if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) + return false; +#endif + return true; +} + +static bool check_shared_memory(size_t alloc_size) +{ + bool ret; + + spin_lock_irq(&shared_memory_lock); + + ret = __check_shared_memory(alloc_size); + + spin_unlock_irq(&shared_memory_lock); + + return ret; +} + +static bool claim_shared_memory(size_t alloc_size) +{ + spin_lock_irq(&shared_memory_lock); + + if (!__check_shared_memory(alloc_size)) { + spin_unlock_irq(&shared_memory_lock); + return false; + } + + shared_memory_amount += alloc_size; + + spin_unlock_irq(&shared_memory_lock); + + return true; +} + +static void free_shared_memory(size_t alloc_size) +{ + unsigned long flags; + + spin_lock_irqsave(&shared_memory_lock, flags); + + if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { + spin_unlock_irqrestore(&shared_memory_lock, flags); + DMCRIT("Memory usage accounting bug."); + return; + } + + shared_memory_amount -= alloc_size; + + spin_unlock_irqrestore(&shared_memory_lock, flags); +} + +static void *dm_kvzalloc(size_t alloc_size, int node) +{ + void *p; + + if (!claim_shared_memory(alloc_size)) + return NULL; + + p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); + if (p) + return p; + + free_shared_memory(alloc_size); + + return NULL; +} + +static void dm_kvfree(void *ptr, size_t alloc_size) +{ + if (!ptr) + return; + + free_shared_memory(alloc_size); + + kvfree(ptr); +} + +static void dm_stat_free(struct rcu_head *head) +{ + int cpu; + struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + + kfree(s->histogram_boundaries); + kfree(s->program_id); + kfree(s->aux_data); + for_each_possible_cpu(cpu) { + dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); + dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); + } + dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); + dm_kvfree(s, s->shared_alloc_size); +} + +static int dm_stat_in_flight(struct dm_stat_shared *shared) +{ + return atomic_read(&shared->in_flight[READ]) + + atomic_read(&shared->in_flight[WRITE]); +} + +int dm_stats_init(struct dm_stats *stats) +{ + int cpu; + struct dm_stats_last_position *last; + + mutex_init(&stats->mutex); + INIT_LIST_HEAD(&stats->list); + stats->precise_timestamps = false; + stats->last = alloc_percpu(struct dm_stats_last_position); + if (!stats->last) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + last = per_cpu_ptr(stats->last, cpu); + last->last_sector = (sector_t)ULLONG_MAX; + last->last_rw = UINT_MAX; + } + + return 0; +} + +void dm_stats_cleanup(struct dm_stats *stats) +{ + size_t ni; + struct dm_stat *s; + struct dm_stat_shared *shared; + + while (!list_empty(&stats->list)) { + s = container_of(stats->list.next, struct dm_stat, list_entry); + list_del(&s->list_entry); + for (ni = 0; ni < s->n_entries; ni++) { + shared = &s->stat_shared[ni]; + if (WARN_ON(dm_stat_in_flight(shared))) { + DMCRIT("leaked in-flight counter at index %lu " + "(start %llu, end %llu, step %llu): reads %d, writes %d", + (unsigned long)ni, + (unsigned long long)s->start, + (unsigned long long)s->end, + (unsigned long long)s->step, + atomic_read(&shared->in_flight[READ]), + atomic_read(&shared->in_flight[WRITE])); + } + cond_resched(); + } + dm_stat_free(&s->rcu_head); + } + free_percpu(stats->last); + mutex_destroy(&stats->mutex); +} + +static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats) +{ + struct list_head *l; + struct dm_stat *tmp_s; + bool precise_timestamps = false; + + list_for_each(l, &stats->list) { + tmp_s = container_of(l, struct dm_stat, list_entry); + if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) { + precise_timestamps = true; + break; + } + } + stats->precise_timestamps = precise_timestamps; +} + +static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, + sector_t step, unsigned int stat_flags, + unsigned int n_histogram_entries, + unsigned long long *histogram_boundaries, + const char *program_id, const char *aux_data, + void (*suspend_callback)(struct mapped_device *), + void (*resume_callback)(struct mapped_device *), + struct mapped_device *md) +{ + struct list_head *l; + struct dm_stat *s, *tmp_s; + sector_t n_entries; + size_t ni; + size_t shared_alloc_size; + size_t percpu_alloc_size; + size_t histogram_alloc_size; + struct dm_stat_percpu *p; + int cpu; + int ret_id; + int r; + + if (end < start || !step) + return -EINVAL; + + n_entries = end - start; + if (dm_sector_div64(n_entries, step)) + n_entries++; + + if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) + return -EOVERFLOW; + + shared_alloc_size = struct_size(s, stat_shared, n_entries); + if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) + return -EOVERFLOW; + + percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); + if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) + return -EOVERFLOW; + + histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); + if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) + return -EOVERFLOW; + + if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + + num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) + return -ENOMEM; + + s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); + if (!s) + return -ENOMEM; + + s->stat_flags = stat_flags; + s->n_entries = n_entries; + s->start = start; + s->end = end; + s->step = step; + s->shared_alloc_size = shared_alloc_size; + s->percpu_alloc_size = percpu_alloc_size; + s->histogram_alloc_size = histogram_alloc_size; + + s->n_histogram_entries = n_histogram_entries; + s->histogram_boundaries = kmemdup(histogram_boundaries, + s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); + if (!s->histogram_boundaries) { + r = -ENOMEM; + goto out; + } + + s->program_id = kstrdup(program_id, GFP_KERNEL); + if (!s->program_id) { + r = -ENOMEM; + goto out; + } + s->aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!s->aux_data) { + r = -ENOMEM; + goto out; + } + + for (ni = 0; ni < n_entries; ni++) { + atomic_set(&s->stat_shared[ni].in_flight[READ], 0); + atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + cond_resched(); + } + + if (s->n_histogram_entries) { + unsigned long long *hi; + hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); + if (!hi) { + r = -ENOMEM; + goto out; + } + for (ni = 0; ni < n_entries; ni++) { + s->stat_shared[ni].tmp.histogram = hi; + hi += s->n_histogram_entries + 1; + cond_resched(); + } + } + + for_each_possible_cpu(cpu) { + p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); + if (!p) { + r = -ENOMEM; + goto out; + } + s->stat_percpu[cpu] = p; + if (s->n_histogram_entries) { + unsigned long long *hi; + hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); + if (!hi) { + r = -ENOMEM; + goto out; + } + for (ni = 0; ni < n_entries; ni++) { + p[ni].histogram = hi; + hi += s->n_histogram_entries + 1; + cond_resched(); + } + } + } + + /* + * Suspend/resume to make sure there is no i/o in flight, + * so that newly created statistics will be exact. + * + * (note: we couldn't suspend earlier because we must not + * allocate memory while suspended) + */ + suspend_callback(md); + + mutex_lock(&stats->mutex); + s->id = 0; + list_for_each(l, &stats->list) { + tmp_s = container_of(l, struct dm_stat, list_entry); + if (WARN_ON(tmp_s->id < s->id)) { + r = -EINVAL; + goto out_unlock_resume; + } + if (tmp_s->id > s->id) + break; + if (unlikely(s->id == INT_MAX)) { + r = -ENFILE; + goto out_unlock_resume; + } + s->id++; + } + ret_id = s->id; + list_add_tail_rcu(&s->list_entry, l); + + dm_stats_recalc_precise_timestamps(stats); + + if (!static_key_enabled(&stats_enabled.key)) + static_branch_enable(&stats_enabled); + + mutex_unlock(&stats->mutex); + + resume_callback(md); + + return ret_id; + +out_unlock_resume: + mutex_unlock(&stats->mutex); + resume_callback(md); +out: + dm_stat_free(&s->rcu_head); + return r; +} + +static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + list_for_each_entry(s, &stats->list, list_entry) { + if (s->id > id) + break; + if (s->id == id) + return s; + } + + return NULL; +} + +static int dm_stats_delete(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + int cpu; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + list_del_rcu(&s->list_entry); + + dm_stats_recalc_precise_timestamps(stats); + + mutex_unlock(&stats->mutex); + + /* + * vfree can't be called from RCU callback + */ + for_each_possible_cpu(cpu) + if (is_vmalloc_addr(s->stat_percpu) || + is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) + goto do_sync_free; + if (is_vmalloc_addr(s) || + is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { +do_sync_free: + synchronize_rcu_expedited(); + dm_stat_free(&s->rcu_head); + } else { + WRITE_ONCE(dm_stat_need_rcu_barrier, 1); + call_rcu(&s->rcu_head, dm_stat_free); + } + return 0; +} + +static int dm_stats_list(struct dm_stats *stats, const char *program, + char *result, unsigned int maxlen) +{ + struct dm_stat *s; + sector_t len; + unsigned int sz = 0; + + /* + * Output format: + * : + + */ + + mutex_lock(&stats->mutex); + list_for_each_entry(s, &stats->list, list_entry) { + if (!program || !strcmp(program, s->program_id)) { + len = s->end - s->start; + DMEMIT("%d: %llu+%llu %llu %s %s", s->id, + (unsigned long long)s->start, + (unsigned long long)len, + (unsigned long long)s->step, + s->program_id, + s->aux_data); + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) + DMEMIT(" precise_timestamps"); + if (s->n_histogram_entries) { + unsigned int i; + DMEMIT(" histogram:"); + for (i = 0; i < s->n_histogram_entries; i++) { + if (i) + DMEMIT(","); + DMEMIT("%llu", s->histogram_boundaries[i]); + } + } + DMEMIT("\n"); + } + cond_resched(); + } + mutex_unlock(&stats->mutex); + + return 1; +} + +static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, + struct dm_stat_percpu *p) +{ + /* + * This is racy, but so is part_round_stats_single. + */ + unsigned long long now, difference; + unsigned int in_flight_read, in_flight_write; + + if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) + now = jiffies; + else + now = ktime_to_ns(ktime_get()); + + difference = now - shared->stamp; + if (!difference) + return; + + in_flight_read = (unsigned int)atomic_read(&shared->in_flight[READ]); + in_flight_write = (unsigned int)atomic_read(&shared->in_flight[WRITE]); + if (in_flight_read) + p->io_ticks[READ] += difference; + if (in_flight_write) + p->io_ticks[WRITE] += difference; + if (in_flight_read + in_flight_write) { + p->io_ticks_total += difference; + p->time_in_queue += (in_flight_read + in_flight_write) * difference; + } + shared->stamp = now; +} + +static void dm_stat_for_entry(struct dm_stat *s, size_t entry, + int idx, sector_t len, + struct dm_stats_aux *stats_aux, bool end, + unsigned long duration_jiffies) +{ + struct dm_stat_shared *shared = &s->stat_shared[entry]; + struct dm_stat_percpu *p; + + /* + * For strict correctness we should use local_irq_save/restore + * instead of preempt_disable/enable. + * + * preempt_disable/enable is racy if the driver finishes bios + * from non-interrupt context as well as from interrupt context + * or from more different interrupts. + * + * On 64-bit architectures the race only results in not counting some + * events, so it is acceptable. On 32-bit architectures the race could + * cause the counter going off by 2^32, so we need to do proper locking + * there. + * + * part_stat_lock()/part_stat_unlock() have this race too. + */ +#if BITS_PER_LONG == 32 + unsigned long flags; + local_irq_save(flags); +#else + preempt_disable(); +#endif + p = &s->stat_percpu[smp_processor_id()][entry]; + + if (!end) { + dm_stat_round(s, shared, p); + atomic_inc(&shared->in_flight[idx]); + } else { + unsigned long long duration; + dm_stat_round(s, shared, p); + atomic_dec(&shared->in_flight[idx]); + p->sectors[idx] += len; + p->ios[idx] += 1; + p->merges[idx] += stats_aux->merged; + if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { + p->ticks[idx] += duration_jiffies; + duration = jiffies_to_msecs(duration_jiffies); + } else { + p->ticks[idx] += stats_aux->duration_ns; + duration = stats_aux->duration_ns; + } + if (s->n_histogram_entries) { + unsigned int lo = 0, hi = s->n_histogram_entries + 1; + while (lo + 1 < hi) { + unsigned int mid = (lo + hi) / 2; + if (s->histogram_boundaries[mid - 1] > duration) { + hi = mid; + } else { + lo = mid; + } + + } + p->histogram[lo]++; + } + } + +#if BITS_PER_LONG == 32 + local_irq_restore(flags); +#else + preempt_enable(); +#endif +} + +static void __dm_stat_bio(struct dm_stat *s, int bi_rw, + sector_t bi_sector, sector_t end_sector, + bool end, unsigned long duration_jiffies, + struct dm_stats_aux *stats_aux) +{ + sector_t rel_sector, offset, todo, fragment_len; + size_t entry; + + if (end_sector <= s->start || bi_sector >= s->end) + return; + if (unlikely(bi_sector < s->start)) { + rel_sector = 0; + todo = end_sector - s->start; + } else { + rel_sector = bi_sector - s->start; + todo = end_sector - bi_sector; + } + if (unlikely(end_sector > s->end)) + todo -= (end_sector - s->end); + + offset = dm_sector_div64(rel_sector, s->step); + entry = rel_sector; + do { + if (WARN_ON_ONCE(entry >= s->n_entries)) { + DMCRIT("Invalid area access in region id %d", s->id); + return; + } + fragment_len = todo; + if (fragment_len > s->step - offset) + fragment_len = s->step - offset; + dm_stat_for_entry(s, entry, bi_rw, fragment_len, + stats_aux, end, duration_jiffies); + todo -= fragment_len; + entry++; + offset = 0; + } while (unlikely(todo != 0)); +} + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned int bi_sectors, bool end, + unsigned long start_time, + struct dm_stats_aux *stats_aux) +{ + struct dm_stat *s; + sector_t end_sector; + struct dm_stats_last_position *last; + bool got_precise_time; + unsigned long duration_jiffies = 0; + + if (unlikely(!bi_sectors)) + return; + + end_sector = bi_sector + bi_sectors; + + if (!end) { + /* + * A race condition can at worst result in the merged flag being + * misrepresented, so we don't have to disable preemption here. + */ + last = raw_cpu_ptr(stats->last); + stats_aux->merged = + (bi_sector == (READ_ONCE(last->last_sector) && + ((bi_rw == WRITE) == + (READ_ONCE(last->last_rw) == WRITE)) + )); + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); + } else + duration_jiffies = jiffies - start_time; + + rcu_read_lock(); + + got_precise_time = false; + list_for_each_entry_rcu(s, &stats->list, list_entry) { + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { + /* start (!end) duration_ns is set by DM core's alloc_io() */ + if (end) + stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; + got_precise_time = true; + } + __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); + } + + rcu_read_unlock(); +} + +static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, + struct dm_stat *s, size_t x) +{ + int cpu; + struct dm_stat_percpu *p; + + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + dm_stat_round(s, shared, p); + local_irq_enable(); + + shared->tmp.sectors[READ] = 0; + shared->tmp.sectors[WRITE] = 0; + shared->tmp.ios[READ] = 0; + shared->tmp.ios[WRITE] = 0; + shared->tmp.merges[READ] = 0; + shared->tmp.merges[WRITE] = 0; + shared->tmp.ticks[READ] = 0; + shared->tmp.ticks[WRITE] = 0; + shared->tmp.io_ticks[READ] = 0; + shared->tmp.io_ticks[WRITE] = 0; + shared->tmp.io_ticks_total = 0; + shared->tmp.time_in_queue = 0; + + if (s->n_histogram_entries) + memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); + + for_each_possible_cpu(cpu) { + p = &s->stat_percpu[cpu][x]; + shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); + if (s->n_histogram_entries) { + unsigned int i; + for (i = 0; i < s->n_histogram_entries + 1; i++) + shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); + } + } +} + +static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, + bool init_tmp_percpu_totals) +{ + size_t x; + struct dm_stat_shared *shared; + struct dm_stat_percpu *p; + + for (x = idx_start; x < idx_end; x++) { + shared = &s->stat_shared[x]; + if (init_tmp_percpu_totals) + __dm_stat_init_temporary_percpu_totals(shared, s, x); + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + p->sectors[READ] -= shared->tmp.sectors[READ]; + p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; + p->ios[READ] -= shared->tmp.ios[READ]; + p->ios[WRITE] -= shared->tmp.ios[WRITE]; + p->merges[READ] -= shared->tmp.merges[READ]; + p->merges[WRITE] -= shared->tmp.merges[WRITE]; + p->ticks[READ] -= shared->tmp.ticks[READ]; + p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; + p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; + p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; + p->io_ticks_total -= shared->tmp.io_ticks_total; + p->time_in_queue -= shared->tmp.time_in_queue; + local_irq_enable(); + if (s->n_histogram_entries) { + unsigned int i; + for (i = 0; i < s->n_histogram_entries + 1; i++) { + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + p->histogram[i] -= shared->tmp.histogram[i]; + local_irq_enable(); + } + } + cond_resched(); + } +} + +static int dm_stats_clear(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + __dm_stat_clear(s, 0, s->n_entries, true); + + mutex_unlock(&stats->mutex); + + return 1; +} + +/* + * This is like jiffies_to_msec, but works for 64-bit values. + */ +static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) +{ + unsigned long long result; + unsigned int mult; + + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) + return j; + + result = 0; + if (j) + result = jiffies_to_msecs(j & 0x3fffff); + if (j >= 1 << 22) { + mult = jiffies_to_msecs(1 << 22); + result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); + } + if (j >= 1ULL << 44) + result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); + + return result; +} + +static int dm_stats_print(struct dm_stats *stats, int id, + size_t idx_start, size_t idx_len, + bool clear, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct dm_stat *s; + size_t x; + sector_t start, end, step; + size_t idx_end; + struct dm_stat_shared *shared; + + /* + * Output format: + * + counters + */ + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + idx_end = idx_start + idx_len; + if (idx_end < idx_start || + idx_end > s->n_entries) + idx_end = s->n_entries; + + if (idx_start > idx_end) + idx_start = idx_end; + + step = s->step; + start = s->start + (step * idx_start); + + for (x = idx_start; x < idx_end; x++, start = end) { + shared = &s->stat_shared[x]; + end = start + step; + if (unlikely(end > s->end)) + end = s->end; + + __dm_stat_init_temporary_percpu_totals(shared, s, x); + + DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", + (unsigned long long)start, + (unsigned long long)step, + shared->tmp.ios[READ], + shared->tmp.merges[READ], + shared->tmp.sectors[READ], + dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), + shared->tmp.ios[WRITE], + shared->tmp.merges[WRITE], + shared->tmp.sectors[WRITE], + dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), + dm_stat_in_flight(shared), + dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), + dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), + dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), + dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); + if (s->n_histogram_entries) { + unsigned int i; + for (i = 0; i < s->n_histogram_entries + 1; i++) { + DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); + } + } + DMEMIT("\n"); + + if (unlikely(sz + 1 >= maxlen)) + goto buffer_overflow; + + cond_resched(); + } + + if (clear) + __dm_stat_clear(s, idx_start, idx_end, false); + +buffer_overflow: + mutex_unlock(&stats->mutex); + + return 1; +} + +static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) +{ + struct dm_stat *s; + const char *new_aux_data; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + new_aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!new_aux_data) { + mutex_unlock(&stats->mutex); + return -ENOMEM; + } + + kfree(s->aux_data); + s->aux_data = new_aux_data; + + mutex_unlock(&stats->mutex); + + return 0; +} + +static int parse_histogram(const char *h, unsigned int *n_histogram_entries, + unsigned long long **histogram_boundaries) +{ + const char *q; + unsigned int n; + unsigned long long last; + + *n_histogram_entries = 1; + for (q = h; *q; q++) + if (*q == ',') + (*n_histogram_entries)++; + + *histogram_boundaries = kmalloc_array(*n_histogram_entries, + sizeof(unsigned long long), + GFP_KERNEL); + if (!*histogram_boundaries) + return -ENOMEM; + + n = 0; + last = 0; + while (1) { + unsigned long long hi; + int s; + char ch; + s = sscanf(h, "%llu%c", &hi, &ch); + if (!s || (s == 2 && ch != ',')) + return -EINVAL; + if (hi <= last) + return -EINVAL; + last = hi; + (*histogram_boundaries)[n] = hi; + if (s == 1) + return 0; + h = strchr(h, ',') + 1; + n++; + } +} + +static int message_stats_create(struct mapped_device *md, + unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r; + int id; + char dummy; + unsigned long long start, end, len, step; + unsigned int divisor; + const char *program_id, *aux_data; + unsigned int stat_flags = 0; + + unsigned int n_histogram_entries = 0; + unsigned long long *histogram_boundaries = NULL; + + struct dm_arg_set as, as_backup; + const char *a; + unsigned int feature_args; + + /* + * Input format: + * [ ] [ []] + */ + + if (argc < 3) + goto ret_einval; + + as.argc = argc; + as.argv = argv; + dm_consume_args(&as, 1); + + a = dm_shift_arg(&as); + if (!strcmp(a, "-")) { + start = 0; + len = dm_get_size(md); + if (!len) + len = 1; + } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || + start != (sector_t)start || len != (sector_t)len) + goto ret_einval; + + end = start + len; + if (start >= end) + goto ret_einval; + + a = dm_shift_arg(&as); + if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { + if (!divisor) + return -EINVAL; + step = end - start; + if (do_div(step, divisor)) + step++; + if (!step) + step = 1; + } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || + step != (sector_t)step || !step) + goto ret_einval; + + as_backup = as; + a = dm_shift_arg(&as); + if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { + while (feature_args--) { + a = dm_shift_arg(&as); + if (!a) + goto ret_einval; + if (!strcasecmp(a, "precise_timestamps")) + stat_flags |= STAT_PRECISE_TIMESTAMPS; + else if (!strncasecmp(a, "histogram:", 10)) { + if (n_histogram_entries) + goto ret_einval; + if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) + goto ret; + } else + goto ret_einval; + } + } else { + as = as_backup; + } + + program_id = "-"; + aux_data = "-"; + + a = dm_shift_arg(&as); + if (a) + program_id = a; + + a = dm_shift_arg(&as); + if (a) + aux_data = a; + + if (as.argc) + goto ret_einval; + + /* + * If a buffer overflow happens after we created the region, + * it's too late (the userspace would retry with a larger + * buffer, but the region id that caused the overflow is already + * leaked). So we must detect buffer overflow in advance. + */ + snprintf(result, maxlen, "%d", INT_MAX); + if (dm_message_test_buffer_overflow(result, maxlen)) { + r = 1; + goto ret; + } + + id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, + n_histogram_entries, histogram_boundaries, program_id, aux_data, + dm_internal_suspend_fast, dm_internal_resume_fast, md); + if (id < 0) { + r = id; + goto ret; + } + + snprintf(result, maxlen, "%d", id); + + r = 1; + goto ret; + +ret_einval: + r = -EINVAL; +ret: + kfree(histogram_boundaries); + return r; +} + +static int message_stats_delete(struct mapped_device *md, + unsigned int argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_delete(dm_get_stats(md), id); +} + +static int message_stats_clear(struct mapped_device *md, + unsigned int argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_clear(dm_get_stats(md), id); +} + +static int message_stats_list(struct mapped_device *md, + unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r; + const char *program = NULL; + + if (argc < 1 || argc > 2) + return -EINVAL; + + if (argc > 1) { + program = kstrdup(argv[1], GFP_KERNEL); + if (!program) + return -ENOMEM; + } + + r = dm_stats_list(dm_get_stats(md), program, result, maxlen); + + kfree(program); + + return r; +} + +static int message_stats_print(struct mapped_device *md, + unsigned int argc, char **argv, bool clear, + char *result, unsigned int maxlen) +{ + int id; + char dummy; + unsigned long idx_start = 0, idx_len = ULONG_MAX; + + if (argc != 2 && argc != 4) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + if (argc > 3) { + if (strcmp(argv[2], "-") && + sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) + return -EINVAL; + if (strcmp(argv[3], "-") && + sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) + return -EINVAL; + } + + return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, + result, maxlen); +} + +static int message_stats_set_aux(struct mapped_device *md, + unsigned int argc, char **argv) +{ + int id; + char dummy; + + if (argc != 3) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); +} + +int dm_stats_message(struct mapped_device *md, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r; + + /* All messages here must start with '@' */ + if (!strcasecmp(argv[0], "@stats_create")) + r = message_stats_create(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_delete")) + r = message_stats_delete(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_clear")) + r = message_stats_clear(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_list")) + r = message_stats_list(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print")) + r = message_stats_print(md, argc, argv, false, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print_clear")) + r = message_stats_print(md, argc, argv, true, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_set_aux")) + r = message_stats_set_aux(md, argc, argv); + else + return 2; /* this wasn't a stats message */ + + if (r == -EINVAL) + DMCRIT("Invalid parameters for message %s", argv[0]); + + return r; +} + +int __init dm_statistics_init(void) +{ + shared_memory_amount = 0; + dm_stat_need_rcu_barrier = 0; + return 0; +} + +void dm_statistics_exit(void) +{ + if (dm_stat_need_rcu_barrier) + rcu_barrier(); + if (WARN_ON(shared_memory_amount)) + DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); +} + +module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); +MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h new file mode 100644 index 000000000..c6728c8b4 --- /dev/null +++ b/drivers/md/dm-stats.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef DM_STATS_H +#define DM_STATS_H + +#include +#include +#include + +int dm_statistics_init(void); +void dm_statistics_exit(void); + +struct dm_stats { + struct mutex mutex; + struct list_head list; /* list of struct dm_stat */ + struct dm_stats_last_position __percpu *last; + bool precise_timestamps; +}; + +struct dm_stats_aux { + bool merged; + unsigned long long duration_ns; +}; + +int dm_stats_init(struct dm_stats *st); +void dm_stats_cleanup(struct dm_stats *st); + +struct mapped_device; + +int dm_stats_message(struct mapped_device *md, unsigned int argc, char **argv, + char *result, unsigned int maxlen); + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned int bi_sectors, bool end, + unsigned long start_time, + struct dm_stats_aux *aux); + +static inline bool dm_stats_used(struct dm_stats *st) +{ + return !list_empty(&st->list); +} + +static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux) +{ + if (unlikely(stats->precise_timestamps)) + aux->duration_ns = ktime_to_ns(ktime_get()); +} + +#endif diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c new file mode 100644 index 000000000..547aefe85 --- /dev/null +++ b/drivers/md/dm-stripe.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "striped" +#define DM_IO_ERROR_THRESHOLD 15 + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; + + atomic_t error_count; +}; + +struct stripe_c { + uint32_t stripes; + int stripes_shift; + + /* The size of this target / num. stripes */ + sector_t stripe_width; + + uint32_t chunk_size; + int chunk_size_shift; + + /* Needed for handling events */ + struct dm_target *ti; + + /* Work struct used for triggering events*/ + struct work_struct trigger_event; + + struct stripe stripe[]; +}; + +/* + * An event is triggered whenever a drive + * drops out of a stripe volume. + */ +static void trigger_event(struct work_struct *work) +{ + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); + dm_table_event(sc->ti->table); +} + +/* + * Parse a single pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + unsigned int stripe, char **argv) +{ + unsigned long long start; + char dummy; + int ret; + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) + return -EINVAL; + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev); + if (ret) + return ret; + + sc->stripe[stripe].physical_start = start; + + return 0; +} + +/* + * Construct a striped mapping. + * [ ]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width, tmp_len; + uint32_t stripes; + uint32_t chunk_size; + int r; + unsigned int i; + + if (argc < 2) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + if (kstrtouint(argv[0], 10, &stripes) || !stripes) { + ti->error = "Invalid stripe count"; + return -EINVAL; + } + + if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { + ti->error = "Invalid chunk_size"; + return -EINVAL; + } + + width = ti->len; + if (sector_div(width, stripes)) { + ti->error = "Target length not divisible by number of stripes"; + return -EINVAL; + } + + tmp_len = width; + if (sector_div(tmp_len, chunk_size)) { + ti->error = "Target length not divisible by chunk size"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? + */ + if (argc != (2 + 2 * stripes)) { + ti->error = "Not enough destinations specified"; + return -EINVAL; + } + + sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL); + if (!sc) { + ti->error = "Memory allocation for striped context failed"; + return -ENOMEM; + } + + INIT_WORK(&sc->trigger_event, trigger_event); + + /* Set pointer to dm target; used in trigger_event */ + sc->ti = ti; + sc->stripes = stripes; + sc->stripe_width = width; + + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else + sc->stripes_shift = __ffs(stripes); + + r = dm_set_target_max_io_len(ti, chunk_size); + if (r) { + kfree(sc); + return r; + } + + ti->num_flush_bios = stripes; + ti->num_discard_bios = stripes; + ti->num_secure_erase_bios = stripes; + ti->num_write_zeroes_bios = stripes; + + sc->chunk_size = chunk_size; + if (chunk_size & (chunk_size - 1)) + sc->chunk_size_shift = -1; + else + sc->chunk_size_shift = __ffs(chunk_size); + + /* + * Get the stripe destinations. + */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "Couldn't parse stripe destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + atomic_set(&(sc->stripe[i].error_count), 0); + } + + ti->private = sc; + + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + flush_work(&sc->trigger_event); + kfree(sc); +} + +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) +{ + sector_t chunk = dm_target_offset(sc->ti, sector); + sector_t chunk_offset; + + if (sc->chunk_size_shift < 0) + chunk_offset = sector_div(chunk, sc->chunk_size); + else { + chunk_offset = chunk & (sc->chunk_size - 1); + chunk >>= sc->chunk_size_shift; + } + + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & (sc->stripes - 1); + chunk >>= sc->stripes_shift; + } + + if (sc->chunk_size_shift < 0) + chunk *= sc->chunk_size; + else + chunk <<= sc->chunk_size_shift; + + *result = chunk + chunk_offset; +} + +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + + /* round down */ + sector = *result; + if (sc->chunk_size_shift < 0) + *result -= sector_div(sector, sc->chunk_size); + else + *result = sector & ~(sector_t)(sc->chunk_size - 1); + + if (target_stripe < stripe) + *result += sc->chunk_size; /* next chunk */ +} + +static int stripe_map_range(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_iter.bi_sector, + target_stripe, &begin); + stripe_map_range_sector(sc, bio_end_sector(bio), + target_stripe, &end); + if (begin < end) { + bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev); + bio->bi_iter.bi_sector = begin + + sc->stripe[target_stripe].physical_start; + bio->bi_iter.bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } +} + +static int stripe_map(struct dm_target *ti, struct bio *bio) +{ + struct stripe_c *sc = ti->private; + uint32_t stripe; + unsigned int target_bio_nr; + + if (bio->bi_opf & REQ_PREFLUSH) { + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev); + return DM_MAPIO_REMAPPED; + } + if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || + unlikely(bio_op(bio) == REQ_OP_SECURE_ERASE) || + unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) { + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + return stripe_map_range(sc, bio, target_bio_nr); + } + + stripe_map_sector(sc, bio->bi_iter.bi_sector, + &stripe, &bio->bi_iter.bi_sector); + + bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; + bio_set_dev(bio, sc->stripe[stripe].dev->bdev); + + return DM_MAPIO_REMAPPED; +} + +#if IS_ENABLED(CONFIG_FS_DAX) +static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) +{ + struct stripe_c *sc = ti->private; + struct block_device *bdev; + sector_t dev_sector; + uint32_t stripe; + + stripe_map_sector(sc, *pgoff * PAGE_SECTORS, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + bdev = sc->stripe[stripe].dev->bdev; + + *pgoff = (get_start_sect(bdev) + dev_sector) >> PAGE_SECTORS_SHIFT; + return sc->stripe[stripe].dev->dax_dev; +} + +static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); +} + +static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, + size_t nr_pages) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_zero_page_range(dax_dev, pgoff, nr_pages); +} + +static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + +#else +#define stripe_dax_direct_access NULL +#define stripe_dax_zero_page_range NULL +#define stripe_dax_recovery_write NULL +#endif + +/* + * Stripe status: + * + * INFO + * #stripes [stripe_name ] [group word count] + * [error count 'A|D' ] + * + * TABLE + * #stripes [stripe chunk size] + * [stripe_name physical_start ] + * + */ + +static void stripe_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + unsigned int sz = 0; + unsigned int i; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", sc->stripes); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%s ", sc->stripe[i].dev->name); + } + DMEMIT("1 "); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%c", atomic_read(&(sc->stripe[i].error_count)) ? + 'D' : 'A'); + } + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu", sc->stripes, + (unsigned long long)sc->chunk_size); + for (i = 0; i < sc->stripes; i++) + DMEMIT(" %s %llu", sc->stripe[i].dev->name, + (unsigned long long)sc->stripe[i].physical_start); + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",stripes=%d,chunk_size=%llu", sc->stripes, + (unsigned long long)sc->chunk_size); + + for (i = 0; i < sc->stripes; i++) { + DMEMIT(",stripe_%d_device_name=%s", i, sc->stripe[i].dev->name); + DMEMIT(",stripe_%d_physical_start=%llu", i, + (unsigned long long)sc->stripe[i].physical_start); + DMEMIT(",stripe_%d_status=%c", i, + atomic_read(&(sc->stripe[i].error_count)) ? 'D' : 'A'); + } + DMEMIT(";"); + break; + } +} + +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) +{ + unsigned int i; + char major_minor[16]; + struct stripe_c *sc = ti->private; + + if (!*error) + return DM_ENDIO_DONE; /* I/O complete */ + + if (bio->bi_opf & REQ_RAHEAD) + return DM_ENDIO_DONE; + + if (*error == BLK_STS_NOTSUPP) + return DM_ENDIO_DONE; + + memset(major_minor, 0, sizeof(major_minor)); + sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); + + /* + * Test to see which stripe drive triggered the event + * and increment error count for all stripes on that device. + * If the error count for a given device exceeds the threshold + * value we will no longer trigger any further events. + */ + for (i = 0; i < sc->stripes; i++) + if (!strcmp(sc->stripe[i].dev->name, major_minor)) { + atomic_inc(&(sc->stripe[i].error_count)); + if (atomic_read(&(sc->stripe[i].error_count)) < + DM_IO_ERROR_THRESHOLD) + schedule_work(&sc->trigger_event); + } + + return DM_ENDIO_DONE; +} + +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned int i = 0; + + do { + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, + sc->stripe_width, data); + } while (!ret && ++i < sc->stripes); + + return ret; +} + +static void stripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct stripe_c *sc = ti->private; + unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * sc->stripes); +} + +static struct target_type stripe_target = { + .name = "striped", + .version = {1, 6, 0}, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .end_io = stripe_end_io, + .status = stripe_status, + .iterate_devices = stripe_iterate_devices, + .io_hints = stripe_io_hints, + .direct_access = stripe_dax_direct_access, + .dax_zero_page_range = stripe_dax_zero_page_range, + .dax_recovery_write = stripe_dax_recovery_write, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) + DMWARN("target registration failed"); + + return r; +} + +void dm_stripe_exit(void) +{ + dm_unregister_target(&stripe_target); +} diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c new file mode 100644 index 000000000..f734b5a09 --- /dev/null +++ b/drivers/md/dm-switch.c @@ -0,0 +1,592 @@ +/* + * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. + * Copyright (C) 2011-2013 Red Hat, Inc. + * + * This file is released under the GPL. + * + * dm-switch is a device-mapper target that maps IO to underlying block + * devices efficiently when there are a large number of fixed-sized + * address regions but there is no simple pattern to allow for a compact + * mapping representation such as dm-stripe. + */ + +#include + +#include +#include +#include + +#define DM_MSG_PREFIX "switch" + +/* + * One region_table_slot_t holds region table + * entries each of which is in size. + */ +typedef unsigned long region_table_slot_t; + +/* + * A device with the offset to its start sector. + */ +struct switch_path { + struct dm_dev *dmdev; + sector_t start; +}; + +/* + * Context block for a dm switch device. + */ +struct switch_ctx { + struct dm_target *ti; + + unsigned int nr_paths; /* Number of paths in path_list. */ + + unsigned int region_size; /* Region size in 512-byte sectors */ + unsigned long nr_regions; /* Number of regions making up the device */ + signed char region_size_bits; /* log2 of region_size or -1 */ + + unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ + unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ + signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ + + region_table_slot_t *region_table; /* Region table */ + + /* + * Array of dm devices to switch between. + */ + struct switch_path path_list[]; +}; + +static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned int nr_paths, + unsigned int region_size) +{ + struct switch_ctx *sctx; + + sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL); + if (!sctx) + return NULL; + + sctx->ti = ti; + sctx->region_size = region_size; + + ti->private = sctx; + + return sctx; +} + +static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) +{ + struct switch_ctx *sctx = ti->private; + sector_t nr_regions = ti->len; + sector_t nr_slots; + + if (!(sctx->region_size & (sctx->region_size - 1))) + sctx->region_size_bits = __ffs(sctx->region_size); + else + sctx->region_size_bits = -1; + + sctx->region_table_entry_bits = 1; + while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && + (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) + sctx->region_table_entry_bits++; + + sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; + if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) + sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); + else + sctx->region_entries_per_slot_bits = -1; + + if (sector_div(nr_regions, sctx->region_size)) + nr_regions++; + + if (nr_regions >= ULONG_MAX) { + ti->error = "Region table too large"; + return -EINVAL; + } + sctx->nr_regions = nr_regions; + + nr_slots = nr_regions; + if (sector_div(nr_slots, sctx->region_entries_per_slot)) + nr_slots++; + + if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { + ti->error = "Region table too large"; + return -EINVAL; + } + + sctx->region_table = vmalloc(array_size(nr_slots, + sizeof(region_table_slot_t))); + if (!sctx->region_table) { + ti->error = "Cannot allocate region table"; + return -ENOMEM; + } + + return 0; +} + +static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, + unsigned long *region_index, unsigned int *bit) +{ + if (sctx->region_entries_per_slot_bits >= 0) { + *region_index = region_nr >> sctx->region_entries_per_slot_bits; + *bit = region_nr & (sctx->region_entries_per_slot - 1); + } else { + *region_index = region_nr / sctx->region_entries_per_slot; + *bit = region_nr % sctx->region_entries_per_slot; + } + + *bit *= sctx->region_table_entry_bits; +} + +static unsigned int switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) +{ + unsigned long region_index; + unsigned int bit; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + return (READ_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); +} + +/* + * Find which path to use at given offset. + */ +static unsigned int switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) +{ + unsigned int path_nr; + sector_t p; + + p = offset; + if (sctx->region_size_bits >= 0) + p >>= sctx->region_size_bits; + else + sector_div(p, sctx->region_size); + + path_nr = switch_region_table_read(sctx, p); + + /* This can only happen if the processor uses non-atomic stores. */ + if (unlikely(path_nr >= sctx->nr_paths)) + path_nr = 0; + + return path_nr; +} + +static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, + unsigned int value) +{ + unsigned long region_index; + unsigned int bit; + region_table_slot_t pte; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + pte = sctx->region_table[region_index]; + pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); + pte |= (region_table_slot_t)value << bit; + sctx->region_table[region_index] = pte; +} + +/* + * Fill the region table with an initial round robin pattern. + */ +static void initialise_region_table(struct switch_ctx *sctx) +{ + unsigned int path_nr = 0; + unsigned long region_nr; + + for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { + switch_region_table_write(sctx, region_nr, path_nr); + if (++path_nr >= sctx->nr_paths) + path_nr = 0; + } +} + +static int parse_path(struct dm_arg_set *as, struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + unsigned long long start; + int r; + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &sctx->path_list[sctx->nr_paths].dmdev); + if (r) { + ti->error = "Device lookup failed"; + return r; + } + + if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { + ti->error = "Invalid device starting offset"; + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + return -EINVAL; + } + + sctx->path_list[sctx->nr_paths].start = start; + + sctx->nr_paths++; + + return 0; +} + +/* + * Destructor: Don't free the dm_target, just the ti->private data (if any). + */ +static void switch_dtr(struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + + while (sctx->nr_paths--) + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + + vfree(sctx->region_table); + kfree(sctx); +} + +/* + * Constructor arguments: + * [...] + * [ ]+ + * + * Optional args are to allow for future extension: currently this + * parameter must be 0. + */ +static int switch_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + static const struct dm_arg _args[] = { + {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, + {1, UINT_MAX, "Invalid region size"}, + {0, 0, "Invalid number of optional args"}, + }; + + struct switch_ctx *sctx; + struct dm_arg_set as; + unsigned int nr_paths, region_size, nr_optional_args; + int r; + + as.argc = argc; + as.argv = argv; + + r = dm_read_arg(_args, &as, &nr_paths, &ti->error); + if (r) + return -EINVAL; + + r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error); + if (r) + return r; + + r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); + if (r) + return r; + /* parse optional arguments here, if we add any */ + + if (as.argc != nr_paths * 2) { + ti->error = "Incorrect number of path arguments"; + return -EINVAL; + } + + sctx = alloc_switch_ctx(ti, nr_paths, region_size); + if (!sctx) { + ti->error = "Cannot allocate redirection context"; + return -ENOMEM; + } + + r = dm_set_target_max_io_len(ti, region_size); + if (r) + goto error; + + while (as.argc) { + r = parse_path(&as, ti); + if (r) + goto error; + } + + r = alloc_region_table(ti, nr_paths); + if (r) + goto error; + + initialise_region_table(sctx); + + /* For UNMAP, sending the request down any path is sufficient */ + ti->num_discard_bios = 1; + + return 0; + +error: + switch_dtr(ti); + + return r; +} + +static int switch_map(struct dm_target *ti, struct bio *bio) +{ + struct switch_ctx *sctx = ti->private; + sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); + unsigned int path_nr = switch_get_path_nr(sctx, offset); + + bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); + bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; + + return DM_MAPIO_REMAPPED; +} + +/* + * We need to parse hex numbers in the message as quickly as possible. + * + * This table-based hex parser improves performance. + * It improves a time to load 1000000 entries compared to the condition-based + * parser. + * table-based parser condition-based parser + * PA-RISC 0.29s 0.31s + * Opteron 0.0495s 0.0498s + */ +static const unsigned char hex_table[256] = { +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 +}; + +static __always_inline unsigned long parse_hex(const char **string) +{ + unsigned char d; + unsigned long r = 0; + + while ((d = hex_table[(unsigned char)**string]) < 16) { + r = (r << 4) | d; + (*string)++; + } + + return r; +} + +static int process_set_region_mappings(struct switch_ctx *sctx, + unsigned int argc, char **argv) +{ + unsigned int i; + unsigned long region_index = 0; + + for (i = 1; i < argc; i++) { + unsigned long path_nr; + const char *string = argv[i]; + + if ((*string & 0xdf) == 'R') { + unsigned long cycle_length, num_write; + + string++; + if (unlikely(*string == ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + cycle_length = parse_hex(&string); + if (unlikely(*string != ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + num_write = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { + DMWARN("invalid set_region_mappings cycle length: %lu > %lu", + cycle_length - 1, region_index); + return -EINVAL; + } + if (unlikely(region_index + num_write < region_index) || + unlikely(region_index + num_write >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", + region_index, num_write, sctx->nr_regions); + return -EINVAL; + } + + while (num_write--) { + region_index++; + path_nr = switch_region_table_read(sctx, region_index - cycle_length); + switch_region_table_write(sctx, region_index, path_nr); + } + + continue; + } + + if (*string == ':') + region_index++; + else { + region_index = parse_hex(&string); + if (unlikely(*string != ':')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + } + + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + path_nr = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + if (unlikely(region_index >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); + return -EINVAL; + } + if (unlikely(path_nr >= sctx->nr_paths)) { + DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); + return -EINVAL; + } + + switch_region_table_write(sctx, region_index, path_nr); + } + + return 0; +} + +/* + * Messages are processed one-at-a-time. + * + * Only set_region_mappings is supported. + */ +static int switch_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + static DEFINE_MUTEX(message_mutex); + + struct switch_ctx *sctx = ti->private; + int r = -EINVAL; + + mutex_lock(&message_mutex); + + if (!strcasecmp(argv[0], "set_region_mappings")) + r = process_set_region_mappings(sctx, argc, argv); + else + DMWARN("Unrecognised message received."); + + mutex_unlock(&message_mutex); + + return r; +} + +static void switch_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct switch_ctx *sctx = ti->private; + unsigned int sz = 0; + int path_nr; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) + DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, + (unsigned long long)sctx->path_list[path_nr].start); + break; + + case STATUSTYPE_IMA: + result[0] = '\0'; + break; + } +} + +/* + * Switch ioctl: + * + * Passthrough all ioctls to the path for sector 0 + */ +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct switch_ctx *sctx = ti->private; + unsigned int path_nr; + + path_nr = switch_get_path_nr(sctx, 0); + + *bdev = sctx->path_list[path_nr].dmdev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (ti->len + sctx->path_list[path_nr].start != + bdev_nr_sectors((*bdev))) + return 1; + return 0; +} + +static int switch_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct switch_ctx *sctx = ti->private; + int path_nr; + int r; + + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { + r = fn(ti, sctx->path_list[path_nr].dmdev, + sctx->path_list[path_nr].start, ti->len, data); + if (r) + return r; + } + + return 0; +} + +static struct target_type switch_target = { + .name = "switch", + .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, + .module = THIS_MODULE, + .ctr = switch_ctr, + .dtr = switch_dtr, + .map = switch_map, + .message = switch_message, + .status = switch_status, + .prepare_ioctl = switch_prepare_ioctl, + .iterate_devices = switch_iterate_devices, +}; + +static int __init dm_switch_init(void) +{ + int r; + + r = dm_register_target(&switch_target); + if (r < 0) + DMERR("dm_register_target() failed %d", r); + + return r; +} + +static void __exit dm_switch_exit(void) +{ + dm_unregister_target(&switch_target); +} + +module_init(dm_switch_init); +module_exit(dm_switch_exit); + +MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); +MODULE_AUTHOR("Kevin D. O'Kelley "); +MODULE_AUTHOR("Narendran Ganapathy "); +MODULE_AUTHOR("Jim Ramsay "); +MODULE_AUTHOR("Mikulas Patocka "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c new file mode 100644 index 000000000..e28c92478 --- /dev/null +++ b/drivers/md/dm-sysfs.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include +#include +#include "dm-core.h" +#include "dm-rq.h" + +struct dm_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct mapped_device *, char *); + ssize_t (*store)(struct mapped_device *, const char *, size_t count); +}; + +#define DM_ATTR_RO(_name) \ +struct dm_sysfs_attr dm_attr_##_name = \ + __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL) + +static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct dm_sysfs_attr *dm_attr; + struct mapped_device *md; + ssize_t ret; + + dm_attr = container_of(attr, struct dm_sysfs_attr, attr); + if (!dm_attr->show) + return -EIO; + + md = dm_get_from_kobject(kobj); + if (!md) + return -EINVAL; + + ret = dm_attr->show(md, page); + dm_put(md); + + return ret; +} + +#define DM_ATTR_RW(_name) \ +struct dm_sysfs_attr dm_attr_##_name = \ + __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store) + +static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct dm_sysfs_attr *dm_attr; + struct mapped_device *md; + ssize_t ret; + + dm_attr = container_of(attr, struct dm_sysfs_attr, attr); + if (!dm_attr->store) + return -EIO; + + md = dm_get_from_kobject(kobj); + if (!md) + return -EINVAL; + + ret = dm_attr->store(md, page, count); + dm_put(md); + + return ret; +} + +static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, buf, NULL)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, NULL, buf)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_suspended_md(md)); + + return strlen(buf); +} + +static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) +{ + /* Purely for userspace compatibility */ + sprintf(buf, "%d\n", true); + + return strlen(buf); +} + +static DM_ATTR_RO(name); +static DM_ATTR_RO(uuid); +static DM_ATTR_RO(suspended); +static DM_ATTR_RO(use_blk_mq); +static DM_ATTR_RW(rq_based_seq_io_merge_deadline); + +static struct attribute *dm_attrs[] = { + &dm_attr_name.attr, + &dm_attr_uuid.attr, + &dm_attr_suspended.attr, + &dm_attr_use_blk_mq.attr, + &dm_attr_rq_based_seq_io_merge_deadline.attr, + NULL, +}; +ATTRIBUTE_GROUPS(dm); + +static const struct sysfs_ops dm_sysfs_ops = { + .show = dm_attr_show, + .store = dm_attr_store, +}; + +static struct kobj_type dm_ktype = { + .sysfs_ops = &dm_sysfs_ops, + .default_groups = dm_groups, + .release = dm_kobject_release, +}; + +/* + * Initialize kobj + * because nobody using md yet, no need to call explicit dm_get/put + */ +int dm_sysfs_init(struct mapped_device *md) +{ + return kobject_init_and_add(dm_kobject(md), &dm_ktype, + &disk_to_dev(dm_disk(md))->kobj, + "%s", "dm"); +} + +/* + * Remove kobj, called after all references removed + */ +void dm_sysfs_exit(struct mapped_device *md) +{ + struct kobject *kobj = dm_kobject(md); + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); +} diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 000000000..dac6a5f25 --- /dev/null +++ b/drivers/md/dm-table.c @@ -0,0 +1,2163 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-core.h" +#include "dm-rq.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "table" + +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned int n, unsigned int base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, + unsigned int l, unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, unsigned int num) +{ + sector_t *n_highs; + struct dm_target *n_targets; + + /* + * Allocate both the target array and offset array at once. + */ + n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t), + GFP_KERNEL); + if (!n_highs) + return -ENOMEM; + + n_targets = (struct dm_target *) (n_highs + num); + + memset(n_highs, -1, sizeof(*n_highs) * num); + kvfree(t->highs); + + t->num_allocated = num; + t->highs = n_highs; + t->targets = n_targets; + + return 0; +} + +int dm_table_create(struct dm_table **result, fmode_t mode, + unsigned int num_targets, struct mapped_device *md) +{ + struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); + + if (!t) + return -ENOMEM; + + INIT_LIST_HEAD(&t->devices); + init_rwsem(&t->devices_lock); + + if (!num_targets) + num_targets = KEYS_PER_NODE; + + num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + + if (alloc_targets(t, num_targets)) { + kfree(t); + return -ENOMEM; + } + + t->type = DM_TYPE_NONE; + t->mode = mode; + t->md = md; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices, struct mapped_device *md) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct dm_dev_internal *dd = + list_entry(tmp, struct dm_dev_internal, list); + DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s", + dm_device_name(md), dd->dm_dev->name); + dm_put_table_device(md, dd->dm_dev); + kfree(dd); + } +} + +static void dm_table_destroy_crypto_profile(struct dm_table *t); + +void dm_table_destroy(struct dm_table *t) +{ + if (!t) + return; + + /* free the indexes */ + if (t->depth >= 2) + kvfree(t->index[t->depth - 2]); + + /* free the targets */ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->type->dtr) + ti->type->dtr(ti); + + dm_put_target_type(ti->type); + } + + kvfree(t->highs); + + /* free the device list */ + free_devices(&t->devices, t->md); + + dm_free_md_mempools(t->mempools); + + dm_table_destroy_crypto_profile(t); + + kfree(t); +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) +{ + struct dm_dev_internal *dd; + + list_for_each_entry(dd, l, list) + if (dd->dm_dev->bdev->bd_dev == dev) + return dd; + + return NULL; +} + +/* + * If possible, this checks an area of a destination device is invalid. + */ +static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + sector_t dev_size = bdev_nr_sectors(bdev); + unsigned short logical_block_size_sectors = + limits->logical_block_size >> SECTOR_SHIFT; + + if (!dev_size) + return 0; + + if ((start >= dev_size) || (start + len > dev_size)) { + DMERR("%s: %pg too small for target: start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdev, + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } + + /* + * If the target is mapped to zoned block device(s), check + * that the zones are not partially mapped. + */ + if (bdev_is_zoned(bdev)) { + unsigned int zone_sectors = bdev_zone_sectors(bdev); + + if (start & (zone_sectors - 1)) { + DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)start, + zone_sectors, bdev); + return 1; + } + + /* + * Note: The last zone of a zoned block device may be smaller + * than other zones. So for a target mapping the end of a + * zoned block device with such a zone, len would not be zone + * aligned. We do not allow such last smaller zone to be part + * of the mapping here to ensure that mappings with multiple + * devices do not end up with a smaller zone in the middle of + * the sector range. + */ + if (len & (zone_sectors - 1)) { + DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)len, + zone_sectors, bdev); + return 1; + } + } + + if (logical_block_size_sectors <= 1) + return 0; + + if (start & (logical_block_size_sectors - 1)) { + DMERR("%s: start=%llu not aligned to h/w logical block size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)start, + limits->logical_block_size, bdev); + return 1; + } + + if (len & (logical_block_size_sectors - 1)) { + DMERR("%s: len=%llu not aligned to h/w logical block size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)len, + limits->logical_block_size, bdev); + return 1; + } + + return 0; +} + +/* + * This upgrades the mode on an already open dm_dev, being + * careful to leave things as they were if we fail to reopen the + * device and not to touch the existing bdev field in case + * it is accessed concurrently. + */ +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, + struct mapped_device *md) +{ + int r; + struct dm_dev *old_dev, *new_dev; + + old_dev = dd->dm_dev; + + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, + dd->dm_dev->mode | new_mode, &new_dev); + if (r) + return r; + + dd->dm_dev = new_dev; + dm_put_table_device(md, old_dev); + + return 0; +} + +/* + * Convert the path to a device + */ +dev_t dm_get_dev_t(const char *path) +{ + dev_t dev; + + if (lookup_bdev(path, &dev)) + dev = name_to_dev_t(path); + return dev; +} +EXPORT_SYMBOL_GPL(dm_get_dev_t); + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) +{ + int r; + dev_t dev; + unsigned int major, minor; + char dummy; + struct dm_dev_internal *dd; + struct dm_table *t = ti->table; + + BUG_ON(!t); + + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { + /* Extract the major/minor numbers */ + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) + return -EOVERFLOW; + } else { + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; + } + + down_write(&t->devices_lock); + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) { + r = -ENOMEM; + goto unlock_ret_r; + } + + if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { + kfree(dd); + goto unlock_ret_r; + } + + refcount_set(&dd->count, 1); + list_add(&dd->list, &t->devices); + goto out; + + } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { + r = upgrade_mode(dd, mode, t->md); + if (r) + goto unlock_ret_r; + } + refcount_inc(&dd->count); +out: + up_write(&t->devices_lock); + *result = dd->dm_dev; + return 0; + +unlock_ret_r: + up_write(&t->devices_lock); + return r; +} +EXPORT_SYMBOL(dm_get_device); + +static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + if (unlikely(!q)) { + DMWARN("%s: Cannot set limits for nonexistent device %pg", + dm_device_name(ti->table->md), bdev); + return 0; + } + + if (blk_stack_limits(limits, &q->limits, + get_start_sect(bdev) + start) < 0) + DMWARN("%s: adding target device %pg caused an alignment inconsistency: " + "physical_block_size=%u, logical_block_size=%u, " + "alignment_offset=%u, start=%llu", + dm_device_name(ti->table->md), bdev, + q->limits.physical_block_size, + q->limits.logical_block_size, + q->limits.alignment_offset, + (unsigned long long) start << SECTOR_SHIFT); + return 0; +} + +/* + * Decrement a device's use count and remove it if necessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *d) +{ + int found = 0; + struct dm_table *t = ti->table; + struct list_head *devices = &t->devices; + struct dm_dev_internal *dd; + + down_write(&t->devices_lock); + + list_for_each_entry(dd, devices, list) { + if (dd->dm_dev == d) { + found = 1; + break; + } + } + if (!found) { + DMERR("%s: device %s not in table devices list", + dm_device_name(t->md), d->name); + goto unlock_ret; + } + if (refcount_dec_and_test(&dd->count)) { + dm_put_table_device(t->md, d); + list_del(&dd->list); + kfree(dd); + } + +unlock_ret: + up_write(&t->devices_lock); +} +EXPORT_SYMBOL(dm_put_device); + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *t, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!t->num_targets) + return !ti->begin; + + prev = &t->targets[t->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. + * + * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must + * process messages even if some device is suspended. These messages have a + * small fixed number of arguments. + * + * On the other hand, dm-switch needs to process bulk data using messages and + * excessive use of GFP_NOIO could cause trouble. + */ +static char **realloc_argv(unsigned int *size, char **old_argv) +{ + char **argv; + unsigned int new_size; + gfp_t gfp; + + if (*size) { + new_size = *size * 2; + gfp = GFP_KERNEL; + } else { + new_size = 8; + gfp = GFP_NOIO; + } + argv = kmalloc_array(new_size, sizeof(*argv), gfp); + if (argv && old_argv) { + memcpy(argv, old_argv, *size * sizeof(*argv)); + *size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +int dm_split_args(int *argc, char ***argvp, char *input) +{ + char *start, *end = input, *out, **argv = NULL; + unsigned int array_size = 0; + + *argc = 0; + + if (!input) { + *argvp = NULL; + return 0; + } + + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + + while (1) { + /* Skip whitespace */ + start = skip_spaces(end); + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + *argvp = argv; + return 0; +} + +/* + * Impose necessary and sufficient conditions on a devices's table such + * that any incoming bio which respects its logical_block_size can be + * processed successfully. If it falls across the boundary between + * two or more targets, the size of each piece it gets split into must + * be compatible with the logical_block_size of the target processing it. + */ +static int validate_hardware_logical_block_alignment(struct dm_table *t, + struct queue_limits *limits) +{ + /* + * This function uses arithmetic modulo the logical_block_size + * (in units of 512-byte sectors). + */ + unsigned short device_logical_block_size_sects = + limits->logical_block_size >> SECTOR_SHIFT; + + /* + * Offset of the start of the next table entry, mod logical_block_size. + */ + unsigned short next_target_start = 0; + + /* + * Given an aligned bio that extends beyond the end of a + * target, how many sectors must the next target handle? + */ + unsigned short remaining = 0; + + struct dm_target *ti; + struct queue_limits ti_limits; + unsigned int i; + + /* + * Check each entry in the table in turn. + */ + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + + blk_set_stacking_limits(&ti_limits); + + /* combine all target devices' limits */ + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * If the remaining sectors fall entirely within this + * table entry are they compatible with its logical_block_size? + */ + if (remaining < ti->len && + remaining & ((ti_limits.logical_block_size >> + SECTOR_SHIFT) - 1)) + break; /* Error */ + + next_target_start = + (unsigned short) ((next_target_start + ti->len) & + (device_logical_block_size_sects - 1)); + remaining = next_target_start ? + device_logical_block_size_sects - next_target_start : 0; + } + + if (remaining) { + DMERR("%s: table line %u (start sect %llu len %llu) " + "not aligned to h/w logical block size %u", + dm_device_name(t->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + limits->logical_block_size); + return -EINVAL; + } + + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char **argv; + struct dm_target *ti; + + if (t->singleton) { + DMERR("%s: target type %s must appear alone in table", + dm_device_name(t->md), t->targets->type->name); + return -EINVAL; + } + + BUG_ON(t->num_targets >= t->num_allocated); + + ti = t->targets + t->num_targets; + memset(ti, 0, sizeof(*ti)); + + if (!len) { + DMERR("%s: zero-length target", dm_device_name(t->md)); + return -EINVAL; + } + + ti->type = dm_get_target_type(type); + if (!ti->type) { + DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); + return -EINVAL; + } + + if (dm_target_needs_singleton(ti->type)) { + if (t->num_targets) { + ti->error = "singleton target type must appear alone in table"; + goto bad; + } + t->singleton = true; + } + + if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { + ti->error = "target type may not be included in a read-only table"; + goto bad; + } + + if (t->immutable_target_type) { + if (t->immutable_target_type != ti->type) { + ti->error = "immutable target type cannot be mixed with other target types"; + goto bad; + } + } else if (dm_target_is_immutable(ti->type)) { + if (t->num_targets) { + ti->error = "immutable target type cannot be mixed with other target types"; + goto bad; + } + t->immutable_target_type = ti->type; + } + + if (dm_target_has_integrity(ti->type)) + t->integrity_added = 1; + + ti->table = t; + ti->begin = start; + ti->len = len; + ti->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, ti)) { + ti->error = "Gap in table"; + goto bad; + } + + r = dm_split_args(&argc, &argv, params); + if (r) { + ti->error = "couldn't split parameters"; + goto bad; + } + + r = ti->type->ctr(ti, argc, argv); + kfree(argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = ti->begin + ti->len - 1; + + if (!ti->num_discard_bios && ti->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", + dm_device_name(t->md), type); + + if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + static_branch_enable(&swap_bios_enabled); + + return 0; + + bad: + DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r)); + dm_put_target_type(ti->type); + return r; +} + +/* + * Target argument parsing helpers. + */ +static int validate_next_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned int *value, char **error, unsigned int grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + char dummy; + + if (!arg_str || + (sscanf(arg_str, "%u%c", value, &dummy) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned int *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned int *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned int num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + +static bool __table_type_bio_based(enum dm_queue_mode table_type) +{ + return (table_type == DM_TYPE_BIO_BASED || + table_type == DM_TYPE_DAX_BIO_BASED); +} + +static bool __table_type_request_based(enum dm_queue_mode table_type) +{ + return table_type == DM_TYPE_REQUEST_BASED; +} + +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) +{ + t->type = type; +} +EXPORT_SYMBOL_GPL(dm_table_set_type); + +/* validate the dax capability of the target device span */ +static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + if (dev->dax_dev) + return false; + + DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev); + return true; +} + +/* Check devices support synchronous DAX */ +static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return !dev->dax_dev || !dax_synchronous(dev->dax_dev); +} + +static bool dm_table_supports_dax(struct dm_table *t, + iterate_devices_callout_fn iterate_fn) +{ + /* Ensure that all targets support DAX. */ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->type->direct_access) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, iterate_fn, NULL)) + return false; + } + + return true; +} + +static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + /* request-based cannot stack on partitions! */ + if (bdev_is_partition(bdev)) + return false; + + return queue_is_mq(q); +} + +static int dm_table_determine_type(struct dm_table *t) +{ + unsigned int bio_based = 0, request_based = 0, hybrid = 0; + struct dm_target *ti; + struct list_head *devices = dm_table_get_devices(t); + enum dm_queue_mode live_md_type = dm_get_md_type(t->md); + + if (t->type != DM_TYPE_NONE) { + /* target already set the table's type */ + if (t->type == DM_TYPE_BIO_BASED) { + /* possibly upgrade to a variant of bio-based */ + goto verify_bio_based; + } + BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); + goto verify_rq_based; + } + + for (unsigned int i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (dm_target_hybrid(ti)) + hybrid = 1; + else if (dm_target_request_based(ti)) + request_based = 1; + else + bio_based = 1; + + if (bio_based && request_based) { + DMERR("Inconsistent table: different target types can't be mixed up"); + return -EINVAL; + } + } + + if (hybrid && !bio_based && !request_based) { + /* + * The targets can work either way. + * Determine the type from the live device. + * Default to bio-based if device is new. + */ + if (__table_type_request_based(live_md_type)) + request_based = 1; + else + bio_based = 1; + } + + if (bio_based) { +verify_bio_based: + /* We must use this table as bio-based */ + t->type = DM_TYPE_BIO_BASED; + if (dm_table_supports_dax(t, device_not_dax_capable) || + (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { + t->type = DM_TYPE_DAX_BIO_BASED; + } + return 0; + } + + BUG_ON(!request_based); /* No targets in this table */ + + t->type = DM_TYPE_REQUEST_BASED; + +verify_rq_based: + /* + * Request-based dm supports only tables that have a single target now. + * To support multiple targets, request splitting support is needed, + * and that needs lots of changes in the block-layer. + * (e.g. request completion process for partial completion.) + */ + if (t->num_targets > 1) { + DMERR("request-based DM doesn't support multiple targets"); + return -EINVAL; + } + + if (list_empty(devices)) { + int srcu_idx; + struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); + + /* inherit live table's type */ + if (live_table) + t->type = live_table->type; + dm_put_live_table(t->md, srcu_idx); + return 0; + } + + ti = dm_table_get_immutable_target(t); + if (!ti) { + DMERR("table load rejected: immutable target is required"); + return -EINVAL; + } else if (ti->max_io_len) { + DMERR("table load rejected: immutable target that splits IO is not supported"); + return -EINVAL; + } + + /* Non-request-stackable devices can't be used for request-based dm */ + if (!ti->type->iterate_devices || + !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { + DMERR("table load rejected: including non-request-stackable devices"); + return -EINVAL; + } + + return 0; +} + +enum dm_queue_mode dm_table_get_type(struct dm_table *t) +{ + return t->type; +} + +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) +{ + return t->immutable_target_type; +} + +struct dm_target *dm_table_get_immutable_target(struct dm_table *t) +{ + /* Immutable target is implicitly a singleton */ + if (t->num_targets > 1 || + !dm_target_is_immutable(t->targets[0].type)) + return NULL; + + return t->targets; +} + +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (dm_target_is_wildcard(ti->type)) + return ti; + } + + return NULL; +} + +bool dm_table_bio_based(struct dm_table *t) +{ + return __table_type_bio_based(dm_table_get_type(t)); +} + +bool dm_table_request_based(struct dm_table *t) +{ + return __table_type_request_based(dm_table_get_type(t)); +} + +static bool dm_table_supports_poll(struct dm_table *t); + +static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) +{ + enum dm_queue_mode type = dm_table_get_type(t); + unsigned int per_io_data_size = 0, front_pad, io_front_pad; + unsigned int min_pool_size = 0, pool_size; + struct dm_md_mempools *pools; + + if (unlikely(type == DM_TYPE_NONE)) { + DMERR("no table type is set, can't allocate mempools"); + return -EINVAL; + } + + pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); + if (!pools) + return -ENOMEM; + + if (type == DM_TYPE_REQUEST_BASED) { + pool_size = dm_get_reserved_rq_based_ios(); + front_pad = offsetof(struct dm_rq_clone_bio_info, clone); + goto init_bs; + } + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + per_io_data_size = max(per_io_data_size, ti->per_io_data_size); + min_pool_size = max(min_pool_size, ti->num_flush_bios); + } + pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); + front_pad = roundup(per_io_data_size, + __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; + + io_front_pad = roundup(per_io_data_size, + __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, + dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->io_bs, pool_size)) + goto out_free_pools; +init_bs: + if (bioset_init(&pools->bs, pool_size, front_pad, 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->bs, pool_size)) + goto out_free_pools; + + t->mempools = pools; + return 0; + +out_free_pools: + dm_free_md_mempools(pools); + return -ENOMEM; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +static int dm_table_build_index(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +static bool integrity_profile_exists(struct gendisk *disk) +{ + return !!blk_get_integrity(disk); +} + +/* + * Get a disk whose integrity profile reflects the table's profile. + * Returns NULL if integrity support was inconsistent or unavailable. + */ +static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd = NULL; + struct gendisk *prev_disk = NULL, *template_disk = NULL; + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_passes_integrity(ti->type)) + goto no_integrity; + } + + list_for_each_entry(dd, devices, list) { + template_disk = dd->dm_dev->bdev->bd_disk; + if (!integrity_profile_exists(template_disk)) + goto no_integrity; + else if (prev_disk && + blk_integrity_compare(prev_disk, template_disk) < 0) + goto no_integrity; + prev_disk = template_disk; + } + + return template_disk; + +no_integrity: + if (prev_disk) + DMWARN("%s: integrity not set: %s and %s profile mismatch", + dm_device_name(t->md), + prev_disk->disk_name, + template_disk->disk_name); + return NULL; +} + +/* + * Register the mapped device for blk_integrity support if the + * underlying devices have an integrity profile. But all devices may + * not have matching profiles (checking all devices isn't reliable + * during table load because this table may use other DM device(s) which + * must be resumed before they will have an initialized integity + * profile). Consequently, stacked DM devices force a 2 stage integrity + * profile validation: First pass during table load, final pass during + * resume. + */ +static int dm_table_register_integrity(struct dm_table *t) +{ + struct mapped_device *md = t->md; + struct gendisk *template_disk = NULL; + + /* If target handles integrity itself do not register it here. */ + if (t->integrity_added) + return 0; + + template_disk = dm_table_get_integrity_disk(t); + if (!template_disk) + return 0; + + if (!integrity_profile_exists(dm_disk(md))) { + t->integrity_supported = true; + /* + * Register integrity profile during table load; we can do + * this because the final profile must match during resume. + */ + blk_integrity_register(dm_disk(md), + blk_get_integrity(template_disk)); + return 0; + } + + /* + * If DM device already has an initialized integrity + * profile the new profile should not conflict. + */ + if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { + DMERR("%s: conflict with existing integrity profile: %s profile mismatch", + dm_device_name(t->md), + template_disk->disk_name); + return 1; + } + + /* Preserve existing integrity profile */ + t->integrity_supported = true; + return 0; +} + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +struct dm_crypto_profile { + struct blk_crypto_profile profile; + struct mapped_device *md; +}; + +static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + const struct blk_crypto_key *key = data; + + blk_crypto_evict_key(dev->bdev, key); + return 0; +} + +/* + * When an inline encryption key is evicted from a device-mapper device, evict + * it from all the underlying devices. + */ +static int dm_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, unsigned int slot) +{ + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; + struct dm_table *t; + int srcu_idx; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return 0; + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_keyslot_evict_callback, + (void *)key); + } + + dm_put_live_table(md, srcu_idx); + return 0; +} + +static int +device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct blk_crypto_profile *parent = data; + struct blk_crypto_profile *child = + bdev_get_queue(dev->bdev)->crypto_profile; + + blk_crypto_intersect_capabilities(parent, child); + return 0; +} + +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) +{ + struct dm_crypto_profile *dmcp = container_of(profile, + struct dm_crypto_profile, + profile); + + if (!profile) + return; + + blk_crypto_profile_destroy(profile); + kfree(dmcp); +} + +static void dm_table_destroy_crypto_profile(struct dm_table *t) +{ + dm_destroy_crypto_profile(t->crypto_profile); + t->crypto_profile = NULL; +} + +/* + * Constructs and initializes t->crypto_profile with a crypto profile that + * represents the common set of crypto capabilities of the devices described by + * the dm_table. However, if the constructed crypto profile doesn't support all + * crypto capabilities that are supported by the current mapped_device, it + * returns an error instead, since we don't support removing crypto capabilities + * on table changes. Finally, if the constructed crypto profile is "empty" (has + * no crypto capabilities at all), it just sets t->crypto_profile to NULL. + */ +static int dm_table_construct_crypto_profile(struct dm_table *t) +{ + struct dm_crypto_profile *dmcp; + struct blk_crypto_profile *profile; + unsigned int i; + bool empty_profile = true; + + dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL); + if (!dmcp) + return -ENOMEM; + dmcp->md = t->md; + + profile = &dmcp->profile; + blk_crypto_profile_init(profile, 0); + profile->ll_ops.keyslot_evict = dm_keyslot_evict; + profile->max_dun_bytes_supported = UINT_MAX; + memset(profile->modes_supported, 0xFF, + sizeof(profile->modes_supported)); + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_passes_crypto(ti->type)) { + blk_crypto_intersect_capabilities(profile, NULL); + break; + } + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, + device_intersect_crypto_capabilities, + profile); + } + + if (t->md->queue && + !blk_crypto_has_capabilities(profile, + t->md->queue->crypto_profile)) { + DMERR("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); + dm_destroy_crypto_profile(profile); + return -EINVAL; + } + + /* + * If the new profile doesn't actually support any crypto capabilities, + * we may as well represent it with a NULL profile. + */ + for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) { + if (profile->modes_supported[i]) { + empty_profile = false; + break; + } + } + + if (empty_profile) { + dm_destroy_crypto_profile(profile); + profile = NULL; + } + + /* + * t->crypto_profile is only set temporarily while the table is being + * set up, and it gets set to NULL after the profile has been + * transferred to the request_queue. + */ + t->crypto_profile = profile; + + return 0; +} + +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) +{ + if (!t->crypto_profile) + return; + + /* Make the crypto profile less restrictive. */ + if (!q->crypto_profile) { + blk_crypto_register(t->crypto_profile, q); + } else { + blk_crypto_update_capabilities(q->crypto_profile, + t->crypto_profile); + dm_destroy_crypto_profile(t->crypto_profile); + } + t->crypto_profile = NULL; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static int dm_table_construct_crypto_profile(struct dm_table *t) +{ + return 0; +} + +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) +{ +} + +static void dm_table_destroy_crypto_profile(struct dm_table *t) +{ +} + +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) +{ +} + +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. + */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_determine_type(t); + if (r) { + DMERR("unable to determine table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_register_integrity(t); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_construct_crypto_profile(t); + if (r) { + DMERR("could not construct crypto profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t, t->md); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + +static DEFINE_MUTEX(_event_lock); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + mutex_lock(&_event_lock); + t->event_fn = fn; + t->event_context = context; + mutex_unlock(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + mutex_lock(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + mutex_unlock(&_event_lock); +} +EXPORT_SYMBOL(dm_table_event); + +inline sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} +EXPORT_SYMBOL(dm_table_get_size); + +/* + * Search the btree for the correct target. + * + * Caller should check returned pointer for NULL + * to trap I/O beyond end of device. + */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + if (unlikely(sector >= dm_table_get_size(t))) + return NULL; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); +} + +/* + * type->iterate_devices() should be called when the sanity check needs to + * iterate and check all underlying data devices. iterate_devices() will + * iterate all underlying data devices until it encounters a non-zero return + * code, returned by whether the input iterate_devices_callout_fn, or + * iterate_devices() itself internally. + * + * For some target type (e.g. dm-stripe), one call of iterate_devices() may + * iterate multiple underlying devices internally, in which case a non-zero + * return code returned by iterate_devices_callout_fn will stop the iteration + * in advance. + * + * Cases requiring _any_ underlying device supporting some kind of attribute, + * should use the iteration structure like dm_table_any_dev_attr(), or call + * it directly. @func should handle semantics of positive examples, e.g. + * capable of something. + * + * Cases requiring _all_ underlying devices supporting some kind of attribute, + * should use the iteration structure like dm_table_supports_nowait() or + * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that + * uses an @anti_func that handle semantics of counter examples, e.g. not + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); + */ +static bool dm_table_any_dev_attr(struct dm_table *t, + iterate_devices_callout_fn func, void *data) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, func, data)) + return true; + } + + return false; +} + +static int count_device(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned int *num_devices = data; + + (*num_devices)++; + + return 0; +} + +static bool dm_table_supports_poll(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) + return false; + } + + return true; +} + +/* + * Check whether a table has no data devices attached using each + * target's iterate_devices method. + * Returns false if the result is unknown because a target doesn't + * support iterate_devices. + */ +bool dm_table_has_no_data_devices(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + unsigned int num_devices = 0; + + if (!ti->type->iterate_devices) + return false; + + ti->type->iterate_devices(ti, count_device, &num_devices); + if (num_devices) + return false; + } + + return true; +} + +static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + enum blk_zoned_model *zoned_model = data; + + return blk_queue_zoned_model(q) != *zoned_model; +} + +/* + * Check the device zoned model based on the target feature flag. If the target + * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are + * also accepted but all devices must have the same zoned model. If the target + * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any + * zoned model with all zoned devices having the same zone size. + */ +static bool dm_table_supports_zoned_model(struct dm_table *t, + enum blk_zoned_model zoned_model) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (dm_target_supports_zoned_hm(ti->type)) { + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_zoned_model, + &zoned_model)) + return false; + } else if (!dm_target_supports_mixed_zoned_model(ti->type)) { + if (zoned_model == BLK_ZONED_HM) + return false; + } + } + + return true; +} + +static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned int *zone_sectors = data; + + if (!bdev_is_zoned(dev->bdev)) + return 0; + return bdev_zone_sectors(dev->bdev) != *zone_sectors; +} + +/* + * Check consistency of zoned model and zone sectors across all targets. For + * zone sectors, if the destination device is a zoned block device, it shall + * have the specified zone_sectors. + */ +static int validate_hardware_zoned_model(struct dm_table *t, + enum blk_zoned_model zoned_model, + unsigned int zone_sectors) +{ + if (zoned_model == BLK_ZONED_NONE) + return 0; + + if (!dm_table_supports_zoned_model(t, zoned_model)) { + DMERR("%s: zoned model is not consistent across all devices", + dm_device_name(t->md)); + return -EINVAL; + } + + /* Check zone size validity and compatibility */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) + return -EINVAL; + + if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { + DMERR("%s: zone sectors is not consistent across all zoned devices", + dm_device_name(t->md)); + return -EINVAL; + } + + return 0; +} + +/* + * Establish the new table's queue_limits and validate them. + */ +int dm_calculate_queue_limits(struct dm_table *t, + struct queue_limits *limits) +{ + struct queue_limits ti_limits; + enum blk_zoned_model zoned_model = BLK_ZONED_NONE; + unsigned int zone_sectors = 0; + + blk_set_stacking_limits(limits); + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + blk_set_stacking_limits(&ti_limits); + + if (!ti->type->iterate_devices) + goto combine_limits; + + /* + * Combine queue limits of all the devices this target uses. + */ + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { + /* + * After stacking all limits, validate all devices + * in table support this zoned model and zone sectors. + */ + zoned_model = ti_limits.zoned; + zone_sectors = ti_limits.chunk_sectors; + } + + /* Set I/O hints portion of queue limits */ + if (ti->type->io_hints) + ti->type->io_hints(ti, &ti_limits); + + /* + * Check each device area is consistent with the target's + * overall queue limits. + */ + if (ti->type->iterate_devices(ti, device_area_is_invalid, + &ti_limits)) + return -EINVAL; + +combine_limits: + /* + * Merge this target's queue limits into the overall limits + * for the table. + */ + if (blk_stack_limits(limits, &ti_limits, 0) < 0) + DMWARN("%s: adding target device (start sect %llu len %llu) " + "caused an alignment inconsistency", + dm_device_name(t->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + } + + /* + * Verify that the zoned model and zone sectors, as determined before + * any .io_hints override, are the same across all devices in the table. + * - this is especially relevant if .io_hints is emulating a disk-managed + * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices. + * BUT... + */ + if (limits->zoned != BLK_ZONED_NONE) { + /* + * ...IF the above limits stacking determined a zoned model + * validate that all of the table's devices conform to it. + */ + zoned_model = limits->zoned; + zone_sectors = limits->chunk_sectors; + } + if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) + return -EINVAL; + + return validate_hardware_logical_block_alignment(t, limits); +} + +/* + * Verify that all devices have an integrity profile that matches the + * DM device's registered integrity profile. If the profiles don't + * match then unregister the DM device's integrity profile. + */ +static void dm_table_verify_integrity(struct dm_table *t) +{ + struct gendisk *template_disk = NULL; + + if (t->integrity_added) + return; + + if (t->integrity_supported) { + /* + * Verify that the original integrity profile + * matches all the devices in this table. + */ + template_disk = dm_table_get_integrity_disk(t); + if (template_disk && + blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) + return; + } + + if (integrity_profile_exists(dm_disk(t->md))) { + DMWARN("%s: unable to establish an integrity profile", + dm_device_name(t->md)); + blk_integrity_unregister(dm_disk(t->md)); + } +} + +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned long flush = (unsigned long) data; + struct request_queue *q = bdev_get_queue(dev->bdev); + + return (q->queue_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) +{ + /* + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. + */ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->num_flush_bios) + continue; + + if (ti->flush_supported) + return true; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) + return true; + } + + return false; +} + +static int device_dax_write_cache_enabled(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dax_device *dax_dev = dev->dax_dev; + + if (!dax_dev) + return false; + + if (dax_write_cache_enabled(dax_dev)) + return true; + return false; +} + +static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return !bdev_nonrot(dev->bdev); +} + +static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return !blk_queue_add_random(q); +} + +static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return !q->limits.max_write_zeroes_sectors; +} + +static bool dm_table_supports_write_zeroes(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->num_write_zeroes_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) + return false; + } + + return true; +} + +static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return !bdev_nowait(dev->bdev); +} + +static bool dm_table_supports_nowait(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_supports_nowait(ti->type)) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_nowait_capable, NULL)) + return false; + } + + return true; +} + +static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return !bdev_max_discard_sectors(dev->bdev); +} + +static bool dm_table_supports_discards(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->num_discard_bios) + return false; + + /* + * Either the target provides discard support (as implied by setting + * 'discards_supported') or it relies on _all_ data devices having + * discard support. + */ + if (!ti->discards_supported && + (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_discard_capable, NULL))) + return false; + } + + return true; +} + +static int device_not_secure_erase_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !bdev_max_secure_erase_sectors(dev->bdev); +} + +static bool dm_table_supports_secure_erase(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->num_secure_erase_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL)) + return false; + } + + return true; +} + +static int device_requires_stable_pages(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return bdev_stable_writes(dev->bdev); +} + +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) +{ + bool wc = false, fua = false; + int r; + + /* + * Copy table's limits to the DM device's request_queue + */ + q->limits = *limits; + + if (dm_table_supports_nowait(t)) + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); + else + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); + + if (!dm_table_supports_discards(t)) { + q->limits.max_discard_sectors = 0; + q->limits.max_hw_discard_sectors = 0; + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.discard_misaligned = 0; + } + + if (!dm_table_supports_secure_erase(t)) + q->limits.max_secure_erase_sectors = 0; + + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { + wc = true; + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) + fua = true; + } + blk_queue_write_cache(q, wc, fua); + + if (dm_table_supports_dax(t, device_not_dax_capable)) { + blk_queue_flag_set(QUEUE_FLAG_DAX, q); + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) + set_dax_synchronous(t->md->dax_dev); + } + else + blk_queue_flag_clear(QUEUE_FLAG_DAX, q); + + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) + dax_write_cache(t->md->dax_dev, true); + + /* Ensure that all underlying devices are non-rotational. */ + if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + + if (!dm_table_supports_write_zeroes(t)) + q->limits.max_write_zeroes_sectors = 0; + + dm_table_verify_integrity(t); + + /* + * Some devices don't use blk_integrity but still want stable pages + * because they do their own checksumming. + * If any underlying device requires stable pages, a table must require + * them as well. Only targets that support iterate_devices are considered: + * don't want error, zero, etc to require stable pages. + */ + if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); + + /* + * Determine whether or not this queue's I/O timings contribute + * to the entropy pool, Only request-based targets use this. + * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not + * have it set. + */ + if (blk_queue_add_random(q) && + dm_table_any_dev_attr(t, device_is_not_random, NULL)) + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); + + /* + * For a zoned target, setup the zones related queue attributes + * and resources necessary for zone append emulation if necessary. + */ + if (blk_queue_is_zoned(q)) { + r = dm_set_zones_restrictions(t, q); + if (r) + return r; + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); + } + + dm_update_crypto_profile(q, t); + disk_update_readahead(t->md->disk); + + /* + * Check for request-based device is left to + * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). + * + * For bio-based device, only set QUEUE_FLAG_POLL when all + * underlying devices supporting polling. + */ + if (__table_type_bio_based(t->type)) { + if (dm_table_supports_poll(t)) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + } + + return 0; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +fmode_t dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} +EXPORT_SYMBOL(dm_table_get_mode); + +enum suspend_mode { + PRESUSPEND, + PRESUSPEND_UNDO, + POSTSUSPEND, +}; + +static void suspend_targets(struct dm_table *t, enum suspend_mode mode) +{ + lockdep_assert_held(&t->md->suspend_lock); + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + switch (mode) { + case PRESUSPEND: + if (ti->type->presuspend) + ti->type->presuspend(ti); + break; + case PRESUSPEND_UNDO: + if (ti->type->presuspend_undo) + ti->type->presuspend_undo(ti); + break; + case POSTSUSPEND: + if (ti->type->postsuspend) + ti->type->postsuspend(ti); + break; + } + } +} + +void dm_table_presuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, PRESUSPEND); +} + +void dm_table_presuspend_undo_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, PRESUSPEND_UNDO); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, POSTSUSPEND); +} + +int dm_table_resume_targets(struct dm_table *t) +{ + unsigned int i; + int r = 0; + + lockdep_assert_held(&t->md->suspend_lock); + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->type->preresume) + continue; + + r = ti->type->preresume(ti); + if (r) { + DMERR("%s: %s: preresume failed, error = %d", + dm_device_name(t->md), ti->type->name, r); + return r; + } + } + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->type->resume) + ti->type->resume(ti); + } + + return 0; +} + +struct mapped_device *dm_table_get_md(struct dm_table *t) +{ + return t->md; +} +EXPORT_SYMBOL(dm_table_get_md); + +const char *dm_table_device_name(struct dm_table *t) +{ + return dm_device_name(t->md); +} +EXPORT_SYMBOL_GPL(dm_table_device_name); + +void dm_table_run_md_queue_async(struct dm_table *t) +{ + if (!dm_table_request_based(t)) + return; + + if (t->md->queue) + blk_mq_run_hw_queues(t->md->queue, true); +} +EXPORT_SYMBOL(dm_table_run_md_queue_async); + diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c new file mode 100644 index 000000000..8cd5184e6 --- /dev/null +++ b/drivers/md/dm-target.c @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm-core.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "target" + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +static inline struct target_type *__find_target_type(const char *name) +{ + struct target_type *tt; + + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; + + return NULL; +} + +static struct target_type *get_target_type(const char *name) +{ + struct target_type *tt; + + down_read(&_lock); + + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; + + up_read(&_lock); + return tt; +} + +static void load_module(const char *name) +{ + request_module("dm-%s", name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct target_type *tt = get_target_type(name); + + if (!tt) { + load_module(name); + tt = get_target_type(name); + } + + return tt; +} + +void dm_put_target_type(struct target_type *tt) +{ + down_read(&_lock); + module_put(tt->module); + up_read(&_lock); +} + +int dm_target_iterate(void (*iter_func)(struct target_type *tt, + void *param), void *param) +{ + struct target_type *tt; + + down_read(&_lock); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); + up_read(&_lock); + + return 0; +} + +int dm_register_target(struct target_type *tt) +{ + int rv = 0; + + down_write(&_lock); + if (__find_target_type(tt->name)) + rv = -EEXIST; + else + list_add(&tt->list, &_targets); + + up_write(&_lock); + return rv; +} + +void dm_unregister_target(struct target_type *tt) +{ + down_write(&_lock); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); + BUG(); + } + + list_del(&tt->list); + + up_write(&_lock); +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) +{ + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_bios = 1; + + return 0; +} + +static void io_err_dtr(struct dm_target *tt) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *tt, struct bio *bio) +{ + return DM_MAPIO_KILL; +} + +static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return DM_MAPIO_KILL; +} + +static void io_err_release_clone_rq(struct request *clone, + union map_info *map_context) +{ +} + +static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .version = {1, 5, 0}, + .features = DM_TARGET_WILDCARD, + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, + .clone_and_map_rq = io_err_clone_and_map_rq, + .release_clone_rq = io_err_release_clone_rq, + .direct_access = io_err_dax_direct_access, +}; + +int __init dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + dm_unregister_target(&error_target); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c new file mode 100644 index 000000000..bb0e0a270 --- /dev/null +++ b/drivers/md/dm-thin-metadata.c @@ -0,0 +1,2154 @@ +/* + * Copyright (C) 2011-2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-thin-metadata.h" +#include "persistent-data/dm-btree.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include +#include +#include + +/*-------------------------------------------------------------------------- + * As far as the metadata goes, there is: + * + * - A superblock in block zero, taking up fewer than 512 bytes for + * atomic writes. + * + * - A space map managing the metadata blocks. + * + * - A space map managing the data blocks. + * + * - A btree mapping our internal thin dev ids onto struct disk_device_details. + * + * - A hierarchical btree, with 2 levels which effectively maps (thin + * dev id, virtual block) -> block_time. Block time is a 64-bit + * field holding the time in the low 24 bits, and block in the top 40 + * bits. + * + * BTrees consist solely of btree_nodes, that fill a block. Some are + * internal nodes, as such their values are a __le64 pointing to other + * nodes. Leaf nodes can store data of any reasonable size (ie. much + * smaller than the block size). The nodes consist of the header, + * followed by an array of keys, followed by an array of values. We have + * to binary search on the keys so they're all held together to help the + * cpu cache. + * + * Space maps have 2 btrees: + * + * - One maps a uint64_t onto a struct index_entry. Which points to a + * bitmap block, and has some details about how many free entries there + * are etc. + * + * - The bitmap blocks have a header (for the checksum). Then the rest + * of the block is pairs of bits. With the meaning being: + * + * 0 - ref count is 0 + * 1 - ref count is 1 + * 2 - ref count is 2 + * 3 - ref count is higher than 2 + * + * - If the count is higher than 2 then the ref count is entered in a + * second btree that directly maps the block_address to a uint32_t ref + * count. + * + * The space map metadata variant doesn't have a bitmaps btree. Instead + * it has one single blocks worth of index_entries. This avoids + * recursive issues with the bitmap btree needing to allocate space in + * order to insert. With a small data block size such as 64k the + * metadata support data devices that are hundreds of terrabytes. + * + * The space maps allocate space linearly from front to back. Space that + * is freed in a transaction is never recycled within that transaction. + * To try and avoid fragmenting _free_ space the allocator always goes + * back and fills in gaps. + * + * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks + * from the block manager. + *--------------------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "thin metadata" + +#define THIN_SUPERBLOCK_MAGIC 27022010 +#define THIN_SUPERBLOCK_LOCATION 0 +#define THIN_VERSION 2 +#define SECTOR_TO_BLOCK_SHIFT 3 + +/* + * For btree insert: + * 3 for btree insert + + * 2 for btree lookup used within space map + * For btree remove: + * 2 for shadow spine + + * 4 for rebalance 3 child node + */ +#define THIN_MAX_CONCURRENT_LOCKS 6 + +/* This should be plenty */ +#define SPACE_MAP_ROOT_SIZE 128 + +/* + * Little endian on-disk superblock and device details. + */ +struct thin_disk_superblock { + __le32 csum; /* Checksum of superblock except for this field. */ + __le32 flags; + __le64 blocknr; /* This block number, dm_block_t. */ + + __u8 uuid[16]; + __le64 magic; + __le32 version; + __le32 time; + + __le64 trans_id; + + /* + * Root held by userspace transactions. + */ + __le64 held_root; + + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + /* + * 2-level btree mapping (dev_id, (dev block, time)) -> data block + */ + __le64 data_mapping_root; + + /* + * Device detail root mapping dev_id -> device_details + */ + __le64 device_details_root; + + __le32 data_block_size; /* In 512-byte sectors. */ + + __le32 metadata_block_size; /* In 512-byte sectors. */ + __le64 metadata_nr_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; +} __packed; + +struct disk_device_details { + __le64 mapped_blocks; + __le64 transaction_id; /* When created. */ + __le32 creation_time; + __le32 snapshotted_time; +} __packed; + +struct dm_pool_metadata { + struct hlist_node hash; + + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_space_map *data_sm; + struct dm_transaction_manager *tm; + struct dm_transaction_manager *nb_tm; + + /* + * Two-level btree. + * First level holds thin_dev_t. + * Second level holds mappings. + */ + struct dm_btree_info info; + + /* + * Non-blocking version of the above. + */ + struct dm_btree_info nb_info; + + /* + * Just the top level for deleting whole devices. + */ + struct dm_btree_info tl_info; + + /* + * Just the bottom level for creating new devices. + */ + struct dm_btree_info bl_info; + + /* + * Describes the device details btree. + */ + struct dm_btree_info details_info; + + struct rw_semaphore root_lock; + uint32_t time; + dm_block_t root; + dm_block_t details_root; + struct list_head thin_devices; + uint64_t trans_id; + unsigned long flags; + sector_t data_block_size; + + /* + * Pre-commit callback. + * + * This allows the thin provisioning target to run a callback before + * the metadata are committed. + */ + dm_pool_pre_commit_fn pre_commit_fn; + void *pre_commit_context; + + /* + * We reserve a section of the metadata for commit overhead. + * All reported space does *not* include this. + */ + dm_block_t metadata_reserve; + + /* + * Set if a transaction has to be aborted but the attempt to roll back + * to the previous (good) transaction failed. The only pool metadata + * operation possible in this state is the closing of the device. + */ + bool fail_io:1; + + /* + * Set once a thin-pool has been accessed through one of the interfaces + * that imply the pool is in-service (e.g. thin devices created/deleted, + * thin-pool message, metadata snapshots, etc). + */ + bool in_service:1; + + /* + * Reading the space map roots can fail, so we read it into these + * buffers before the superblock is locked and updated. + */ + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; +}; + +struct dm_thin_device { + struct list_head list; + struct dm_pool_metadata *pmd; + dm_thin_id id; + + int open_count; + bool changed:1; + bool aborted_with_changes:1; + uint64_t mapped_blocks; + uint64_t transaction_id; + uint32_t creation_time; + uint32_t snapshotted_time; +}; + +/*---------------------------------------------------------------- + * superblock validator + *--------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 160774 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk_super->magic), + (unsigned long long)THIN_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*---------------------------------------------------------------- + * Methods for the btree value types + *--------------------------------------------------------------*/ + +static uint64_t pack_block_time(dm_block_t b, uint32_t t) +{ + return (b << 24) | t; +} + +static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) +{ + *b = v >> 24; + *t = v & ((1 << 24) - 1); +} + +/* + * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as + * possible. 'with_runs' reads contiguous runs of blocks, and calls the + * given sm function. + */ +typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t); + +static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned int count, run_fn fn) +{ + uint64_t b, begin, end; + uint32_t t; + bool in_run = false; + unsigned int i; + + for (i = 0; i < count; i++, value_le++) { + /* We know value_le is 8 byte aligned */ + unpack_block_time(le64_to_cpu(*value_le), &b, &t); + + if (in_run) { + if (b == end) { + end++; + } else { + fn(sm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(sm, begin, end); +} + +static void data_block_inc(void *context, const void *value_le, unsigned int count) +{ + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_inc_blocks); +} + +static void data_block_dec(void *context, const void *value_le, unsigned int count) +{ + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_dec_blocks); +} + +static int data_block_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + uint64_t b1, b2; + uint32_t t; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + unpack_block_time(le64_to_cpu(v1_le), &b1, &t); + unpack_block_time(le64_to_cpu(v2_le), &b2, &t); + + return b1 == b2; +} + +static void subtree_inc(void *context, const void *value, unsigned int count) +{ + struct dm_btree_info *info = context; + const __le64 *root_le = value; + unsigned int i; + + for (i = 0; i < count; i++, root_le++) + dm_tm_inc(info->tm, le64_to_cpu(*root_le)); +} + +static void subtree_dec(void *context, const void *value, unsigned int count) +{ + struct dm_btree_info *info = context; + const __le64 *root_le = value; + unsigned int i; + + for (i = 0; i < count; i++, root_le++) + if (dm_btree_del(info, le64_to_cpu(*root_le))) + DMERR("btree delete failed"); +} + +static int subtree_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + + return v1_le == v2_le; +} + +/*----------------------------------------------------------------*/ + +/* + * Variant that is used for in-core only changes or code that + * shouldn't put the pool in service on its own (e.g. commit). + */ +static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd) + __acquires(pmd->root_lock) +{ + down_write(&pmd->root_lock); +} + +static inline void pmd_write_lock(struct dm_pool_metadata *pmd) +{ + pmd_write_lock_in_core(pmd); + if (unlikely(!pmd->in_service)) + pmd->in_service = true; +} + +static inline void pmd_write_unlock(struct dm_pool_metadata *pmd) + __releases(pmd->root_lock) +{ + up_write(&pmd->root_lock); +} + +/*----------------------------------------------------------------*/ + +static int superblock_lock_zero(struct dm_pool_metadata *pmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct dm_pool_metadata *pmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +{ + int r; + unsigned int i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned int block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. + */ + r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = 1; + for (i = 0; i < block_size; i++) { + if (data_le[i] != zero) { + *result = 0; + break; + } + } + + dm_bm_unlock(b); + + return 0; +} + +static void __setup_btree_details(struct dm_pool_metadata *pmd) +{ + pmd->info.tm = pmd->tm; + pmd->info.levels = 2; + pmd->info.value_type.context = pmd->data_sm; + pmd->info.value_type.size = sizeof(__le64); + pmd->info.value_type.inc = data_block_inc; + pmd->info.value_type.dec = data_block_dec; + pmd->info.value_type.equal = data_block_equal; + + memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); + pmd->nb_info.tm = pmd->nb_tm; + + pmd->tl_info.tm = pmd->tm; + pmd->tl_info.levels = 1; + pmd->tl_info.value_type.context = &pmd->bl_info; + pmd->tl_info.value_type.size = sizeof(__le64); + pmd->tl_info.value_type.inc = subtree_inc; + pmd->tl_info.value_type.dec = subtree_dec; + pmd->tl_info.value_type.equal = subtree_equal; + + pmd->bl_info.tm = pmd->tm; + pmd->bl_info.levels = 1; + pmd->bl_info.value_type.context = pmd->data_sm; + pmd->bl_info.value_type.size = sizeof(__le64); + pmd->bl_info.value_type.inc = data_block_inc; + pmd->bl_info.value_type.dec = data_block_dec; + pmd->bl_info.value_type.equal = data_block_equal; + + pmd->details_info.tm = pmd->tm; + pmd->details_info.levels = 1; + pmd->details_info.value_type.context = NULL; + pmd->details_info.value_type.size = sizeof(struct disk_device_details); + pmd->details_info.value_type.inc = NULL; + pmd->details_info.value_type.dec = NULL; + pmd->details_info.value_type.equal = NULL; +} + +static int save_sm_roots(struct dm_pool_metadata *pmd) +{ + int r; + size_t len; + + r = dm_sm_root_size(pmd->metadata_sm, &len); + if (r < 0) + return r; + + r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len); + if (r < 0) + return r; + + r = dm_sm_root_size(pmd->data_sm, &len); + if (r < 0) + return r; + + return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len); +} + +static void copy_sm_roots(struct dm_pool_metadata *pmd, + struct thin_disk_superblock *disk) +{ + memcpy(&disk->metadata_space_map_root, + &pmd->metadata_space_map_root, + sizeof(pmd->metadata_space_map_root)); + + memcpy(&disk->data_space_map_root, + &pmd->data_space_map_root, + sizeof(pmd->data_space_map_root)); +} + +static int __write_initial_superblock(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + sector_t bdev_size = bdev_nr_sectors(pmd->bdev); + + if (bdev_size > THIN_METADATA_MAX_SECTORS) + bdev_size = THIN_METADATA_MAX_SECTORS; + + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + return r; + + r = dm_tm_pre_commit(pmd->tm); + if (r < 0) + return r; + + r = save_sm_roots(pmd); + if (r < 0) + return r; + + r = superblock_lock_zero(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->flags = 0; + memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); + disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(THIN_VERSION); + disk_super->time = 0; + disk_super->trans_id = 0; + disk_super->held_root = 0; + + copy_sm_roots(pmd, disk_super); + + disk_super->data_mapping_root = cpu_to_le64(pmd->root); + disk_super->device_details_root = cpu_to_le64(pmd->details_root); + disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE); + disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); + disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); + + return dm_tm_commit(pmd->tm, sblock); +} + +static int __format_metadata(struct dm_pool_metadata *pmd) +{ + int r; + + r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &pmd->tm, &pmd->metadata_sm); + if (r < 0) { + pmd->tm = NULL; + pmd->metadata_sm = NULL; + DMERR("tm_create_with_sm failed"); + return r; + } + + pmd->data_sm = dm_sm_disk_create(pmd->tm, 0); + if (IS_ERR(pmd->data_sm)) { + DMERR("sm_disk_create failed"); + r = PTR_ERR(pmd->data_sm); + pmd->data_sm = NULL; + goto bad_cleanup_tm; + } + + pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); + if (!pmd->nb_tm) { + DMERR("could not create non-blocking clone tm"); + r = -ENOMEM; + goto bad_cleanup_data_sm; + } + + __setup_btree_details(pmd); + + r = dm_btree_empty(&pmd->info, &pmd->root); + if (r < 0) + goto bad_cleanup_nb_tm; + + r = dm_btree_empty(&pmd->details_info, &pmd->details_root); + if (r < 0) { + DMERR("couldn't create devices root"); + goto bad_cleanup_nb_tm; + } + + r = __write_initial_superblock(pmd); + if (r) + goto bad_cleanup_nb_tm; + + return 0; + +bad_cleanup_nb_tm: + dm_tm_destroy(pmd->nb_tm); + pmd->nb_tm = NULL; +bad_cleanup_data_sm: + dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; +bad_cleanup_tm: + dm_tm_destroy(pmd->tm); + pmd->tm = NULL; + dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; + + return r; +} + +static int __check_incompat_features(struct thin_disk_superblock *disk_super, + struct dm_pool_metadata *pmd) +{ + uint32_t features; + + features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. + */ + if (bdev_read_only(pmd->bdev)) + return 0; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + return 0; +} + +static int __open_metadata(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r < 0) { + DMERR("couldn't read superblock"); + return r; + } + + disk_super = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)pmd->data_block_size); + r = -EINVAL; + goto bad_unlock_sblock; + } + + r = __check_incompat_features(disk_super, pmd); + if (r < 0) + goto bad_unlock_sblock; + + r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, + disk_super->metadata_space_map_root, + sizeof(disk_super->metadata_space_map_root), + &pmd->tm, &pmd->metadata_sm); + if (r < 0) { + pmd->tm = NULL; + pmd->metadata_sm = NULL; + DMERR("tm_open_with_sm failed"); + goto bad_unlock_sblock; + } + + pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root, + sizeof(disk_super->data_space_map_root)); + if (IS_ERR(pmd->data_sm)) { + DMERR("sm_disk_open failed"); + r = PTR_ERR(pmd->data_sm); + pmd->data_sm = NULL; + goto bad_cleanup_tm; + } + + pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); + if (!pmd->nb_tm) { + DMERR("could not create non-blocking clone tm"); + r = -ENOMEM; + goto bad_cleanup_data_sm; + } + + /* + * For pool metadata opening process, root setting is redundant + * because it will be set again in __begin_transaction(). But dm + * pool aborting process really needs to get last transaction's + * root to avoid accessing broken btree. + */ + pmd->root = le64_to_cpu(disk_super->data_mapping_root); + pmd->details_root = le64_to_cpu(disk_super->device_details_root); + + __setup_btree_details(pmd); + dm_bm_unlock(sblock); + + return 0; + +bad_cleanup_data_sm: + dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; +bad_cleanup_tm: + dm_tm_destroy(pmd->tm); + pmd->tm = NULL; + dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; +bad_unlock_sblock: + dm_bm_unlock(sblock); + + return r; +} + +static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device) +{ + int r, unformatted; + + r = __superblock_all_zeroes(pmd->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return format_device ? __format_metadata(pmd) : -EPERM; + + return __open_metadata(pmd); +} + +static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device) +{ + int r; + + pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + THIN_MAX_CONCURRENT_LOCKS); + if (IS_ERR(pmd->bm)) { + DMERR("could not create block manager"); + r = PTR_ERR(pmd->bm); + pmd->bm = NULL; + return r; + } + + r = __open_or_format_metadata(pmd, format_device); + if (r) { + dm_block_manager_destroy(pmd->bm); + pmd->bm = NULL; + } + + return r; +} + +static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd, + bool destroy_bm) +{ + dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; + dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; + dm_tm_destroy(pmd->nb_tm); + pmd->nb_tm = NULL; + dm_tm_destroy(pmd->tm); + pmd->tm = NULL; + if (destroy_bm) + dm_block_manager_destroy(pmd->bm); +} + +static int __begin_transaction(struct dm_pool_metadata *pmd) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. + */ + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + pmd->time = le32_to_cpu(disk_super->time); + pmd->root = le64_to_cpu(disk_super->data_mapping_root); + pmd->details_root = le64_to_cpu(disk_super->device_details_root); + pmd->trans_id = le64_to_cpu(disk_super->trans_id); + pmd->flags = le32_to_cpu(disk_super->flags); + pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + + dm_bm_unlock(sblock); + return 0; +} + +static int __write_changed_details(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_thin_device *td, *tmp; + struct disk_device_details details; + uint64_t key; + + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (!td->changed) + continue; + + key = td->id; + + details.mapped_blocks = cpu_to_le64(td->mapped_blocks); + details.transaction_id = cpu_to_le64(td->transaction_id); + details.creation_time = cpu_to_le32(td->creation_time); + details.snapshotted_time = cpu_to_le32(td->snapshotted_time); + __dm_bless_for_disk(&details); + + r = dm_btree_insert(&pmd->details_info, pmd->details_root, + &key, &details, &pmd->details_root); + if (r) + return r; + + if (td->open_count) + td->changed = false; + else { + list_del(&td->list); + kfree(td); + } + } + + return 0; +} + +static int __commit_transaction(struct dm_pool_metadata *pmd) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the thin_disk_superblock exceeds a 512-byte sector. + */ + BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + BUG_ON(!rwsem_is_locked(&pmd->root_lock)); + + if (unlikely(!pmd->in_service)) + return 0; + + if (pmd->pre_commit_fn) { + r = pmd->pre_commit_fn(pmd->pre_commit_context); + if (r < 0) { + DMERR("pre-commit callback failed"); + return r; + } + } + + r = __write_changed_details(pmd); + if (r < 0) + return r; + + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + return r; + + r = dm_tm_pre_commit(pmd->tm); + if (r < 0) + return r; + + r = save_sm_roots(pmd); + if (r < 0) + return r; + + r = superblock_lock(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->time = cpu_to_le32(pmd->time); + disk_super->data_mapping_root = cpu_to_le64(pmd->root); + disk_super->device_details_root = cpu_to_le64(pmd->details_root); + disk_super->trans_id = cpu_to_le64(pmd->trans_id); + disk_super->flags = cpu_to_le32(pmd->flags); + + copy_sm_roots(pmd, disk_super); + + return dm_tm_commit(pmd->tm, sblock); +} + +static void __set_metadata_reserve(struct dm_pool_metadata *pmd) +{ + int r; + dm_block_t total; + dm_block_t max_blocks = 4096; /* 16M */ + + r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total); + if (r) { + DMERR("could not get size of metadata device"); + pmd->metadata_reserve = max_blocks; + } else + pmd->metadata_reserve = min(max_blocks, div_u64(total, 10)); +} + +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool format_device) +{ + int r; + struct dm_pool_metadata *pmd; + + pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); + if (!pmd) { + DMERR("could not allocate metadata struct"); + return ERR_PTR(-ENOMEM); + } + + init_rwsem(&pmd->root_lock); + pmd->time = 0; + INIT_LIST_HEAD(&pmd->thin_devices); + pmd->fail_io = false; + pmd->in_service = false; + pmd->bdev = bdev; + pmd->data_block_size = data_block_size; + pmd->pre_commit_fn = NULL; + pmd->pre_commit_context = NULL; + + r = __create_persistent_data_objects(pmd, format_device); + if (r) { + kfree(pmd); + return ERR_PTR(r); + } + + r = __begin_transaction(pmd); + if (r < 0) { + if (dm_pool_metadata_close(pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + return ERR_PTR(r); + } + + __set_metadata_reserve(pmd); + + return pmd; +} + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd) +{ + int r; + unsigned int open_devices = 0; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->open_count) + open_devices++; + else { + list_del(&td->list); + kfree(td); + } + } + up_read(&pmd->root_lock); + + if (open_devices) { + DMERR("attempt to close pmd when %u device(s) are still open", + open_devices); + return -EBUSY; + } + + pmd_write_lock_in_core(pmd); + if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) { + r = __commit_transaction(pmd); + if (r < 0) + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + } + pmd_write_unlock(pmd); + __destroy_persistent_data_objects(pmd, true); + + kfree(pmd); + return 0; +} + +/* + * __open_device: Returns @td corresponding to device with id @dev, + * creating it if @create is set and incrementing @td->open_count. + * On failure, @td is undefined. + */ +static int __open_device(struct dm_pool_metadata *pmd, + dm_thin_id dev, int create, + struct dm_thin_device **td) +{ + int r, changed = 0; + struct dm_thin_device *td2; + uint64_t key = dev; + struct disk_device_details details_le; + + /* + * If the device is already open, return it. + */ + list_for_each_entry(td2, &pmd->thin_devices, list) + if (td2->id == dev) { + /* + * May not create an already-open device. + */ + if (create) + return -EEXIST; + + td2->open_count++; + *td = td2; + return 0; + } + + /* + * Check the device exists. + */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, &details_le); + if (r) { + if (r != -ENODATA || !create) + return r; + + /* + * Create new device. + */ + changed = 1; + details_le.mapped_blocks = 0; + details_le.transaction_id = cpu_to_le64(pmd->trans_id); + details_le.creation_time = cpu_to_le32(pmd->time); + details_le.snapshotted_time = cpu_to_le32(pmd->time); + } + + *td = kmalloc(sizeof(**td), GFP_NOIO); + if (!*td) + return -ENOMEM; + + (*td)->pmd = pmd; + (*td)->id = dev; + (*td)->open_count = 1; + (*td)->changed = changed; + (*td)->aborted_with_changes = false; + (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); + (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); + (*td)->creation_time = le32_to_cpu(details_le.creation_time); + (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); + + list_add(&(*td)->list, &pmd->thin_devices); + + return 0; +} + +static void __close_device(struct dm_thin_device *td) +{ + --td->open_count; +} + +static int __create_thin(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r; + dm_block_t dev_root; + uint64_t key = dev; + struct dm_thin_device *td; + __le64 value; + + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, NULL); + if (!r) + return -EEXIST; + + /* + * Create an empty btree for the mappings. + */ + r = dm_btree_empty(&pmd->bl_info, &dev_root); + if (r) + return r; + + /* + * Insert it into the main mapping tree. + */ + value = cpu_to_le64(dev_root); + __dm_bless_for_disk(&value); + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + + r = __open_device(pmd, dev, 1, &td); + if (r) { + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + __close_device(td); + + return r; +} + +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __create_thin(pmd, dev); + pmd_write_unlock(pmd); + + return r; +} + +static int __set_snapshot_details(struct dm_pool_metadata *pmd, + struct dm_thin_device *snap, + dm_thin_id origin, uint32_t time) +{ + int r; + struct dm_thin_device *td; + + r = __open_device(pmd, origin, 0, &td); + if (r) + return r; + + td->changed = true; + td->snapshotted_time = time; + + snap->mapped_blocks = td->mapped_blocks; + snap->snapshotted_time = time; + __close_device(td); + + return 0; +} + +static int __create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, dm_thin_id origin) +{ + int r; + dm_block_t origin_root; + uint64_t key = origin, dev_key = dev; + struct dm_thin_device *td; + __le64 value; + + /* check this device is unused */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &dev_key, NULL); + if (!r) + return -EEXIST; + + /* find the mapping tree for the origin */ + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); + if (r) + return r; + origin_root = le64_to_cpu(value); + + /* clone the origin, an inc will do */ + dm_tm_inc(pmd->tm, origin_root); + + /* insert into the main mapping tree */ + value = cpu_to_le64(origin_root); + __dm_bless_for_disk(&value); + key = dev; + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_tm_dec(pmd->tm, origin_root); + return r; + } + + pmd->time++; + + r = __open_device(pmd, dev, 1, &td); + if (r) + goto bad; + + r = __set_snapshot_details(pmd, td, origin, pmd->time); + __close_device(td); + + if (r) + goto bad; + + return 0; + +bad: + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + return r; +} + +int dm_pool_create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, + dm_thin_id origin) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __create_snap(pmd, dev, origin); + pmd_write_unlock(pmd); + + return r; +} + +static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r; + uint64_t key = dev; + struct dm_thin_device *td; + + /* TODO: failure should mark the transaction invalid */ + r = __open_device(pmd, dev, 0, &td); + if (r) + return r; + + if (td->open_count > 1) { + __close_device(td); + return -EBUSY; + } + + list_del(&td->list); + kfree(td); + r = dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + if (r) + return r; + + r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + if (r) + return r; + + return 0; +} + +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __delete_device(pmd, dev); + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + + if (pmd->fail_io) + goto out; + + if (pmd->trans_id != current_id) { + DMERR("mismatched transaction id"); + goto out; + } + + pmd->trans_id = new_id; + r = 0; + +out: + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + *result = pmd->trans_id; + r = 0; + } + up_read(&pmd->root_lock); + + return r; +} + +static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r, inc; + struct thin_disk_superblock *disk_super; + struct dm_block *copy, *sblock; + dm_block_t held_root; + + /* + * We commit to ensure the btree roots which we increment in a + * moment are up to date. + */ + r = __commit_transaction(pmd); + if (r < 0) { + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + return r; + } + + /* + * Copy the superblock. + */ + dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); + r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, ©, &inc); + if (r) + return r; + + BUG_ON(!inc); + + held_root = dm_block_location(copy); + disk_super = dm_block_data(copy); + + if (le64_to_cpu(disk_super->held_root)) { + DMWARN("Pool metadata snapshot already exists: release this before taking another."); + + dm_tm_dec(pmd->tm, held_root); + dm_tm_unlock(pmd->tm, copy); + return -EBUSY; + } + + /* + * Wipe the spacemap since we're not publishing this. + */ + memset(&disk_super->data_space_map_root, 0, + sizeof(disk_super->data_space_map_root)); + memset(&disk_super->metadata_space_map_root, 0, + sizeof(disk_super->metadata_space_map_root)); + + /* + * Increment the data structures that need to be preserved. + */ + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root)); + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root)); + dm_tm_unlock(pmd->tm, copy); + + /* + * Write the held root into the superblock. + */ + r = superblock_lock(pmd, &sblock); + if (r) { + dm_tm_dec(pmd->tm, held_root); + return r; + } + + disk_super = dm_block_data(sblock); + disk_super->held_root = cpu_to_le64(held_root); + dm_bm_unlock(sblock); + return 0; +} + +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __reserve_metadata_snap(pmd); + pmd_write_unlock(pmd); + + return r; +} + +static int __release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock, *copy; + dm_block_t held_root; + + r = superblock_lock(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + held_root = le64_to_cpu(disk_super->held_root); + disk_super->held_root = cpu_to_le64(0); + + dm_bm_unlock(sblock); + + if (!held_root) { + DMWARN("No pool metadata snapshot found: nothing to release."); + return -EINVAL; + } + + r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©); + if (r) + return r; + + disk_super = dm_block_data(copy); + dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root)); + dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root)); + dm_sm_dec_block(pmd->metadata_sm, held_root); + + dm_tm_unlock(pmd->tm, copy); + + return 0; +} + +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __release_metadata_snap(pmd); + pmd_write_unlock(pmd); + + return r; +} + +static int __get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + *result = le64_to_cpu(disk_super->held_root); + + dm_bm_unlock(sblock); + + return 0; +} + +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = __get_metadata_snap(pmd, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td) +{ + int r = -EINVAL; + + pmd_write_lock_in_core(pmd); + if (!pmd->fail_io) + r = __open_device(pmd, dev, 0, td); + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_close_thin_device(struct dm_thin_device *td) +{ + pmd_write_lock_in_core(td->pmd); + __close_device(td); + pmd_write_unlock(td->pmd); + + return 0; +} + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) +{ + return td->id; +} + +/* + * Check whether @time (of block creation) is older than @td's last snapshot. + * If so then the associated block is shared with the last snapshot device. + * Any block on a device created *after* the device last got snapshotted is + * necessarily not shared. + */ +static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) +{ + return td->snapshotted_time > time; +} + +static void unpack_lookup_result(struct dm_thin_device *td, __le64 value, + struct dm_thin_lookup_result *result) +{ + uint64_t block_time = 0; + dm_block_t exception_block; + uint32_t exception_time; + + block_time = le64_to_cpu(value); + unpack_block_time(block_time, &exception_block, &exception_time); + result->block = exception_block; + result->shared = __snapshotted_since(td, exception_time); +} + +static int __find_block(struct dm_thin_device *td, dm_block_t block, + int can_issue_io, struct dm_thin_lookup_result *result) +{ + int r; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + struct dm_btree_info *info; + + if (can_issue_io) { + info = &pmd->info; + } else + info = &pmd->nb_info; + + r = dm_btree_lookup(info, pmd->root, keys, &value); + if (!r) + unpack_lookup_result(td, value, result); + + return r; +} + +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_issue_io, struct dm_thin_lookup_result *result) +{ + int r; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (pmd->fail_io) { + up_read(&pmd->root_lock); + return -EINVAL; + } + + r = __find_block(td, block, can_issue_io, result); + + up_read(&pmd->root_lock); + return r; +} + +static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t *vblock, + struct dm_thin_lookup_result *result) +{ + int r; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value); + if (!r) + unpack_lookup_result(td, value, result); + + return r; +} + +static int __find_mapped_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end, + dm_block_t *thin_begin, dm_block_t *thin_end, + dm_block_t *pool_begin, bool *maybe_shared) +{ + int r; + dm_block_t pool_end; + struct dm_thin_lookup_result lookup; + + if (end < begin) + return -ENODATA; + + r = __find_next_mapped_block(td, begin, &begin, &lookup); + if (r) + return r; + + if (begin >= end) + return -ENODATA; + + *thin_begin = begin; + *pool_begin = lookup.block; + *maybe_shared = lookup.shared; + + begin++; + pool_end = *pool_begin + 1; + while (begin != end) { + r = __find_block(td, begin, true, &lookup); + if (r) { + if (r == -ENODATA) + break; + else + return r; + } + + if ((lookup.block != pool_end) || + (lookup.shared != *maybe_shared)) + break; + + pool_end++; + begin++; + } + + *thin_end = begin; + return 0; +} + +int dm_thin_find_mapped_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end, + dm_block_t *thin_begin, dm_block_t *thin_end, + dm_block_t *pool_begin, bool *maybe_shared) +{ + int r = -EINVAL; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + r = __find_mapped_range(td, begin, end, thin_begin, thin_end, + pool_begin, maybe_shared); + } + up_read(&pmd->root_lock); + + return r; +} + +static int __insert(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r, inserted; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + value = cpu_to_le64(pack_block_time(data_block, pmd->time)); + __dm_bless_for_disk(&value); + + r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, + &pmd->root, &inserted); + if (r) + return r; + + td->changed = true; + if (inserted) + td->mapped_blocks++; + + return 0; +} + +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r = -EINVAL; + + pmd_write_lock(td->pmd); + if (!td->pmd->fail_io) + r = __insert(td, block, data_block); + pmd_write_unlock(td->pmd); + + return r; +} + +static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end) +{ + int r; + unsigned int count, total_count = 0; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[1] = { td->id }; + __le64 value; + dm_block_t mapping_root; + + /* + * Find the mapping tree + */ + r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value); + if (r) + return r; + + /* + * Remove from the mapping tree, taking care to inc the + * ref count so it doesn't get deleted. + */ + mapping_root = le64_to_cpu(value); + dm_tm_inc(pmd->tm, mapping_root); + r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root); + if (r) + return r; + + /* + * Remove leaves stops at the first unmapped entry, so we have to + * loop round finding mapped ranges. + */ + while (begin < end) { + r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value); + if (r == -ENODATA) + break; + + if (r) + return r; + + if (begin >= end) + break; + + r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); + if (r) + return r; + + total_count += count; + } + + td->mapped_blocks -= total_count; + td->changed = true; + + /* + * Reinsert the mapping tree. + */ + value = cpu_to_le64(mapping_root); + __dm_bless_for_disk(&value); + return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root); +} + +int dm_thin_remove_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end) +{ + int r = -EINVAL; + + pmd_write_lock(td->pmd); + if (!td->pmd->fail_io) + r = __remove_range(td, begin, end); + pmd_write_unlock(td->pmd); + + return r; +} + +int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) +{ + int r = -EINVAL; + uint32_t ref_count; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count > 1); + } + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = dm_sm_inc_blocks(pmd->data_sm, b, e); + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = dm_sm_dec_blocks(pmd->data_sm, b, e); + pmd_write_unlock(pmd); + + return r; +} + +bool dm_thin_changed_this_transaction(struct dm_thin_device *td) +{ + int r; + + down_read(&td->pmd->root_lock); + r = td->changed; + up_read(&td->pmd->root_lock); + + return r; +} + +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) +{ + bool r = false; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->changed) { + r = td->changed; + break; + } + } + up_read(&pmd->root_lock); + + return r; +} + +bool dm_thin_aborted_changes(struct dm_thin_device *td) +{ + bool r; + + down_read(&td->pmd->root_lock); + r = td->aborted_with_changes; + up_read(&td->pmd->root_lock); + + return r; +} + +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = dm_sm_new_block(pmd->data_sm, result); + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + /* + * Care is taken to not have commit be what + * triggers putting the thin-pool in-service. + */ + pmd_write_lock_in_core(pmd); + if (pmd->fail_io) + goto out; + + r = __commit_transaction(pmd); + if (r < 0) + goto out; + + /* + * Open the next transaction. + */ + r = __begin_transaction(pmd); +out: + pmd_write_unlock(pmd); + return r; +} + +static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) +{ + struct dm_thin_device *td; + + list_for_each_entry(td, &pmd->thin_devices, list) + td->aborted_with_changes = td->changed; +} + +int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + /* fail_io is double-checked with pmd->root_lock held below */ + if (unlikely(pmd->fail_io)) + return r; + + pmd_write_lock(pmd); + if (pmd->fail_io) { + pmd_write_unlock(pmd); + return r; + } + __set_abort_with_changes_flags(pmd); + + /* destroy data_sm/metadata_sm/nb_tm/tm */ + __destroy_persistent_data_objects(pmd, false); + + /* reset bm */ + dm_block_manager_reset(pmd->bm); + + /* rebuild data_sm/metadata_sm/nb_tm/tm */ + r = __open_or_format_metadata(pmd, false); + if (r) + pmd->fail_io = true; + pmd_write_unlock(pmd); + return r; +} + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_free(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_free(pmd->metadata_sm, result); + + if (!r) { + if (*result < pmd->metadata_reserve) + *result = 0; + else + *result -= pmd->metadata_reserve; + } + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_blocks(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) +{ + int r = -EINVAL; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + *result = td->mapped_blocks; + r = 0; + } + up_read(&pmd->root_lock); + + return r; +} + +static int __highest_block(struct dm_thin_device *td, dm_block_t *result) +{ + int r; + __le64 value_le; + dm_block_t thin_root; + struct dm_pool_metadata *pmd = td->pmd; + + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); + if (r) + return r; + + thin_root = le64_to_cpu(value_le); + + return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); +} + +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *result) +{ + int r = -EINVAL; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = __highest_block(td, result); + up_read(&pmd->root_lock); + + return r; +} + +static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count) +{ + int r; + dm_block_t old_count; + + r = dm_sm_get_nr_blocks(sm, &old_count); + if (r) + return r; + + if (new_count == old_count) + return 0; + + if (new_count < old_count) { + DMERR("cannot reduce size of space map"); + return -EINVAL; + } + + return dm_sm_extend(sm, new_count - old_count); +} + +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) + r = __resize_space_map(pmd->data_sm, new_count); + pmd_write_unlock(pmd); + + return r; +} + +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + pmd_write_lock(pmd); + if (!pmd->fail_io) { + r = __resize_space_map(pmd->metadata_sm, new_count); + if (!r) + __set_metadata_reserve(pmd); + } + pmd_write_unlock(pmd); + + return r; +} + +void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) +{ + pmd_write_lock_in_core(pmd); + dm_bm_set_read_only(pmd->bm); + pmd_write_unlock(pmd); +} + +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) +{ + pmd_write_lock_in_core(pmd); + dm_bm_set_read_write(pmd->bm); + pmd_write_unlock(pmd); +} + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + int r = -EINVAL; + + pmd_write_lock_in_core(pmd); + if (!pmd->fail_io) { + r = dm_sm_register_threshold_callback(pmd->metadata_sm, + threshold, fn, context); + } + pmd_write_unlock(pmd); + + return r; +} + +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context) +{ + pmd_write_lock_in_core(pmd); + pmd->pre_commit_fn = fn; + pmd->pre_commit_context = context; + pmd_write_unlock(pmd); +} + +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + + pmd_write_lock(pmd); + if (pmd->fail_io) + goto out; + + pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; + + r = superblock_lock(pmd, &sblock); + if (r) { + DMERR("couldn't lock superblock"); + goto out; + } + + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(pmd->flags); + + dm_bm_unlock(sblock); +out: + pmd_write_unlock(pmd); + return r; +} + +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) +{ + bool needs_check; + + down_read(&pmd->root_lock); + needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; + up_read(&pmd->root_lock); + + return needs_check; +} + +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) +{ + down_read(&pmd->root_lock); + if (!pmd->fail_io) + dm_tm_issue_prefetches(pmd->tm); + up_read(&pmd->root_lock); +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h new file mode 100644 index 000000000..4d7a2caf2 --- /dev/null +++ b/drivers/md/dm-thin-metadata.h @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2010-2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_THIN_METADATA_H +#define DM_THIN_METADATA_H + +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-metadata.h" + +#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE + +/* + * The metadata device is currently limited in size. + */ +#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +/*----------------------------------------------------------------*/ + +/* + * Thin metadata superblock flags. + */ +#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) + +struct dm_pool_metadata; +struct dm_thin_device; + +/* + * Device identifier + */ +typedef uint64_t dm_thin_id; + +/* + * Reopens or creates a new, empty metadata volume. + */ +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool format_device); + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd); + +/* + * Compat feature flags. Any incompat flags beyond the ones + * specified below will prevent use of the thin metadata. + */ +#define THIN_FEATURE_COMPAT_SUPP 0UL +#define THIN_FEATURE_COMPAT_RO_SUPP 0UL +#define THIN_FEATURE_INCOMPAT_SUPP 0UL + +/* + * Device creation/deletion. + */ +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev); + +/* + * An internal snapshot. + * + * You can only snapshot a quiesced origin i.e. one that is either + * suspended or not instanced at all. + */ +int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev, + dm_thin_id origin); + +/* + * Deletes a virtual device from the metadata. It _is_ safe to call this + * when that device is open. Operations on that device will just start + * failing. You still need to call close() on the device. + */ +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev); + +/* + * Commits _all_ metadata changes: device creation, deletion, mapping + * updates. + */ +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); + +/* + * Discards all uncommitted changes. Rereads the superblock, rolling back + * to the last good transaction. Thin devices remain open. + * dm_thin_aborted_changes() tells you if they had uncommitted changes. + * + * If this call fails it's only useful to call dm_pool_metadata_close(). + * All other methods will fail with -EINVAL. + */ +int dm_pool_abort_metadata(struct dm_pool_metadata *pmd); + +/* + * Set/get userspace transaction id. + */ +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id); + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result); + +/* + * Hold/get root for userspace transaction. + * + * The metadata snapshot is a copy of the current superblock (minus the + * space maps). Userland can access the data structures for READ + * operations only. A small performance hit is incurred by providing this + * copy of the metadata to userland due to extra copy-on-write operations + * on the metadata nodes. Release this as soon as you finish with it. + */ +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd); +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd); + +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result); + +/* + * Actions on a single virtual device. + */ + +/* + * Opening the same device more than once will fail with -EBUSY. + */ +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td); + +int dm_pool_close_thin_device(struct dm_thin_device *td); + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); + +struct dm_thin_lookup_result { + dm_block_t block; + bool shared:1; +}; + +/* + * Returns: + * -EWOULDBLOCK iff @can_issue_io is set and would issue IO + * -ENODATA iff that mapping is not present. + * 0 success + */ +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_issue_io, struct dm_thin_lookup_result *result); + +/* + * Retrieve the next run of contiguously mapped blocks. Useful for working + * out where to break up IO. Returns 0 on success, < 0 on error. + */ +int dm_thin_find_mapped_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end, + dm_block_t *thin_begin, dm_block_t *thin_end, + dm_block_t *pool_begin, bool *maybe_shared); + +/* + * Obtain an unused block. + */ +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result); + +/* + * Insert or remove block. + */ +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block); + +int dm_thin_remove_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end); + +/* + * Queries. + */ +bool dm_thin_changed_this_transaction(struct dm_thin_device *td); + +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); + +bool dm_thin_aborted_changes(struct dm_thin_device *td); + +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *highest_mapped); + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result); + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); + +int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); + +int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e); +int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e); + +/* + * Returns -ENOSPC if the new size is too small and already allocated + * blocks would be lost. + */ +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); + +/* + * Flicks the underlying block manager into read only mode, so you know + * that nothing is changing. + */ +void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); + +/* + * Updates the superblock immediately. + */ +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); + +/* + * Issue any prefetches that may be useful. + */ +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); + +/* Pre-commit callback */ +typedef int (*dm_pool_pre_commit_fn)(void *context); + +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c new file mode 100644 index 000000000..601f9e4e6 --- /dev/null +++ b/drivers/md/dm-thin.c @@ -0,0 +1,4529 @@ +/* + * Copyright (C) 2011-2012 Red Hat UK. + * + * This file is released under the GPL. + */ + +#include "dm-thin-metadata.h" +#include "dm-bio-prison-v1.h" +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "thin" + +/* + * Tunable constants + */ +#define ENDIO_HOOK_POOL_SIZE 1024 +#define MAPPING_POOL_SIZE 1024 +#define COMMIT_PERIOD HZ +#define NO_SPACE_TIMEOUT_SECS 60 + +static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS; + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, + "A percentage of time allocated for copy on write"); + +/* + * The block size of the device holding pool data must be + * between 64KB and 1GB. + */ +#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) +#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) + +/* + * Device id is restricted to 24 bits. + */ +#define MAX_DEV_ID ((1 << 24) - 1) + +/* + * How do we handle breaking sharing of data blocks? + * ================================================= + * + * We use a standard copy-on-write btree to store the mappings for the + * devices (note I'm talking about copy-on-write of the metadata here, not + * the data). When you take an internal snapshot you clone the root node + * of the origin btree. After this there is no concept of an origin or a + * snapshot. They are just two device trees that happen to point to the + * same data blocks. + * + * When we get a write in we decide if it's to a shared data block using + * some timestamp magic. If it is, we have to break sharing. + * + * Let's say we write to a shared block in what was the origin. The + * steps are: + * + * i) plug io further to this physical block. (see bio_prison code). + * + * ii) quiesce any read io to that shared data block. Obviously + * including all devices that share this block. (see dm_deferred_set code) + * + * iii) copy the data block to a newly allocate block. This step can be + * missed out if the io covers the block. (schedule_copy). + * + * iv) insert the new mapping into the origin's btree + * (process_prepared_mapping). This act of inserting breaks some + * sharing of btree nodes between the two devices. Breaking sharing only + * effects the btree of that specific device. Btrees for the other + * devices that share the block never change. The btree for the origin + * device as it was after the last commit is untouched, ie. we're using + * persistent data structures in the functional programming sense. + * + * v) unplug io to this physical block, including the io that triggered + * the breaking of sharing. + * + * Steps (ii) and (iii) occur in parallel. + * + * The metadata _doesn't_ need to be committed before the io continues. We + * get away with this because the io is always written to a _new_ block. + * If there's a crash, then: + * + * - The origin mapping will point to the old origin block (the shared + * one). This will contain the data as it was before the io that triggered + * the breaking of sharing came in. + * + * - The snap mapping still points to the old block. As it would after + * the commit. + * + * The downside of this scheme is the timestamp magic isn't perfect, and + * will continue to think that data block in the snapshot device is shared + * even after the write to the origin has broken sharing. I suspect data + * blocks will typically be shared by many different devices, so we're + * breaking sharing n + 1 times, rather than n, where n is the number of + * devices that reference this data block. At the moment I think the + * benefits far, far outweigh the disadvantages. + */ + +/*----------------------------------------------------------------*/ + +/* + * Key building. + */ +enum lock_space { + VIRTUAL, + PHYSICAL +}; + +static void build_key(struct dm_thin_device *td, enum lock_space ls, + dm_block_t b, dm_block_t e, struct dm_cell_key *key) +{ + key->virtual = (ls == VIRTUAL); + key->dev = dm_thin_dev_id(td); + key->block_begin = b; + key->block_end = e; +} + +static void build_data_key(struct dm_thin_device *td, dm_block_t b, + struct dm_cell_key *key) +{ + build_key(td, PHYSICAL, b, b + 1llu, key); +} + +static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, + struct dm_cell_key *key) +{ + build_key(td, VIRTUAL, b, b + 1llu, key); +} + +/*----------------------------------------------------------------*/ + +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); +} + +/*----------------------------------------------------------------*/ + +/* + * A pool device ties together a metadata device and a data device. It + * also provides the interface for creating and destroying internal + * devices. + */ +struct dm_thin_new_mapping; + +/* + * The pool runs in various modes. Ordered in degraded order for comparisons. + */ +enum pool_mode { + PM_WRITE, /* metadata may be changed */ + PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ + + /* + * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY. + */ + PM_OUT_OF_METADATA_SPACE, + PM_READ_ONLY, /* metadata may not be changed */ + + PM_FAIL, /* all I/O fails */ +}; + +struct pool_features { + enum pool_mode mode; + + bool zero_new_blocks:1; + bool discard_enabled:1; + bool discard_passdown:1; + bool error_if_no_space:1; +}; + +struct thin_c; +typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); +typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); + +#define CELL_SORT_ARRAY_SIZE 8192 + +struct pool { + struct list_head list; + struct dm_target *ti; /* Only set if a pool target is bound */ + + struct mapped_device *pool_md; + struct block_device *data_dev; + struct block_device *md_dev; + struct dm_pool_metadata *pmd; + + dm_block_t low_water_blocks; + uint32_t sectors_per_block; + int sectors_per_block_shift; + + struct pool_features pf; + bool low_water_triggered:1; /* A dm event has been sent */ + bool suspended:1; + bool out_of_data_space:1; + + struct dm_bio_prison *prison; + struct dm_kcopyd_client *copier; + + struct work_struct worker; + struct workqueue_struct *wq; + struct throttle throttle; + struct delayed_work waker; + struct delayed_work no_space_timeout; + + unsigned long last_commit_jiffies; + unsigned int ref_count; + + spinlock_t lock; + struct bio_list deferred_flush_bios; + struct bio_list deferred_flush_completions; + struct list_head prepared_mappings; + struct list_head prepared_discards; + struct list_head prepared_discards_pt2; + struct list_head active_thins; + + struct dm_deferred_set *shared_read_ds; + struct dm_deferred_set *all_io_ds; + + struct dm_thin_new_mapping *next_mapping; + + process_bio_fn process_bio; + process_bio_fn process_discard; + + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + + process_mapping_fn process_prepared_mapping; + process_mapping_fn process_prepared_discard; + process_mapping_fn process_prepared_discard_pt2; + + struct dm_bio_prison_cell **cell_sort_array; + + mempool_t mapping_pool; +}; + +static void metadata_operation_failed(struct pool *pool, const char *op, int r); + +static enum pool_mode get_pool_mode(struct pool *pool) +{ + return pool->pf.mode; +} + +static void notify_of_pool_mode_change(struct pool *pool) +{ + const char *descs[] = { + "write", + "out-of-data-space", + "read-only", + "read-only", + "fail" + }; + const char *extra_desc = NULL; + enum pool_mode mode = get_pool_mode(pool); + + if (mode == PM_OUT_OF_DATA_SPACE) { + if (!pool->pf.error_if_no_space) + extra_desc = " (queue IO)"; + else + extra_desc = " (error IO)"; + } + + dm_table_event(pool->ti->table); + DMINFO("%s: switching pool to %s%s mode", + dm_device_name(pool->pool_md), + descs[(int)mode], extra_desc ? : ""); +} + +/* + * Target context for a pool. + */ +struct pool_c { + struct dm_target *ti; + struct pool *pool; + struct dm_dev *data_dev; + struct dm_dev *metadata_dev; + + dm_block_t low_water_blocks; + struct pool_features requested_pf; /* Features requested during table load */ + struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ +}; + +/* + * Target context for a thin. + */ +struct thin_c { + struct list_head list; + struct dm_dev *pool_dev; + struct dm_dev *origin_dev; + sector_t origin_size; + dm_thin_id dev_id; + + struct pool *pool; + struct dm_thin_device *td; + struct mapped_device *thin_md; + + bool requeue_mode:1; + spinlock_t lock; + struct list_head deferred_cells; + struct bio_list deferred_bio_list; + struct bio_list retry_on_resume_list; + struct rb_root sort_bio_list; /* sorted list of deferred bios */ + + /* + * Ensures the thin is not destroyed until the worker has finished + * iterating the active_thins list. + */ + refcount_t refcount; + struct completion can_destroy; +}; + +/*----------------------------------------------------------------*/ + +static bool block_size_is_power_of_two(struct pool *pool) +{ + return pool->sectors_per_block_shift >= 0; +} + +static sector_t block_to_sectors(struct pool *pool, dm_block_t b) +{ + return block_size_is_power_of_two(pool) ? + (b << pool->sectors_per_block_shift) : + (b * pool->sectors_per_block); +} + +/*----------------------------------------------------------------*/ + +struct discard_op { + struct thin_c *tc; + struct blk_plug plug; + struct bio *parent_bio; + struct bio *bio; +}; + +static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent) +{ + BUG_ON(!parent); + + op->tc = tc; + blk_start_plug(&op->plug); + op->parent_bio = parent; + op->bio = NULL; +} + +static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) +{ + struct thin_c *tc = op->tc; + sector_t s = block_to_sectors(tc->pool, data_b); + sector_t len = block_to_sectors(tc->pool, data_e - data_b); + + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); +} + +static void end_discard(struct discard_op *op, int r) +{ + if (op->bio) { + /* + * Even if one of the calls to issue_discard failed, we + * need to wait for the chain to complete. + */ + bio_chain(op->bio, op->parent_bio); + bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0); + submit_bio(op->bio); + } + + blk_finish_plug(&op->plug); + + /* + * Even if r is set, there could be sub discards in flight that we + * need to wait for. + */ + if (r && !op->parent_bio->bi_status) + op->parent_bio->bi_status = errno_to_blk_status(r); + bio_endio(op->parent_bio); +} + +/*----------------------------------------------------------------*/ + +/* + * wake_worker() is used when new work is queued and when pool_resume is + * ready to continue deferred IO processing. + */ +static void wake_worker(struct pool *pool) +{ + queue_work(pool->wq, &pool->worker); +} + +/*----------------------------------------------------------------*/ + +static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct dm_bio_prison_cell *cell_prealloc; + + /* + * Allocate a cell from the prison's mempool. + * This might block but it can't fail. + */ + cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO); + + r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result); + if (r) + /* + * We reused an old cell; we can get rid of + * the new one. + */ + dm_bio_prison_free_cell(pool->prison, cell_prealloc); + + return r; +} + +static void cell_release(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_release_no_holder(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release_no_holder(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_error_with_code(struct pool *pool, + struct dm_bio_prison_cell *cell, blk_status_t error_code) +{ + dm_cell_error(pool->prison, cell, error_code); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static blk_status_t get_pool_io_error_code(struct pool *pool) +{ + return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR; +} + +static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, get_pool_io_error_code(pool)); +} + +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE); +} + +/*----------------------------------------------------------------*/ + +/* + * A global list of pools that uses a struct mapped_device as a key. + */ +static struct dm_thin_pool_table { + struct mutex mutex; + struct list_head pools; +} dm_thin_pool_table; + +static void pool_table_init(void) +{ + mutex_init(&dm_thin_pool_table.mutex); + INIT_LIST_HEAD(&dm_thin_pool_table.pools); +} + +static void pool_table_exit(void) +{ + mutex_destroy(&dm_thin_pool_table.mutex); +} + +static void __pool_table_insert(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_add(&pool->list, &dm_thin_pool_table.pools); +} + +static void __pool_table_remove(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_del(&pool->list); +} + +static struct pool *__pool_table_lookup(struct mapped_device *md) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->pool_md == md) { + pool = tmp; + break; + } + } + + return pool; +} + +static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->md_dev == md_dev) { + pool = tmp; + break; + } + } + + return pool; +} + +/*----------------------------------------------------------------*/ + +struct dm_thin_endio_hook { + struct thin_c *tc; + struct dm_deferred_entry *shared_read_entry; + struct dm_deferred_entry *all_io_entry; + struct dm_thin_new_mapping *overwrite_mapping; + struct rb_node rb_node; + struct dm_bio_prison_cell *cell; +}; + +static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) +{ + bio_list_merge(bios, master); + bio_list_init(master); +} + +static void error_bio_list(struct bio_list *bios, blk_status_t error) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bios))) { + bio->bi_status = error; + bio_endio(bio); + } +} + +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, + blk_status_t error) +{ + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&tc->lock); + __merge_bio_list(&bios, master); + spin_unlock_irq(&tc->lock); + + error_bio_list(&bios, error); +} + +static void requeue_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irq(&tc->lock); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irq(&tc->lock); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); +} + +static void requeue_io(struct thin_c *tc) +{ + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&tc->lock); + __merge_bio_list(&bios, &tc->deferred_bio_list); + __merge_bio_list(&bios, &tc->retry_on_resume_list); + spin_unlock_irq(&tc->lock); + + error_bio_list(&bios, BLK_STS_DM_REQUEUE); + requeue_deferred_cells(tc); +} + +static void error_retry_list_with_code(struct pool *pool, blk_status_t error) +{ + struct thin_c *tc; + + rcu_read_lock(); + list_for_each_entry_rcu(tc, &pool->active_thins, list) + error_thin_bio_list(tc, &tc->retry_on_resume_list, error); + rcu_read_unlock(); +} + +static void error_retry_list(struct pool *pool) +{ + error_retry_list_with_code(pool, get_pool_io_error_code(pool)); +} + +/* + * This section of code contains the logic for processing a thin device's IO. + * Much of the code depends on pool object resources (lists, workqueues, etc) + * but most is exclusively called from the thin target rather than the thin-pool + * target. + */ + +static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t block_nr = bio->bi_iter.bi_sector; + + if (block_size_is_power_of_two(pool)) + block_nr >>= pool->sectors_per_block_shift; + else + (void) sector_div(block_nr, pool->sectors_per_block); + + return block_nr; +} + +/* + * Returns the _complete_ blocks that this bio covers. + */ +static void get_bio_block_range(struct thin_c *tc, struct bio *bio, + dm_block_t *begin, dm_block_t *end) +{ + struct pool *pool = tc->pool; + sector_t b = bio->bi_iter.bi_sector; + sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + + b += pool->sectors_per_block - 1ull; /* so we round up */ + + if (block_size_is_power_of_two(pool)) { + b >>= pool->sectors_per_block_shift; + e >>= pool->sectors_per_block_shift; + } else { + (void) sector_div(b, pool->sectors_per_block); + (void) sector_div(e, pool->sectors_per_block); + } + + if (e < b) + /* Can happen if the bio is within a single block. */ + e = b; + + *begin = b; + *end = e; +} + +static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) +{ + struct pool *pool = tc->pool; + sector_t bi_sector = bio->bi_iter.bi_sector; + + bio_set_dev(bio, tc->pool_dev->bdev); + if (block_size_is_power_of_two(pool)) + bio->bi_iter.bi_sector = + (block << pool->sectors_per_block_shift) | + (bi_sector & (pool->sectors_per_block - 1)); + else + bio->bi_iter.bi_sector = (block * pool->sectors_per_block) + + sector_div(bi_sector, pool->sectors_per_block); +} + +static void remap_to_origin(struct thin_c *tc, struct bio *bio) +{ + bio_set_dev(bio, tc->origin_dev->bdev); +} + +static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) +{ + return op_is_flush(bio->bi_opf) && + dm_thin_changed_this_transaction(tc->td); +} + +static void inc_all_io_entry(struct pool *pool, struct bio *bio) +{ + struct dm_thin_endio_hook *h; + + if (bio_op(bio) == REQ_OP_DISCARD) + return; + + h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds); +} + +static void issue(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + if (!bio_triggers_commit(tc, bio)) { + dm_submit_bio_remap(bio, NULL); + return; + } + + /* + * Complete bio with an error if earlier I/O caused changes to + * the metadata that can't be committed e.g, due to I/O errors + * on the metadata device. + */ + if (dm_thin_aborted_changes(tc->td)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in process_deferred_bios(). + */ + spin_lock_irq(&pool->lock); + bio_list_add(&pool->deferred_flush_bios, bio); + spin_unlock_irq(&pool->lock); +} + +static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) +{ + remap_to_origin(tc, bio); + issue(tc, bio); +} + +static void remap_and_issue(struct thin_c *tc, struct bio *bio, + dm_block_t block) +{ + remap(tc, bio, block); + issue(tc, bio); +} + +/*----------------------------------------------------------------*/ + +/* + * Bio endio functions. + */ +struct dm_thin_new_mapping { + struct list_head list; + + bool pass_discard:1; + bool maybe_shared:1; + + /* + * Track quiescing, copying and zeroing preparation actions. When this + * counter hits zero the block is prepared and can be inserted into the + * btree. + */ + atomic_t prepare_actions; + + blk_status_t status; + struct thin_c *tc; + dm_block_t virt_begin, virt_end; + dm_block_t data_block; + struct dm_bio_prison_cell *cell; + + /* + * If the bio covers the whole area of a block then we can avoid + * zeroing or copying. Instead this bio is hooked. The bio will + * still be in the cell, so care has to be taken to avoid issuing + * the bio twice. + */ + struct bio *bio; + bio_end_io_t *saved_bi_end_io; +}; + +static void __complete_mapping_preparation(struct dm_thin_new_mapping *m) +{ + struct pool *pool = m->tc->pool; + + if (atomic_dec_and_test(&m->prepare_actions)) { + list_add_tail(&m->list, &pool->prepared_mappings); + wake_worker(pool); + } +} + +static void complete_mapping_preparation(struct dm_thin_new_mapping *m) +{ + unsigned long flags; + struct pool *pool = m->tc->pool; + + spin_lock_irqsave(&pool->lock, flags); + __complete_mapping_preparation(m); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_thin_new_mapping *m = context; + + m->status = read_err || write_err ? BLK_STS_IOERR : 0; + complete_mapping_preparation(m); +} + +static void overwrite_endio(struct bio *bio) +{ + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct dm_thin_new_mapping *m = h->overwrite_mapping; + + bio->bi_end_io = m->saved_bi_end_io; + + m->status = bio->bi_status; + complete_mapping_preparation(m); +} + +/*----------------------------------------------------------------*/ + +/* + * Workqueue. + */ + +/* + * Prepared mapping jobs. + */ + +/* + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. + */ +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + unsigned long flags; + int has_work; + + spin_lock_irqsave(&tc->lock, flags); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + has_work = !bio_list_empty(&tc->deferred_bio_list); + spin_unlock_irqrestore(&tc->lock, flags); + + if (has_work) + wake_worker(pool); +} + +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); + + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); +} + +static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) +{ + cell_error(m->tc->pool, m->cell); + list_del(&m->list); + mempool_free(m, &m->tc->pool->mapping_pool); +} + +static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + /* + * If the bio has the REQ_FUA flag set we must commit the metadata + * before signaling its completion. + */ + if (!bio_triggers_commit(tc, bio)) { + bio_endio(bio); + return; + } + + /* + * Complete bio with an error if earlier I/O caused changes to the + * metadata that can't be committed, e.g, due to I/O errors on the + * metadata device. + */ + if (dm_thin_aborted_changes(tc->td)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in process_deferred_bios(). + */ + spin_lock_irq(&pool->lock); + bio_list_add(&pool->deferred_flush_completions, bio); + spin_unlock_irq(&pool->lock); +} + +static void process_prepared_mapping(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + struct bio *bio = m->bio; + int r; + + if (m->status) { + cell_error(pool, m->cell); + goto out; + } + + /* + * Commit the prepared block into the mapping btree. + * Any I/O for this block arriving after this point will get + * remapped to it directly. + */ + r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block); + if (r) { + metadata_operation_failed(pool, "dm_thin_insert_block", r); + cell_error(pool, m->cell); + goto out; + } + + /* + * Release any bios held while the block was being provisioned. + * If we are processing a write bio that completely covers the block, + * we already processed it so can ignore it now when processing + * the bios in the cell. + */ + if (bio) { + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + complete_overwrite_bio(tc, bio); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } + +out: + list_del(&m->list); + mempool_free(m, &pool->mapping_pool); +} + +/*----------------------------------------------------------------*/ + +static void free_discard_mapping(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; + if (m->cell) + cell_defer_no_holder(tc, m->cell); + mempool_free(m, &tc->pool->mapping_pool); +} + +static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) +{ + bio_io_error(m->bio); + free_discard_mapping(m); +} + +static void process_prepared_discard_success(struct dm_thin_new_mapping *m) +{ + bio_endio(m->bio); + free_discard_mapping(m); +} + +static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end); + if (r) { + metadata_operation_failed(tc->pool, "dm_thin_remove_range", r); + bio_io_error(m->bio); + } else + bio_endio(m->bio); + + cell_defer_no_holder(tc, m->cell); + mempool_free(m, &tc->pool->mapping_pool); +} + +/*----------------------------------------------------------------*/ + +static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m, + struct bio *discard_parent) +{ + /* + * We've already unmapped this range of blocks, but before we + * passdown we have to check that these blocks are now unused. + */ + int r = 0; + bool shared = true; + struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; + struct discard_op op; + + begin_discard(&op, tc, discard_parent); + while (b != end) { + /* find start of unmapped run */ + for (; b < end; b++) { + r = dm_pool_block_is_shared(pool->pmd, b, &shared); + if (r) + goto out; + + if (!shared) + break; + } + + if (b == end) + break; + + /* find end of run */ + for (e = b + 1; e != end; e++) { + r = dm_pool_block_is_shared(pool->pmd, e, &shared); + if (r) + goto out; + + if (shared) + break; + } + + r = issue_discard(&op, b, e); + if (r) + goto out; + + b = e; + } +out: + end_discard(&op, r); +} + +static void queue_passdown_pt2(struct dm_thin_new_mapping *m) +{ + unsigned long flags; + struct pool *pool = m->tc->pool; + + spin_lock_irqsave(&pool->lock, flags); + list_add_tail(&m->list, &pool->prepared_discards_pt2); + spin_unlock_irqrestore(&pool->lock, flags); + wake_worker(pool); +} + +static void passdown_endio(struct bio *bio) +{ + /* + * It doesn't matter if the passdown discard failed, we still want + * to unmap (we ignore err). + */ + queue_passdown_pt2(bio->bi_private); + bio_put(bio); +} + +static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + struct bio *discard_parent; + dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin); + + /* + * Only this thread allocates blocks, so we can be sure that the + * newly unmapped blocks will not be allocated before the end of + * the function. + */ + r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); + if (r) { + metadata_operation_failed(pool, "dm_thin_remove_range", r); + bio_io_error(m->bio); + cell_defer_no_holder(tc, m->cell); + mempool_free(m, &pool->mapping_pool); + return; + } + + /* + * Increment the unmapped blocks. This prevents a race between the + * passdown io and reallocation of freed blocks. + */ + r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); + if (r) { + metadata_operation_failed(pool, "dm_pool_inc_data_range", r); + bio_io_error(m->bio); + cell_defer_no_holder(tc, m->cell); + mempool_free(m, &pool->mapping_pool); + return; + } + + discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO); + discard_parent->bi_end_io = passdown_endio; + discard_parent->bi_private = m; + if (m->maybe_shared) + passdown_double_checking_shared_status(m, discard_parent); + else { + struct discard_op op; + + begin_discard(&op, tc, discard_parent); + r = issue_discard(&op, m->data_block, data_end); + end_discard(&op, r); + } +} + +static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + + /* + * The passdown has completed, so now we can decrement all those + * unmapped blocks. + */ + r = dm_pool_dec_data_range(pool->pmd, m->data_block, + m->data_block + (m->virt_end - m->virt_begin)); + if (r) { + metadata_operation_failed(pool, "dm_pool_dec_data_range", r); + bio_io_error(m->bio); + } else + bio_endio(m->bio); + + cell_defer_no_holder(tc, m->cell); + mempool_free(m, &pool->mapping_pool); +} + +static void process_prepared(struct pool *pool, struct list_head *head, + process_mapping_fn *fn) +{ + struct list_head maps; + struct dm_thin_new_mapping *m, *tmp; + + INIT_LIST_HEAD(&maps); + spin_lock_irq(&pool->lock); + list_splice_init(head, &maps); + spin_unlock_irq(&pool->lock); + + list_for_each_entry_safe(m, tmp, &maps, list) + (*fn)(m); +} + +/* + * Deferred bio jobs. + */ +static int io_overlaps_block(struct pool *pool, struct bio *bio) +{ + return bio->bi_iter.bi_size == + (pool->sectors_per_block << SECTOR_SHIFT); +} + +static int io_overwrites_block(struct pool *pool, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + io_overlaps_block(pool, bio); +} + +static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, + bio_end_io_t *fn) +{ + *save = bio->bi_end_io; + bio->bi_end_io = fn; +} + +static int ensure_next_mapping(struct pool *pool) +{ + if (pool->next_mapping) + return 0; + + pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC); + + return pool->next_mapping ? 0 : -ENOMEM; +} + +static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) +{ + struct dm_thin_new_mapping *m = pool->next_mapping; + + BUG_ON(!pool->next_mapping); + + memset(m, 0, sizeof(struct dm_thin_new_mapping)); + INIT_LIST_HEAD(&m->list); + m->bio = NULL; + + pool->next_mapping = NULL; + + return m; +} + +static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, + sector_t begin, sector_t end) +{ + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = begin; + to.count = end - begin; + + dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); +} + +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_begin, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_begin); +} + +/* + * A partial copy also needs to zero the uncopied region. + */ +static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, + struct dm_dev *origin, dm_block_t data_origin, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio, + sector_t len) +{ + struct pool *pool = tc->pool; + struct dm_thin_new_mapping *m = get_next_mapping(pool); + + m->tc = tc; + m->virt_begin = virt_block; + m->virt_end = virt_block + 1u; + m->data_block = data_dest; + m->cell = cell; + + /* + * quiesce action + copy action + an extra reference held for the + * duration of this function (we may need to inc later for a + * partial zero). + */ + atomic_set(&m->prepare_actions, 3); + + if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) + complete_mapping_preparation(m); /* already quiesced */ + + /* + * IO to pool_dev remaps to the pool target's data_dev. + * + * If the whole block of data is being overwritten, we can issue the + * bio immediately. Otherwise we use kcopyd to clone the data first. + */ + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { + struct dm_io_region from, to; + + from.bdev = origin->bdev; + from.sector = data_origin * pool->sectors_per_block; + from.count = len; + + to.bdev = tc->pool_dev->bdev; + to.sector = data_dest * pool->sectors_per_block; + to.count = len; + + dm_kcopyd_copy(pool->copier, &from, 1, &to, + 0, copy_complete, m); + + /* + * Do we need to zero a tail region? + */ + if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) { + atomic_inc(&m->prepare_actions); + ll_zero(tc, m, + data_dest * pool->sectors_per_block + len, + (data_dest + 1) * pool->sectors_per_block); + } + } + + complete_mapping_preparation(m); /* drop our ref */ +} + +static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_origin, dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->pool_dev, + data_origin, data_dest, cell, bio, + tc->pool->sectors_per_block); +} + +static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_block, struct dm_bio_prison_cell *cell, + struct bio *bio) +{ + struct pool *pool = tc->pool; + struct dm_thin_new_mapping *m = get_next_mapping(pool); + + atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ + m->tc = tc; + m->virt_begin = virt_block; + m->virt_end = virt_block + 1u; + m->data_block = data_block; + m->cell = cell; + + /* + * If the whole block of data is being overwritten or we are not + * zeroing pre-existing data, we can issue the bio immediately. + * Otherwise we use kcopyd to zero the data first. + */ + if (pool->pf.zero_new_blocks) { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); + else + ll_zero(tc, m, data_block * pool->sectors_per_block, + (data_block + 1) * pool->sectors_per_block); + } else + process_prepared_mapping(m); +} + +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t virt_block_begin = virt_block * pool->sectors_per_block; + sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block; + + if (virt_block_end <= tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + pool->sectors_per_block); + + else if (virt_block_begin < tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + tc->origin_size - virt_block_begin); + + else + schedule_zero(tc, virt_block, data_dest, cell, bio); +} + +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); + +static void requeue_bios(struct pool *pool); + +static bool is_read_only_pool_mode(enum pool_mode mode) +{ + return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY); +} + +static bool is_read_only(struct pool *pool) +{ + return is_read_only_pool_mode(get_pool_mode(pool)); +} + +static void check_for_metadata_space(struct pool *pool) +{ + int r; + const char *ooms_reason = NULL; + dm_block_t nr_free; + + r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free); + if (r) + ooms_reason = "Could not get free metadata blocks"; + else if (!nr_free) + ooms_reason = "No free metadata blocks"; + + if (ooms_reason && !is_read_only(pool)) { + DMERR("%s", ooms_reason); + set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE); + } +} + +static void check_for_data_space(struct pool *pool) +{ + int r; + dm_block_t nr_free; + + if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE) + return; + + r = dm_pool_get_free_block_count(pool->pmd, &nr_free); + if (r) + return; + + if (nr_free) { + set_pool_mode(pool, PM_WRITE); + requeue_bios(pool); + } +} + +/* + * A non-zero return indicates read_only or fail_io mode. + * Many callers don't care about the return value. + */ +static int commit(struct pool *pool) +{ + int r; + + if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) + return -EINVAL; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) + metadata_operation_failed(pool, "dm_pool_commit_metadata", r); + else { + check_for_metadata_space(pool); + check_for_data_space(pool); + } + + return r; +} + +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) +{ + if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { + DMWARN("%s: reached low water mark for data device: sending event.", + dm_device_name(pool->pool_md)); + spin_lock_irq(&pool->lock); + pool->low_water_triggered = true; + spin_unlock_irq(&pool->lock); + dm_table_event(pool->ti->table); + } +} + +static int alloc_data_block(struct thin_c *tc, dm_block_t *result) +{ + int r; + dm_block_t free_blocks; + struct pool *pool = tc->pool; + + if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) + return -EINVAL; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); + return r; + } + + check_low_water_mark(pool, free_blocks); + + if (!free_blocks) { + /* + * Try to commit to see if that will free up some + * more space. + */ + r = commit(pool); + if (r) + return r; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); + return r; + } + + if (!free_blocks) { + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); + return -ENOSPC; + } + } + + r = dm_pool_alloc_data_block(pool->pmd, result); + if (r) { + if (r == -ENOSPC) + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); + else + metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); + return r; + } + + r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r); + return r; + } + + if (!free_blocks) { + /* Let's commit before we use up the metadata reserve. */ + r = commit(pool); + if (r) + return r; + } + + return 0; +} + +/* + * If we have run out of space, queue bios until the device is + * resumed, presumably after having been reloaded with more space. + */ +static void retry_on_resume(struct bio *bio) +{ + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct thin_c *tc = h->tc; + + spin_lock_irq(&tc->lock); + bio_list_add(&tc->retry_on_resume_list, bio); + spin_unlock_irq(&tc->lock); +} + +static blk_status_t should_error_unserviceable_bio(struct pool *pool) +{ + enum pool_mode m = get_pool_mode(pool); + + switch (m) { + case PM_WRITE: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); + return BLK_STS_IOERR; + + case PM_OUT_OF_DATA_SPACE: + return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; + + case PM_OUT_OF_METADATA_SPACE: + case PM_READ_ONLY: + case PM_FAIL: + return BLK_STS_IOERR; + default: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); + return BLK_STS_IOERR; + } +} + +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +{ + blk_status_t error = should_error_unserviceable_bio(pool); + + if (error) { + bio->bi_status = error; + bio_endio(bio); + } else + retry_on_resume(bio); +} + +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + struct bio *bio; + struct bio_list bios; + blk_status_t error; + + error = should_error_unserviceable_bio(pool); + if (error) { + cell_error_with_code(pool, cell, error); + return; + } + + bio_list_init(&bios); + cell_release(pool, cell, &bios); + + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); +} + +static void process_discard_cell_no_passdown(struct thin_c *tc, + struct dm_bio_prison_cell *virt_cell) +{ + struct pool *pool = tc->pool; + struct dm_thin_new_mapping *m = get_next_mapping(pool); + + /* + * We don't need to lock the data blocks, since there's no + * passdown. We only lock data blocks for allocation and breaking sharing. + */ + m->tc = tc; + m->virt_begin = virt_cell->key.block_begin; + m->virt_end = virt_cell->key.block_end; + m->cell = virt_cell; + m->bio = virt_cell->holder; + + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); +} + +static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, + struct bio *bio) +{ + struct pool *pool = tc->pool; + + int r; + bool maybe_shared; + struct dm_cell_key data_key; + struct dm_bio_prison_cell *data_cell; + struct dm_thin_new_mapping *m; + dm_block_t virt_begin, virt_end, data_begin; + + while (begin != end) { + r = ensure_next_mapping(pool); + if (r) + /* we did our best */ + return; + + r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end, + &data_begin, &maybe_shared); + if (r) + /* + * Silently fail, letting any mappings we've + * created complete. + */ + break; + + build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key); + if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { + /* contention, we'll give up with this range */ + begin = virt_end; + continue; + } + + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->maybe_shared = maybe_shared; + m->virt_begin = virt_begin; + m->virt_end = virt_end; + m->data_block = data_begin; + m->cell = data_cell; + m->bio = bio; + + /* + * The parent bio must not complete before sub discard bios are + * chained to it (see end_discard's bio_chain)! + * + * This per-mapping bi_remaining increment is paired with + * the implicit decrement that occurs via bio_endio() in + * end_discard(). + */ + bio_inc_remaining(bio); + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + + begin = virt_end; + } +} + +static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell) +{ + struct bio *bio = virt_cell->holder; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + /* + * The virt_cell will only get freed once the origin bio completes. + * This means it will remain locked while all the individual + * passdown bios are in flight. + */ + h->cell = virt_cell; + break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio); + + /* + * We complete the bio now, knowing that the bi_remaining field + * will prevent completion until the sub range discards have + * completed. + */ + bio_endio(bio); +} + +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + dm_block_t begin, end; + struct dm_cell_key virt_key; + struct dm_bio_prison_cell *virt_cell; + + get_bio_block_range(tc, bio, &begin, &end); + if (begin == end) { + /* + * The discard covers less than a block. + */ + bio_endio(bio); + return; + } + + build_key(tc->td, VIRTUAL, begin, end, &virt_key); + if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) + /* + * Potential starvation issue: We're relying on the + * fs/application being well behaved, and not trying to + * send IO to a region at the same time as discarding it. + * If they do this persistently then it's possible this + * cell will never be granted. + */ + return; + + tc->pool->process_discard_cell(tc, virt_cell); +} + +static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct dm_cell_key *key, + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *cell) +{ + int r; + dm_block_t data_block; + struct pool *pool = tc->pool; + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + schedule_internal_copy(tc, block, lookup_result->block, + data_block, cell, bio); + break; + + case -ENOSPC: + retry_bios_on_resume(pool, cell); + break; + + default: + DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", + __func__, r); + cell_error(pool, cell); + break; + } +} + +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) || + bio_op(bio) == REQ_OP_DISCARD) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + +static void process_shared_bio(struct thin_c *tc, struct bio *bio, + dm_block_t block, + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) +{ + struct dm_bio_prison_cell *data_cell; + struct pool *pool = tc->pool; + struct dm_cell_key key; + + /* + * If cell is already occupied, then sharing is already in the process + * of being broken so we have nothing further to do here. + */ + build_data_key(tc->td, lookup_result->block, &key); + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); + return; + } + + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); + } +} + +static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct dm_bio_prison_cell *cell) +{ + int r; + dm_block_t data_block; + struct pool *pool = tc->pool; + + /* + * Remap empty bios (flushes) immediately, without provisioning. + */ + if (!bio->bi_iter.bi_size) { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + remap_and_issue(tc, bio, 0); + return; + } + + /* + * Fill read bios with zeroes and complete them immediately. + */ + if (bio_data_dir(bio) == READ) { + zero_fill_bio(bio); + cell_defer_no_holder(tc, cell); + bio_endio(bio); + return; + } + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + if (tc->origin_dev) + schedule_external_copy(tc, block, data_block, cell, bio); + else + schedule_zero(tc, block, data_block, cell, bio); + break; + + case -ENOSPC: + retry_bios_on_resume(pool, cell); + break; + + default: + DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", + __func__, r); + cell_error(pool, cell); + break; + } +} + +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + int r; + struct pool *pool = tc->pool; + struct bio *bio = cell->holder; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + + if (tc->requeue_mode) { + cell_requeue(pool, cell); + return; + } + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); + } + break; + + case -ENODATA: + if (bio_data_dir(bio) == READ && tc->origin_dev) { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + if (bio_end_sector(bio) <= tc->origin_size) + remap_to_origin_and_issue(tc, bio); + + else if (bio->bi_iter.bi_sector < tc->origin_size) { + zero_fill_bio(bio); + bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT; + remap_to_origin_and_issue(tc, bio); + + } else { + zero_fill_bio(bio); + bio_endio(bio); + } + } else + provision_block(tc, bio, block, cell); + break; + + default: + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", + __func__, r); + cell_defer_no_holder(tc, cell); + bio_io_error(bio); + break; + } +} + +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) +{ + int r; + int rw = bio_data_dir(bio); + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { + handle_unserviceable_bio(tc->pool, bio); + if (cell) + cell_defer_no_holder(tc, cell); + } else { + inc_all_io_entry(tc->pool, bio); + remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); + } + break; + + case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); + if (rw != READ) { + handle_unserviceable_bio(tc->pool, bio); + break; + } + + if (tc->origin_dev) { + inc_all_io_entry(tc->pool, bio); + remap_to_origin_and_issue(tc, bio); + break; + } + + zero_fill_bio(bio); + bio_endio(bio); + break; + + default: + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", + __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); + bio_io_error(bio); + break; + } +} + +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + +static void process_bio_success(struct thin_c *tc, struct bio *bio) +{ + bio_endio(bio); +} + +static void process_bio_fail(struct thin_c *tc, struct bio *bio) +{ + bio_io_error(bio); +} + +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + +/* + * FIXME: should we also commit due to size of transaction, measured in + * metadata blocks? + */ +static int need_commit_due_to_time(struct pool *pool) +{ + return !time_in_range(jiffies, pool->last_commit_jiffies, + pool->last_commit_jiffies + COMMIT_PERIOD); +} + +#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node) +#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook)) + +static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) +{ + struct rb_node **rbp, *parent; + struct dm_thin_endio_hook *pbd; + sector_t bi_sector = bio->bi_iter.bi_sector; + + rbp = &tc->sort_bio_list.rb_node; + parent = NULL; + while (*rbp) { + parent = *rbp; + pbd = thin_pbd(parent); + + if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + + pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + rb_link_node(&pbd->rb_node, parent, rbp); + rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); +} + +static void __extract_sorted_bios(struct thin_c *tc) +{ + struct rb_node *node; + struct dm_thin_endio_hook *pbd; + struct bio *bio; + + for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { + pbd = thin_pbd(node); + bio = thin_bio(pbd); + + bio_list_add(&tc->deferred_bio_list, bio); + rb_erase(&pbd->rb_node, &tc->sort_bio_list); + } + + WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); +} + +static void __sort_thin_deferred_bios(struct thin_c *tc) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, &tc->deferred_bio_list); + bio_list_init(&tc->deferred_bio_list); + + /* Sort deferred_bio_list using rb-tree */ + while ((bio = bio_list_pop(&bios))) + __thin_bio_rb_add(tc, bio); + + /* + * Transfer the sorted bios in sort_bio_list back to + * deferred_bio_list to allow lockless submission of + * all bios. + */ + __extract_sorted_bios(tc); +} + +static void process_thin_deferred_bios(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + struct bio *bio; + struct bio_list bios; + struct blk_plug plug; + unsigned int count = 0; + + if (tc->requeue_mode) { + error_thin_bio_list(tc, &tc->deferred_bio_list, + BLK_STS_DM_REQUEUE); + return; + } + + bio_list_init(&bios); + + spin_lock_irq(&tc->lock); + + if (bio_list_empty(&tc->deferred_bio_list)) { + spin_unlock_irq(&tc->lock); + return; + } + + __sort_thin_deferred_bios(tc); + + bio_list_merge(&bios, &tc->deferred_bio_list); + bio_list_init(&tc->deferred_bio_list); + + spin_unlock_irq(&tc->lock); + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bios))) { + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + spin_lock_irq(&tc->lock); + bio_list_add(&tc->deferred_bio_list, bio); + bio_list_merge(&tc->deferred_bio_list, &bios); + spin_unlock_irq(&tc->lock); + break; + } + + if (bio_op(bio) == REQ_OP_DISCARD) + pool->process_discard(tc, bio); + else + pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + } + cond_resched(); + } + blk_finish_plug(&plug); +} + +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned int sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned int count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + struct list_head cells; + struct dm_bio_prison_cell *cell; + unsigned int i, j, count; + + INIT_LIST_HEAD(&cells); + + spin_lock_irq(&tc->lock); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irq(&tc->lock); + + if (list_empty(&cells)) + return; + + do { + count = sort_cells(tc->pool, &cells); + + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irq(&tc->lock); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irq(&tc->lock); + return; + } + + if (bio_op(cell->holder) == REQ_OP_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + cond_resched(); + } while (!list_empty(&cells)); +} + +static void thin_get(struct thin_c *tc); +static void thin_put(struct thin_c *tc); + +/* + * We can't hold rcu_read_lock() around code that can block. So we + * find a thin with the rcu lock held; bump a refcount; then drop + * the lock. + */ +static struct thin_c *get_first_thin(struct pool *pool) +{ + struct thin_c *tc = NULL; + + rcu_read_lock(); + if (!list_empty(&pool->active_thins)) { + tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + thin_get(tc); + } + rcu_read_unlock(); + + return tc; +} + +static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) +{ + struct thin_c *old_tc = tc; + + rcu_read_lock(); + list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { + thin_get(tc); + thin_put(old_tc); + rcu_read_unlock(); + return tc; + } + thin_put(old_tc); + rcu_read_unlock(); + + return NULL; +} + +static void process_deferred_bios(struct pool *pool) +{ + struct bio *bio; + struct bio_list bios, bio_completions; + struct thin_c *tc; + + tc = get_first_thin(pool); + while (tc) { + process_thin_deferred_cells(tc); + process_thin_deferred_bios(tc); + tc = get_next_thin(pool, tc); + } + + /* + * If there are any deferred flush bios, we must commit the metadata + * before issuing them or signaling their completion. + */ + bio_list_init(&bios); + bio_list_init(&bio_completions); + + spin_lock_irq(&pool->lock); + bio_list_merge(&bios, &pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_bios); + + bio_list_merge(&bio_completions, &pool->deferred_flush_completions); + bio_list_init(&pool->deferred_flush_completions); + spin_unlock_irq(&pool->lock); + + if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && + !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) + return; + + if (commit(pool)) { + bio_list_merge(&bios, &bio_completions); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + return; + } + pool->last_commit_jiffies = jiffies; + + while ((bio = bio_list_pop(&bio_completions))) + bio_endio(bio); + + while ((bio = bio_list_pop(&bios))) { + /* + * The data device was flushed as part of metadata commit, + * so complete redundant flushes immediately. + */ + if (bio->bi_opf & REQ_PREFLUSH) + bio_endio(bio); + else + dm_submit_bio_remap(bio, NULL); + } +} + +static void do_worker(struct work_struct *ws) +{ + struct pool *pool = container_of(ws, struct pool, worker); + + throttle_work_start(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); + process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); + process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); + process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2); + throttle_work_update(&pool->throttle); + process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); +} + +/* + * We want to commit periodically so that not too much + * unwritten data builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); + wake_worker(pool); + queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); +} + +/* + * We're holding onto IO to allow userland time to react. After the + * timeout either the pool will have been resized (and thus back in + * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space. + */ +static void do_no_space_timeout(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, + no_space_timeout); + + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { + pool->pf.error_if_no_space = true; + notify_of_pool_mode_change(pool); + error_retry_list_with_code(pool, BLK_STS_NOSPC); + } +} + +/*----------------------------------------------------------------*/ + +struct pool_work { + struct work_struct worker; + struct completion complete; +}; + +static struct pool_work *to_pool_work(struct work_struct *ws) +{ + return container_of(ws, struct pool_work, worker); +} + +static void pool_work_complete(struct pool_work *pw) +{ + complete(&pw->complete); +} + +static void pool_work_wait(struct pool_work *pw, struct pool *pool, + void (*fn)(struct work_struct *)) +{ + INIT_WORK_ONSTACK(&pw->worker, fn); + init_completion(&pw->complete); + queue_work(pool->wq, &pw->worker); + wait_for_completion(&pw->complete); +} + +/*----------------------------------------------------------------*/ + +struct noflush_work { + struct pool_work pw; + struct thin_c *tc; +}; + +static struct noflush_work *to_noflush(struct work_struct *ws) +{ + return container_of(to_pool_work(ws), struct noflush_work, pw); +} + +static void do_noflush_start(struct work_struct *ws) +{ + struct noflush_work *w = to_noflush(ws); + w->tc->requeue_mode = true; + requeue_io(w->tc); + pool_work_complete(&w->pw); +} + +static void do_noflush_stop(struct work_struct *ws) +{ + struct noflush_work *w = to_noflush(ws); + w->tc->requeue_mode = false; + pool_work_complete(&w->pw); +} + +static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) +{ + struct noflush_work w; + + w.tc = tc; + pool_work_wait(&w.pw, tc->pool, fn); +} + +/*----------------------------------------------------------------*/ + +static bool passdown_enabled(struct pool_c *pt) +{ + return pt->adjusted_pf.discard_passdown; +} + +static void set_discard_callbacks(struct pool *pool) +{ + struct pool_c *pt = pool->ti->private; + + if (passdown_enabled(pt)) { + pool->process_discard_cell = process_discard_cell_passdown; + pool->process_prepared_discard = process_prepared_discard_passdown_pt1; + pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2; + } else { + pool->process_discard_cell = process_discard_cell_no_passdown; + pool->process_prepared_discard = process_prepared_discard_no_passdown; + } +} + +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) +{ + struct pool_c *pt = pool->ti->private; + bool needs_check = dm_pool_metadata_needs_check(pool->pmd); + enum pool_mode old_mode = get_pool_mode(pool); + unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ; + + /* + * Never allow the pool to transition to PM_WRITE mode if user + * intervention is required to verify metadata and data consistency. + */ + if (new_mode == PM_WRITE && needs_check) { + DMERR("%s: unable to switch pool to write mode until repaired.", + dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + new_mode = old_mode; + else + new_mode = PM_READ_ONLY; + } + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. + */ + if (old_mode == PM_FAIL) + new_mode = old_mode; + + switch (new_mode) { + case PM_FAIL: + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_fail; + pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_fail; + + error_retry_list(pool); + break; + + case PM_OUT_OF_METADATA_SPACE: + case PM_READ_ONLY: + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_success; + + error_retry_list(pool); + break; + + case PM_OUT_OF_DATA_SPACE: + /* + * Ideally we'd never hit this state; the low water mark + * would trigger userland to extend the pool before we + * completely run out of data space. However, many small + * IOs to unprovisioned space can consume data space at an + * alarming rate. Adjust your low water mark if you're + * frequently seeing this mode. + */ + pool->out_of_data_space = true; + pool->process_bio = process_bio_read_only; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_prepared_mapping = process_prepared_mapping; + set_discard_callbacks(pool); + + if (!pool->pf.error_if_no_space && no_space_timeout) + queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); + break; + + case PM_WRITE: + if (old_mode == PM_OUT_OF_DATA_SPACE) + cancel_delayed_work_sync(&pool->no_space_timeout); + pool->out_of_data_space = false; + pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; + dm_pool_metadata_read_write(pool->pmd); + pool->process_bio = process_bio; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_prepared_mapping = process_prepared_mapping; + set_discard_callbacks(pool); + break; + } + + pool->pf.mode = new_mode; + /* + * The pool mode may have changed, sync it so bind_control_target() + * doesn't cause an unexpected mode transition on resume. + */ + pt->adjusted_pf.mode = new_mode; + + if (old_mode != new_mode) + notify_of_pool_mode_change(pool); +} + +static void abort_transaction(struct pool *pool) +{ + const char *dev_name = dm_device_name(pool->pool_md); + + DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); + if (dm_pool_abort_metadata(pool->pmd)) { + DMERR("%s: failed to abort metadata transaction", dev_name); + set_pool_mode(pool, PM_FAIL); + } + + if (dm_pool_metadata_set_needs_check(pool->pmd)) { + DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); + set_pool_mode(pool, PM_FAIL); + } +} + +static void metadata_operation_failed(struct pool *pool, const char *op, int r) +{ + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", + dm_device_name(pool->pool_md), op, r); + + abort_transaction(pool); + set_pool_mode(pool, PM_READ_ONLY); +} + +/*----------------------------------------------------------------*/ + +/* + * Mapping functions. + */ + +/* + * Called only while mapping a thin bio to hand it over to the workqueue. + */ +static void thin_defer_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + spin_lock_irq(&tc->lock); + bio_list_add(&tc->deferred_bio_list, bio); + spin_unlock_irq(&tc->lock); + + wake_worker(pool); +} + +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); +} + +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irq(&tc->lock); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irq(&tc->lock); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + +static void thin_hook_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->tc = tc; + h->shared_read_entry = NULL; + h->all_io_entry = NULL; + h->overwrite_mapping = NULL; + h->cell = NULL; +} + +/* + * Non-blocking function called from the thin target's map function. + */ +static int thin_bio_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct thin_c *tc = ti->private; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_device *td = tc->td; + struct dm_thin_lookup_result result; + struct dm_bio_prison_cell *virt_cell, *data_cell; + struct dm_cell_key key; + + thin_hook_bio(tc, bio); + + if (tc->requeue_mode) { + bio->bi_status = BLK_STS_DM_REQUEUE; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + if (get_pool_mode(tc->pool) == PM_FAIL) { + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) { + thin_defer_bio_with_throttle(tc, bio); + return DM_MAPIO_SUBMITTED; + } + + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &virt_cell)) + return DM_MAPIO_SUBMITTED; + + r = dm_thin_find_block(td, block, 0, &result); + + /* + * Note that we defer readahead too. + */ + switch (r) { + case 0: + if (unlikely(result.shared)) { + /* + * We have a race condition here between the + * result.shared value returned by the lookup and + * snapshot creation, which may cause new + * sharing. + * + * To avoid this always quiesce the origin before + * taking the snap. You want to do this anyway to + * ensure a consistent application view + * (i.e. lockfs). + * + * More distant ancestors are irrelevant. The + * shared flag will be set in their case. + */ + thin_defer_cell(tc, virt_cell); + return DM_MAPIO_SUBMITTED; + } + + build_data_key(tc->td, result.block, &key); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); + return DM_MAPIO_SUBMITTED; + } + + inc_all_io_entry(tc->pool, bio); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); + + remap(tc, bio, result.block); + return DM_MAPIO_REMAPPED; + + case -ENODATA: + case -EWOULDBLOCK: + thin_defer_cell(tc, virt_cell); + return DM_MAPIO_SUBMITTED; + + default: + /* + * Must always call bio_io_error on failure. + * dm_thin_find_block can fail with -EINVAL if the + * pool is switched to fail-io mode. + */ + bio_io_error(bio); + cell_defer_no_holder(tc, virt_cell); + return DM_MAPIO_SUBMITTED; + } +} + +static void requeue_bios(struct pool *pool) +{ + struct thin_c *tc; + + rcu_read_lock(); + list_for_each_entry_rcu(tc, &pool->active_thins, list) { + spin_lock_irq(&tc->lock); + bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); + bio_list_init(&tc->retry_on_resume_list); + spin_unlock_irq(&tc->lock); + } + rcu_read_unlock(); +} + +/*---------------------------------------------------------------- + * Binding of control targets to a pool object + *--------------------------------------------------------------*/ +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + +/* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct pool_c *pt) +{ + struct pool *pool = pt->pool; + struct block_device *data_bdev = pt->data_dev->bdev; + struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + const char *reason = NULL; + + if (!pt->adjusted_pf.discard_passdown) + return; + + if (!bdev_max_discard_sectors(pt->data_dev->bdev)) + reason = "discard unsupported"; + + else if (data_limits->max_discard_sectors < pool->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + if (reason) { + DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason); + pt->adjusted_pf.discard_passdown = false; + } +} + +static int bind_control_target(struct pool *pool, struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + /* + * We want to make sure that a pool in PM_FAIL mode is never upgraded. + */ + enum pool_mode old_mode = get_pool_mode(pool); + enum pool_mode new_mode = pt->adjusted_pf.mode; + + /* + * Don't change the pool's mode until set_pool_mode() below. + * Otherwise the pool's process_* function pointers may + * not match the desired pool mode. + */ + pt->adjusted_pf.mode = old_mode; + + pool->ti = ti; + pool->pf = pt->adjusted_pf; + pool->low_water_blocks = pt->low_water_blocks; + + set_pool_mode(pool, new_mode); + + return 0; +} + +static void unbind_control_target(struct pool *pool, struct dm_target *ti) +{ + if (pool->ti == ti) + pool->ti = NULL; +} + +/*---------------------------------------------------------------- + * Pool creation + *--------------------------------------------------------------*/ +/* Initialize pool features. */ +static void pool_features_init(struct pool_features *pf) +{ + pf->mode = PM_WRITE; + pf->zero_new_blocks = true; + pf->discard_enabled = true; + pf->discard_passdown = true; + pf->error_if_no_space = false; +} + +static void __pool_destroy(struct pool *pool) +{ + __pool_table_remove(pool); + + vfree(pool->cell_sort_array); + if (dm_pool_metadata_close(pool->pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + dm_bio_prison_destroy(pool->prison); + dm_kcopyd_client_destroy(pool->copier); + + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); + if (pool->wq) + destroy_workqueue(pool->wq); + + if (pool->next_mapping) + mempool_free(pool->next_mapping, &pool->mapping_pool); + mempool_exit(&pool->mapping_pool); + dm_deferred_set_destroy(pool->shared_read_ds); + dm_deferred_set_destroy(pool->all_io_ds); + kfree(pool); +} + +static struct kmem_cache *_new_mapping_cache; + +static struct pool *pool_create(struct mapped_device *pool_md, + struct block_device *metadata_dev, + struct block_device *data_dev, + unsigned long block_size, + int read_only, char **error) +{ + int r; + void *err_p; + struct pool *pool; + struct dm_pool_metadata *pmd; + bool format_device = read_only ? false : true; + + pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); + if (IS_ERR(pmd)) { + *error = "Error creating metadata object"; + return (struct pool *)pmd; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + *error = "Error allocating memory for pool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_pool; + } + + pool->pmd = pmd; + pool->sectors_per_block = block_size; + if (block_size & (block_size - 1)) + pool->sectors_per_block_shift = -1; + else + pool->sectors_per_block_shift = __ffs(block_size); + pool->low_water_blocks = 0; + pool_features_init(&pool->pf); + pool->prison = dm_bio_prison_create(); + if (!pool->prison) { + *error = "Error creating pool's bio prison"; + err_p = ERR_PTR(-ENOMEM); + goto bad_prison; + } + + pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(pool->copier)) { + r = PTR_ERR(pool->copier); + *error = "Error creating pool's kcopyd client"; + err_p = ERR_PTR(r); + goto bad_kcopyd_client; + } + + /* + * Create singlethreaded workqueue that will service all devices + * that use this metadata. + */ + pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!pool->wq) { + *error = "Error creating pool's workqueue"; + err_p = ERR_PTR(-ENOMEM); + goto bad_wq; + } + + throttle_init(&pool->throttle); + INIT_WORK(&pool->worker, do_worker); + INIT_DELAYED_WORK(&pool->waker, do_waker); + INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); + spin_lock_init(&pool->lock); + bio_list_init(&pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_completions); + INIT_LIST_HEAD(&pool->prepared_mappings); + INIT_LIST_HEAD(&pool->prepared_discards); + INIT_LIST_HEAD(&pool->prepared_discards_pt2); + INIT_LIST_HEAD(&pool->active_thins); + pool->low_water_triggered = false; + pool->suspended = true; + pool->out_of_data_space = false; + + pool->shared_read_ds = dm_deferred_set_create(); + if (!pool->shared_read_ds) { + *error = "Error creating pool's shared read deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_shared_read_ds; + } + + pool->all_io_ds = dm_deferred_set_create(); + if (!pool->all_io_ds) { + *error = "Error creating pool's all io deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_all_io_ds; + } + + pool->next_mapping = NULL; + r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE, + _new_mapping_cache); + if (r) { + *error = "Error creating pool's mapping mempool"; + err_p = ERR_PTR(r); + goto bad_mapping_pool; + } + + pool->cell_sort_array = + vmalloc(array_size(CELL_SORT_ARRAY_SIZE, + sizeof(*pool->cell_sort_array))); + if (!pool->cell_sort_array) { + *error = "Error allocating cell sort array"; + err_p = ERR_PTR(-ENOMEM); + goto bad_sort_array; + } + + pool->ref_count = 1; + pool->last_commit_jiffies = jiffies; + pool->pool_md = pool_md; + pool->md_dev = metadata_dev; + pool->data_dev = data_dev; + __pool_table_insert(pool); + + return pool; + +bad_sort_array: + mempool_exit(&pool->mapping_pool); +bad_mapping_pool: + dm_deferred_set_destroy(pool->all_io_ds); +bad_all_io_ds: + dm_deferred_set_destroy(pool->shared_read_ds); +bad_shared_read_ds: + destroy_workqueue(pool->wq); +bad_wq: + dm_kcopyd_client_destroy(pool->copier); +bad_kcopyd_client: + dm_bio_prison_destroy(pool->prison); +bad_prison: + kfree(pool); +bad_pool: + if (dm_pool_metadata_close(pmd)) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + return err_p; +} + +static void __pool_inc(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + pool->ref_count++; +} + +static void __pool_dec(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + BUG_ON(!pool->ref_count); + if (!--pool->ref_count) + __pool_destroy(pool); +} + +static struct pool *__pool_find(struct mapped_device *pool_md, + struct block_device *metadata_dev, + struct block_device *data_dev, + unsigned long block_size, int read_only, + char **error, int *created) +{ + struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); + + if (pool) { + if (pool->pool_md != pool_md) { + *error = "metadata device already in use by a pool"; + return ERR_PTR(-EBUSY); + } + if (pool->data_dev != data_dev) { + *error = "data device already in use by a pool"; + return ERR_PTR(-EBUSY); + } + __pool_inc(pool); + + } else { + pool = __pool_table_lookup(pool_md); + if (pool) { + if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) { + *error = "different pool cannot replace a pool"; + return ERR_PTR(-EINVAL); + } + __pool_inc(pool); + + } else { + pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error); + *created = 1; + } + } + + return pool; +} + +/*---------------------------------------------------------------- + * Pool target methods + *--------------------------------------------------------------*/ +static void pool_dtr(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + mutex_lock(&dm_thin_pool_table.mutex); + + unbind_control_target(pt->pool, ti); + __pool_dec(pt->pool); + dm_put_device(ti, pt->metadata_dev); + dm_put_device(ti, pt->data_dev); + kfree(pt); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, + struct dm_target *ti) +{ + int r; + unsigned int argc; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, 4, "Invalid number of pool feature arguments"}, + }; + + /* + * No feature arguments supplied. + */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + while (argc && !r) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "skip_block_zeroing")) + pf->zero_new_blocks = false; + + else if (!strcasecmp(arg_name, "ignore_discard")) + pf->discard_enabled = false; + + else if (!strcasecmp(arg_name, "no_discard_passdown")) + pf->discard_passdown = false; + + else if (!strcasecmp(arg_name, "read_only")) + pf->mode = PM_READ_ONLY; + + else if (!strcasecmp(arg_name, "error_if_no_space")) + pf->error_if_no_space = true; + + else { + ti->error = "Unrecognised pool feature requested"; + r = -EINVAL; + break; + } + } + + return r; +} + +static void metadata_low_callback(void *context) +{ + struct pool *pool = context; + + DMWARN("%s: reached low water mark for metadata device: sending event.", + dm_device_name(pool->pool_md)); + + dm_table_event(pool->ti->table); +} + +/* + * We need to flush the data device **before** committing the metadata. + * + * This ensures that the data blocks of any newly inserted mappings are + * properly written to non-volatile storage and won't be lost in case of a + * crash. + * + * Failure to do so can result in data corruption in the case of internal or + * external snapshots and in the case of newly provisioned blocks, when block + * zeroing is enabled. + */ +static int metadata_pre_commit_callback(void *context) +{ + struct pool *pool = context; + + return blkdev_issue_flush(pool->data_dev); +} + +static sector_t get_dev_size(struct block_device *bdev) +{ + return bdev_nr_sectors(bdev); +} + +static void warn_if_metadata_device_too_big(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_dev_size(bdev); + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", + bdev, THIN_METADATA_MAX_SECTORS); +} + +static sector_t get_metadata_dev_size(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_dev_size(bdev); + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) + metadata_dev_size = THIN_METADATA_MAX_SECTORS; + + return metadata_dev_size; +} + +static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_metadata_dev_size(bdev); + + sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); + + return metadata_dev_size; +} + +/* + * When a metadata threshold is crossed a dm event is triggered, and + * userland should respond by growing the metadata device. We could let + * userland set the threshold, like we do with the data threshold, but I'm + * not sure they know enough to do this well. + */ +static dm_block_t calc_metadata_threshold(struct pool_c *pt) +{ + /* + * 4M is ample for all ops with the possible exception of thin + * device deletion which is harmless if it fails (just retry the + * delete after you've grown the device). + */ + dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; + return min((dm_block_t)1024ULL /* 4M */, quarter); +} + +/* + * thin-pool + * + * + * [<#feature args> []*] + * + * Optional feature arguments are: + * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * ignore_discard: disable discard + * no_discard_passdown: don't pass discards down to the data device + * read_only: Don't allow any changes to be made to the pool metadata. + * error_if_no_space: error IOs, instead of queueing, if no space. + */ +static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r, pool_created = 0; + struct pool_c *pt; + struct pool *pool; + struct pool_features pf; + struct dm_arg_set as; + struct dm_dev *data_dev; + unsigned long block_size; + dm_block_t low_water_blocks; + struct dm_dev *metadata_dev; + fmode_t metadata_mode; + + /* + * FIXME Remove validation from scope of lock. + */ + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc < 4) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + + as.argc = argc; + as.argv = argv; + + /* make sure metadata and data are different devices */ + if (!strcmp(argv[0], argv[1])) { + ti->error = "Error setting metadata or data device"; + r = -EINVAL; + goto out_unlock; + } + + /* + * Set default pool features. + */ + pool_features_init(&pf); + + dm_consume_args(&as, 4); + r = parse_pool_features(&as, &pf, ti); + if (r) + goto out_unlock; + + metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); + if (r) { + ti->error = "Error opening metadata block device"; + goto out_unlock; + } + warn_if_metadata_device_too_big(metadata_dev->bdev); + + r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); + if (r) { + ti->error = "Error getting data device"; + goto out_metadata; + } + + if (kstrtoul(argv[2], 10, &block_size) || !block_size || + block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || + block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || + block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { + ti->error = "Invalid block size"; + r = -EINVAL; + goto out; + } + + if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { + ti->error = "Invalid low water mark"; + r = -EINVAL; + goto out; + } + + pt = kzalloc(sizeof(*pt), GFP_KERNEL); + if (!pt) { + r = -ENOMEM; + goto out; + } + + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev, + block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); + if (IS_ERR(pool)) { + r = PTR_ERR(pool); + goto out_free_pt; + } + + /* + * 'pool_created' reflects whether this is the first table load. + * Top level discard support is not allowed to be changed after + * initial load. This would require a pool reload to trigger thin + * device changes. + */ + if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { + ti->error = "Discard support cannot be disabled once enabled"; + r = -EINVAL; + goto out_flags_changed; + } + + pt->pool = pool; + pt->ti = ti; + pt->metadata_dev = metadata_dev; + pt->data_dev = data_dev; + pt->low_water_blocks = low_water_blocks; + pt->adjusted_pf = pt->requested_pf = pf; + ti->num_flush_bios = 1; + ti->limit_swap_bios = true; + + /* + * Only need to enable discards if the pool should pass + * them down to the data device. The thin device's discard + * processing will cause mappings to be removed from the btree. + */ + if (pf.discard_enabled && pf.discard_passdown) { + ti->num_discard_bios = 1; + + /* + * Setting 'discards_supported' circumvents the normal + * stacking of discard limits (this keeps the pool and + * thin devices' discard limits consistent). + */ + ti->discards_supported = true; + } + ti->private = pt; + + r = dm_pool_register_metadata_threshold(pt->pool->pmd, + calc_metadata_threshold(pt), + metadata_low_callback, + pool); + if (r) { + ti->error = "Error registering metadata threshold"; + goto out_flags_changed; + } + + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pool); + + mutex_unlock(&dm_thin_pool_table.mutex); + + return 0; + +out_flags_changed: + __pool_dec(pool); +out_free_pt: + kfree(pt); +out: + dm_put_device(ti, data_dev); +out_metadata: + dm_put_device(ti, metadata_dev); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int pool_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * As this is a singleton target, ti->begin is always zero. + */ + spin_lock_irq(&pool->lock); + bio_set_dev(bio, pt->data_dev->bdev); + r = DM_MAPIO_REMAPPED; + spin_unlock_irq(&pool->lock); + + return r; +} + +static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + sector_t data_size = ti->len; + dm_block_t sb_data_size; + + *need_commit = false; + + (void) sector_div(data_size, pool->sectors_per_block); + + r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); + if (r) { + DMERR("%s: failed to retrieve data device size", + dm_device_name(pool->pool_md)); + return r; + } + + if (data_size < sb_data_size) { + DMERR("%s: pool target (%llu blocks) too small: expected %llu", + dm_device_name(pool->pool_md), + (unsigned long long)data_size, sb_data_size); + return -EINVAL; + + } else if (data_size > sb_data_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the data device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + + if (sb_data_size) + DMINFO("%s: growing the data device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_data_size, (unsigned long long)data_size); + r = dm_pool_resize_data_dev(pool->pmd, data_size); + if (r) { + metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); + return r; + } + + *need_commit = true; + } + + return 0; +} + +static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + dm_block_t metadata_dev_size, sb_metadata_dev_size; + + *need_commit = false; + + metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); + + r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); + if (r) { + DMERR("%s: failed to retrieve metadata device size", + dm_device_name(pool->pool_md)); + return r; + } + + if (metadata_dev_size < sb_metadata_dev_size) { + DMERR("%s: metadata device (%llu blocks) too small: expected %llu", + dm_device_name(pool->pool_md), + metadata_dev_size, sb_metadata_dev_size); + return -EINVAL; + + } else if (metadata_dev_size > sb_metadata_dev_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the metadata device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + + warn_if_metadata_device_too_big(pool->md_dev); + DMINFO("%s: growing the metadata device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_metadata_dev_size, metadata_dev_size); + + if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE) + set_pool_mode(pool, PM_WRITE); + + r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); + if (r) { + metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); + return r; + } + + *need_commit = true; + } + + return 0; +} + +/* + * Retrieves the number of blocks of the data device from + * the superblock and compares it to the actual device size, + * thus resizing the data device in case it has grown. + * + * This both copes with opening preallocated data devices in the ctr + * being followed by a resume + * -and- + * calling the resume method individually after userspace has + * grown the data device in reaction to a table event. + */ +static int pool_preresume(struct dm_target *ti) +{ + int r; + bool need_commit1, need_commit2; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * Take control of the pool object. + */ + r = bind_control_target(pool, ti); + if (r) + goto out; + + r = maybe_resize_data_dev(ti, &need_commit1); + if (r) + goto out; + + r = maybe_resize_metadata_dev(ti, &need_commit2); + if (r) + goto out; + + if (need_commit1 || need_commit2) + (void) commit(pool); +out: + /* + * When a thin-pool is PM_FAIL, it cannot be rebuilt if + * bio is in deferred list. Therefore need to return 0 + * to allow pool_resume() to flush IO. + */ + if (r && get_pool_mode(pool) == PM_FAIL) + r = 0; + + return r; +} + +static void pool_suspend_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Suspend all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_suspend_noflush(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + +static void pool_resume_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Resume all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_resume(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + +static void pool_resume(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * Must requeue active_thins' bios and then resume + * active_thins _before_ clearing 'suspend' flag. + */ + requeue_bios(pool); + pool_resume_active_thins(pool); + + spin_lock_irq(&pool->lock); + pool->low_water_triggered = false; + pool->suspended = false; + spin_unlock_irq(&pool->lock); + + do_waker(&pool->waker.work); +} + +static void pool_presuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + spin_lock_irq(&pool->lock); + pool->suspended = true; + spin_unlock_irq(&pool->lock); + + pool_suspend_active_thins(pool); +} + +static void pool_presuspend_undo(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + pool_resume_active_thins(pool); + + spin_lock_irq(&pool->lock); + pool->suspended = false; + spin_unlock_irq(&pool->lock); +} + +static void pool_postsuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); + flush_workqueue(pool->wq); + (void) commit(pool); +} + +static int check_arg_count(unsigned int argc, unsigned int args_required) +{ + if (argc != args_required) { + DMWARN("Message received with %u arguments instead of %u.", + argc, args_required); + return -EINVAL; + } + + return 0; +} + +static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) +{ + if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && + *dev_id <= MAX_DEV_ID) + return 0; + + if (warning) + DMWARN("Message received with invalid device id: %s", arg); + + return -EINVAL; +} + +static int process_create_thin_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_create_thin(pool->pmd, dev_id); + if (r) { + DMWARN("Creation of new thinly-provisioned device with id %s failed.", + argv[1]); + return r; + } + + return 0; +} + +static int process_create_snap_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + dm_thin_id origin_dev_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = read_dev_id(argv[2], &origin_dev_id, 1); + if (r) + return r; + + r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); + if (r) { + DMWARN("Creation of new snapshot %s of device %s failed.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +static int process_delete_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_delete_thin_device(pool->pmd, dev_id); + if (r) + DMWARN("Deletion of thin device %s failed.", argv[1]); + + return r; +} + +static int process_set_transaction_id_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + dm_thin_id old_id, new_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { + DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); + return -EINVAL; + } + + if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { + DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); + return -EINVAL; + } + + r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); + if (r) { + DMWARN("Failed to change transaction id from %s to %s.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +static int process_reserve_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + (void) commit(pool); + + r = dm_pool_reserve_metadata_snap(pool->pmd); + if (r) + DMWARN("reserve_metadata_snap message failed."); + + return r; +} + +static int process_release_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_release_metadata_snap(pool->pmd); + if (r) + DMWARN("release_metadata_snap message failed."); + + return r; +} + +/* + * Messages supported: + * create_thin + * create_snap + * delete + * set_transaction_id + * reserve_metadata_snap + * release_metadata_snap + */ +static int pool_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r = -EINVAL; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) { + DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", + dm_device_name(pool->pool_md)); + return -EOPNOTSUPP; + } + + if (!strcasecmp(argv[0], "create_thin")) + r = process_create_thin_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "create_snap")) + r = process_create_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "delete")) + r = process_delete_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "set_transaction_id")) + r = process_set_transaction_id_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "reserve_metadata_snap")) + r = process_reserve_metadata_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "release_metadata_snap")) + r = process_release_metadata_snap_mesg(argc, argv, pool); + + else + DMWARN("Unrecognised thin pool target message received: %s", argv[0]); + + if (!r) + (void) commit(pool); + + return r; +} + +static void emit_flags(struct pool_features *pf, char *result, + unsigned int sz, unsigned int maxlen) +{ + unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled + + !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + + pf->error_if_no_space; + DMEMIT("%u ", count); + + if (!pf->zero_new_blocks) + DMEMIT("skip_block_zeroing "); + + if (!pf->discard_enabled) + DMEMIT("ignore_discard "); + + if (!pf->discard_passdown) + DMEMIT("no_discard_passdown "); + + if (pf->mode == PM_READ_ONLY) + DMEMIT("read_only "); + + if (pf->error_if_no_space) + DMEMIT("error_if_no_space "); +} + +/* + * Status line is: + * / + * / + * + */ +static void pool_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + int r; + unsigned int sz = 0; + uint64_t transaction_id; + dm_block_t nr_free_blocks_data; + dm_block_t nr_free_blocks_metadata; + dm_block_t nr_blocks_data; + dm_block_t nr_blocks_metadata; + dm_block_t held_root; + enum pool_mode mode; + char buf[BDEVNAME_SIZE]; + char buf2[BDEVNAME_SIZE]; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + switch (type) { + case STATUSTYPE_INFO: + if (get_pool_mode(pool) == PM_FAIL) { + DMEMIT("Fail"); + break; + } + + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) + (void) commit(pool); + + r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); + if (r) { + DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); + if (r) { + DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); + if (r) { + DMERR("%s: dm_pool_get_metadata_dev_size returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); + if (r) { + DMERR("%s: dm_pool_get_free_block_count returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); + if (r) { + DMERR("%s: dm_pool_get_data_dev_size returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_metadata_snap(pool->pmd, &held_root); + if (r) { + DMERR("%s: dm_pool_get_metadata_snap returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + DMEMIT("%llu %llu/%llu %llu/%llu ", + (unsigned long long)transaction_id, + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + (unsigned long long)(nr_blocks_data - nr_free_blocks_data), + (unsigned long long)nr_blocks_data); + + if (held_root) + DMEMIT("%llu ", held_root); + else + DMEMIT("- "); + + mode = get_pool_mode(pool); + if (mode == PM_OUT_OF_DATA_SPACE) + DMEMIT("out_of_data_space "); + else if (is_read_only_pool_mode(mode)) + DMEMIT("ro "); + else + DMEMIT("rw "); + + if (!pool->pf.discard_enabled) + DMEMIT("ignore_discard "); + else if (pool->pf.discard_passdown) + DMEMIT("discard_passdown "); + else + DMEMIT("no_discard_passdown "); + + if (pool->pf.error_if_no_space) + DMEMIT("error_if_no_space "); + else + DMEMIT("queue_if_no_space "); + + if (dm_pool_metadata_needs_check(pool->pmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + + DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt)); + + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %s %lu %llu ", + format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), + format_dev_t(buf2, pt->data_dev->bdev->bd_dev), + (unsigned long)pool->sectors_per_block, + (unsigned long long)pt->low_water_blocks); + emit_flags(&pt->requested_pf, result, sz, maxlen); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + return; + +err: + DMEMIT("Error"); +} + +static int pool_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct pool_c *pt = ti->private; + + return fn(ti, pt->data_dev, 0, ti->len, data); +} + +static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If max_sectors is smaller than pool->sectors_per_block adjust it + * to the highest possible power-of-2 factor of pool->sectors_per_block. + * This is especially beneficial when the pool's data device is a RAID + * device that has a full stripe width that matches pool->sectors_per_block + * -- because even though partial RAID stripe-sized IOs will be issued to a + * single RAID stripe; when aggregated they will end on a full RAID stripe + * boundary.. which avoids additional partial RAID stripe writes cascading + */ + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } + + /* + * If the system-determined stacked limits are compatible with the + * pool's blocksize (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < pool->sectors_per_block || + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + } + + /* + * pt->adjusted_pf is a staging area for the actual features to use. + * They get transferred to the live pool in bind_control_target() + * called from pool_preresume(). + */ + if (!pt->adjusted_pf.discard_enabled) { + /* + * Must explicitly disallow stacking discard limits otherwise the + * block layer will stack them if pool's data device has support. + */ + limits->discard_granularity = 0; + return; + } + + disable_passdown_if_not_supported(pt); + + /* + * The pool uses the same discard limits as the underlying data + * device. DM core has already set this up. + */ +} + +static struct target_type pool_target = { + .name = "thin-pool", + .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | + DM_TARGET_IMMUTABLE, + .version = {1, 22, 0}, + .module = THIS_MODULE, + .ctr = pool_ctr, + .dtr = pool_dtr, + .map = pool_map, + .presuspend = pool_presuspend, + .presuspend_undo = pool_presuspend_undo, + .postsuspend = pool_postsuspend, + .preresume = pool_preresume, + .resume = pool_resume, + .message = pool_message, + .status = pool_status, + .iterate_devices = pool_iterate_devices, + .io_hints = pool_io_hints, +}; + +/*---------------------------------------------------------------- + * Thin target methods + *--------------------------------------------------------------*/ +static void thin_get(struct thin_c *tc) +{ + refcount_inc(&tc->refcount); +} + +static void thin_put(struct thin_c *tc) +{ + if (refcount_dec_and_test(&tc->refcount)) + complete(&tc->can_destroy); +} + +static void thin_dtr(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + spin_lock_irq(&tc->pool->lock); + list_del_rcu(&tc->list); + spin_unlock_irq(&tc->pool->lock); + synchronize_rcu(); + + thin_put(tc); + wait_for_completion(&tc->can_destroy); + + mutex_lock(&dm_thin_pool_table.mutex); + + __pool_dec(tc->pool); + dm_pool_close_thin_device(tc->td); + dm_put_device(ti, tc->pool_dev); + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); + kfree(tc); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +/* + * Thin target parameters: + * + * [origin_dev] + * + * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) + * dev_id: the internal device identifier + * origin_dev: a device external to the pool that should act as the origin + * + * If the pool device has discards disabled, they get disabled for the thin + * device as well. + */ +static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct thin_c *tc; + struct dm_dev *pool_dev, *origin_dev; + struct mapped_device *pool_md; + + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc != 2 && argc != 3) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + + tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); + if (!tc) { + ti->error = "Out of memory"; + r = -ENOMEM; + goto out_unlock; + } + tc->thin_md = dm_table_get_md(ti->table); + spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); + bio_list_init(&tc->deferred_bio_list); + bio_list_init(&tc->retry_on_resume_list); + tc->sort_bio_list = RB_ROOT; + + if (argc == 3) { + if (!strcmp(argv[0], argv[2])) { + ti->error = "Error setting origin device"; + r = -EINVAL; + goto bad_origin_dev; + } + + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + if (r) { + ti->error = "Error opening origin device"; + goto bad_origin_dev; + } + tc->origin_dev = origin_dev; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); + if (r) { + ti->error = "Error opening pool device"; + goto bad_pool_dev; + } + tc->pool_dev = pool_dev; + + if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { + ti->error = "Invalid device id"; + r = -EINVAL; + goto bad_common; + } + + pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); + if (!pool_md) { + ti->error = "Couldn't get pool mapped device"; + r = -EINVAL; + goto bad_common; + } + + tc->pool = __pool_table_lookup(pool_md); + if (!tc->pool) { + ti->error = "Couldn't find pool object"; + r = -EINVAL; + goto bad_pool_lookup; + } + __pool_inc(tc->pool); + + if (get_pool_mode(tc->pool) == PM_FAIL) { + ti->error = "Couldn't open thin device, Pool is in fail mode"; + r = -EINVAL; + goto bad_pool; + } + + r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); + if (r) { + ti->error = "Couldn't open thin internal device"; + goto bad_pool; + } + + r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); + if (r) + goto bad; + + ti->num_flush_bios = 1; + ti->limit_swap_bios = true; + ti->flush_supported = true; + ti->accounts_remapped_io = true; + ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); + + /* In case the pool supports discards, pass them on. */ + if (tc->pool->pf.discard_enabled) { + ti->discards_supported = true; + ti->num_discard_bios = 1; + } + + mutex_unlock(&dm_thin_pool_table.mutex); + + spin_lock_irq(&tc->pool->lock); + if (tc->pool->suspended) { + spin_unlock_irq(&tc->pool->lock); + mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ + ti->error = "Unable to activate thin device while pool is suspended"; + r = -EINVAL; + goto bad; + } + refcount_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + list_add_tail_rcu(&tc->list, &tc->pool->active_thins); + spin_unlock_irq(&tc->pool->lock); + /* + * This synchronize_rcu() call is needed here otherwise we risk a + * wake_worker() call finding no bios to process (because the newly + * added tc isn't yet visible). So this reduces latency since we + * aren't then dependent on the periodic commit to wake_worker(). + */ + synchronize_rcu(); + + dm_put(pool_md); + + return 0; + +bad: + dm_pool_close_thin_device(tc->td); +bad_pool: + __pool_dec(tc->pool); +bad_pool_lookup: + dm_put(pool_md); +bad_common: + dm_put_device(ti, tc->pool_dev); +bad_pool_dev: + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); +bad_origin_dev: + kfree(tc); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int thin_map(struct dm_target *ti, struct bio *bio) +{ + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + + return thin_bio_map(ti, bio); +} + +static int thin_endio(struct dm_target *ti, struct bio *bio, + blk_status_t *err) +{ + unsigned long flags; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct list_head work; + struct dm_thin_new_mapping *m, *tmp; + struct pool *pool = h->tc->pool; + + if (h->shared_read_entry) { + INIT_LIST_HEAD(&work); + dm_deferred_entry_dec(h->shared_read_entry, &work); + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) { + list_del(&m->list); + __complete_mapping_preparation(m); + } + spin_unlock_irqrestore(&pool->lock, flags); + } + + if (h->all_io_entry) { + INIT_LIST_HEAD(&work); + dm_deferred_entry_dec(h->all_io_entry, &work); + if (!list_empty(&work)) { + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) + list_add_tail(&m->list, &pool->prepared_discards); + spin_unlock_irqrestore(&pool->lock, flags); + wake_worker(pool); + } + } + + if (h->cell) + cell_defer_no_holder(h->tc, h->cell); + + return DM_ENDIO_DONE; +} + +static void thin_presuspend(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (dm_noflush_suspending(ti)) + noflush_work(tc, do_noflush_start); +} + +static void thin_postsuspend(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + /* + * The dm_noflush_suspending flag has been cleared by now, so + * unfortunately we must always run this. + */ + noflush_work(tc, do_noflush_stop); +} + +static int thin_preresume(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (tc->origin_dev) + tc->origin_size = get_dev_size(tc->origin_dev->bdev); + + return 0; +} + +/* + * + */ +static void thin_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + int r; + ssize_t sz = 0; + dm_block_t mapped, highest; + char buf[BDEVNAME_SIZE]; + struct thin_c *tc = ti->private; + + if (get_pool_mode(tc->pool) == PM_FAIL) { + DMEMIT("Fail"); + return; + } + + if (!tc->td) + DMEMIT("-"); + else { + switch (type) { + case STATUSTYPE_INFO: + r = dm_thin_get_mapped_count(tc->td, &mapped); + if (r) { + DMERR("dm_thin_get_mapped_count returned %d", r); + goto err; + } + + r = dm_thin_get_highest_mapped_block(tc->td, &highest); + if (r < 0) { + DMERR("dm_thin_get_highest_mapped_block returned %d", r); + goto err; + } + + DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); + if (r) + DMEMIT("%llu", ((highest + 1) * + tc->pool->sectors_per_block) - 1); + else + DMEMIT("-"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %lu", + format_dev_t(buf, tc->pool_dev->bdev->bd_dev), + (unsigned long) tc->dev_id); + if (tc->origin_dev) + DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + } + + return; + +err: + DMEMIT("Error"); +} + +static int thin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + sector_t blocks; + struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; + + /* + * We can't call dm_pool_get_data_dev_size() since that blocks. So + * we follow a more convoluted path through to the pool's target. + */ + if (!pool->ti) + return 0; /* nothing is bound */ + + blocks = pool->ti->len; + (void) sector_div(blocks, pool->sectors_per_block); + if (blocks) + return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); + + return 0; +} + +static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; + + if (!pool->pf.discard_enabled) + return; + + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */ +} + +static struct target_type thin_target = { + .name = "thin", + .version = {1, 22, 0}, + .module = THIS_MODULE, + .ctr = thin_ctr, + .dtr = thin_dtr, + .map = thin_map, + .end_io = thin_endio, + .preresume = thin_preresume, + .presuspend = thin_presuspend, + .postsuspend = thin_postsuspend, + .status = thin_status, + .iterate_devices = thin_iterate_devices, + .io_hints = thin_io_hints, +}; + +/*----------------------------------------------------------------*/ + +static int __init dm_thin_init(void) +{ + int r = -ENOMEM; + + pool_table_init(); + + _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); + if (!_new_mapping_cache) + return r; + + r = dm_register_target(&thin_target); + if (r) + goto bad_new_mapping_cache; + + r = dm_register_target(&pool_target); + if (r) + goto bad_thin_target; + + return 0; + +bad_thin_target: + dm_unregister_target(&thin_target); +bad_new_mapping_cache: + kmem_cache_destroy(_new_mapping_cache); + + return r; +} + +static void dm_thin_exit(void) +{ + dm_unregister_target(&thin_target); + dm_unregister_target(&pool_target); + + kmem_cache_destroy(_new_mapping_cache); + + pool_table_exit(); +} + +module_init(dm_thin_init); +module_exit(dm_thin_exit); + +module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); + +MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c new file mode 100644 index 000000000..a02b3f6ea --- /dev/null +++ b/drivers/md/dm-uevent.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Device Mapper Uevent Support (dm-uevent) + * + * Copyright IBM Corporation, 2007 + * Author: Mike Anderson + */ +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-uevent.h" + +#define DM_MSG_PREFIX "uevent" + +static const struct { + enum dm_uevent_type type; + enum kobject_action action; + char *name; +} _dm_uevent_type_names[] = { + {DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"}, + {DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"}, +}; + +static struct kmem_cache *_dm_event_cache; + +struct dm_uevent { + struct mapped_device *md; + enum kobject_action action; + struct kobj_uevent_env ku_env; + struct list_head elist; + char name[DM_NAME_LEN]; + char uuid[DM_UUID_LEN]; +}; + +static void dm_uevent_free(struct dm_uevent *event) +{ + kmem_cache_free(_dm_event_cache, event); +} + +static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md) +{ + struct dm_uevent *event; + + event = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC); + if (!event) + return NULL; + + INIT_LIST_HEAD(&event->elist); + event->md = md; + + return event; +} + +static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md, + struct dm_target *ti, + enum kobject_action action, + const char *dm_action, + const char *path, + unsigned int nr_valid_paths) +{ + struct dm_uevent *event; + + event = dm_uevent_alloc(md); + if (!event) { + DMERR("%s: dm_uevent_alloc() failed", __func__); + goto err_nomem; + } + + event->action = action; + + if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) { + DMERR("%s: add_uevent_var() for DM_TARGET failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) { + DMERR("%s: add_uevent_var() for DM_ACTION failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u", + dm_next_uevent_seq(md))) { + DMERR("%s: add_uevent_var() for DM_SEQNUM failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) { + DMERR("%s: add_uevent_var() for DM_PATH failed", __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d", + nr_valid_paths)) { + DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed", + __func__); + goto err_add; + } + + return event; + +err_add: + dm_uevent_free(event); +err_nomem: + return ERR_PTR(-ENOMEM); +} + +/** + * dm_send_uevents - send uevents for given list + * + * @events: list of events to send + * @kobj: kobject generating event + * + */ +void dm_send_uevents(struct list_head *events, struct kobject *kobj) +{ + int r; + struct dm_uevent *event, *next; + + list_for_each_entry_safe(event, next, events, elist) { + list_del_init(&event->elist); + + /* + * When a device is being removed this copy fails and we + * discard these unsent events. + */ + if (dm_copy_name_and_uuid(event->md, event->name, + event->uuid)) { + DMINFO("%s: skipping sending uevent for lost device", + __func__); + goto uevent_free; + } + + if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) { + DMERR("%s: add_uevent_var() for DM_NAME failed", + __func__); + goto uevent_free; + } + + if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) { + DMERR("%s: add_uevent_var() for DM_UUID failed", + __func__); + goto uevent_free; + } + + r = kobject_uevent_env(kobj, event->action, event->ku_env.envp); + if (r) + DMERR("%s: kobject_uevent_env failed", __func__); +uevent_free: + dm_uevent_free(event); + } +} +EXPORT_SYMBOL_GPL(dm_send_uevents); + +/** + * dm_path_uevent - called to create a new path event and queue it + * + * @event_type: path event type enum + * @ti: pointer to a dm_target + * @path: string containing pathname + * @nr_valid_paths: number of valid paths remaining + * + */ +void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned int nr_valid_paths) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + struct dm_uevent *event; + + if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { + DMERR("%s: Invalid event_type %d", __func__, event_type); + return; + } + + event = dm_build_path_uevent(md, ti, + _dm_uevent_type_names[event_type].action, + _dm_uevent_type_names[event_type].name, + path, nr_valid_paths); + if (IS_ERR(event)) + return; + + dm_uevent_add(md, &event->elist); +} +EXPORT_SYMBOL_GPL(dm_path_uevent); + +int dm_uevent_init(void) +{ + _dm_event_cache = KMEM_CACHE(dm_uevent, 0); + if (!_dm_event_cache) + return -ENOMEM; + + DMINFO("version 1.0.3"); + + return 0; +} + +void dm_uevent_exit(void) +{ + kmem_cache_destroy(_dm_event_cache); +} diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h new file mode 100644 index 000000000..12a5d4fb7 --- /dev/null +++ b/drivers/md/dm-uevent.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device Mapper Uevent Support + * + * Copyright IBM Corporation, 2007 + * Author: Mike Anderson + */ +#ifndef DM_UEVENT_H +#define DM_UEVENT_H + +enum dm_uevent_type { + DM_UEVENT_PATH_FAILED, + DM_UEVENT_PATH_REINSTATED, +}; + +#ifdef CONFIG_DM_UEVENT + +extern int dm_uevent_init(void); +extern void dm_uevent_exit(void); +extern void dm_send_uevents(struct list_head *events, struct kobject *kobj); +extern void dm_path_uevent(enum dm_uevent_type event_type, + struct dm_target *ti, const char *path, + unsigned int nr_valid_paths); + +#else + +static inline int dm_uevent_init(void) +{ + return 0; +} +static inline void dm_uevent_exit(void) +{ +} +static inline void dm_send_uevents(struct list_head *events, + struct kobject *kobj) +{ +} +static inline void dm_path_uevent(enum dm_uevent_type event_type, + struct dm_target *ti, const char *path, + unsigned int nr_valid_paths) +{ +} + +#endif /* CONFIG_DM_UEVENT */ + +#endif /* DM_UEVENT_H */ diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c new file mode 100644 index 000000000..fdc8921e5 --- /dev/null +++ b/drivers/md/dm-unstripe.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2017 Intel Corporation. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include + +struct unstripe_c { + struct dm_dev *dev; + sector_t physical_start; + + uint32_t stripes; + + uint32_t unstripe; + sector_t unstripe_width; + sector_t unstripe_offset; + + uint32_t chunk_size; + u8 chunk_shift; +}; + +#define DM_MSG_PREFIX "unstriped" + +static void cleanup_unstripe(struct unstripe_c *uc, struct dm_target *ti) +{ + if (uc->dev) + dm_put_device(ti, uc->dev); + kfree(uc); +} + +/* + * Contruct an unstriped mapping. + * + */ +static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct unstripe_c *uc; + sector_t tmp_len; + unsigned long long start; + char dummy; + + if (argc != 5) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) { + ti->error = "Memory allocation for unstriped context failed"; + return -ENOMEM; + } + + if (kstrtouint(argv[0], 10, &uc->stripes) || !uc->stripes) { + ti->error = "Invalid stripe count"; + goto err; + } + + if (kstrtouint(argv[1], 10, &uc->chunk_size) || !uc->chunk_size) { + ti->error = "Invalid chunk_size"; + goto err; + } + + if (kstrtouint(argv[2], 10, &uc->unstripe)) { + ti->error = "Invalid stripe number"; + goto err; + } + + if (uc->unstripe > uc->stripes && uc->stripes > 1) { + ti->error = "Please provide stripe between [0, # of stripes]"; + goto err; + } + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &uc->dev)) { + ti->error = "Couldn't get striped device"; + goto err; + } + + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { + ti->error = "Invalid striped device offset"; + goto err; + } + uc->physical_start = start; + + uc->unstripe_offset = uc->unstripe * uc->chunk_size; + uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; + uc->chunk_shift = is_power_of_2(uc->chunk_size) ? fls(uc->chunk_size) - 1 : 0; + + tmp_len = ti->len; + if (sector_div(tmp_len, uc->chunk_size)) { + ti->error = "Target length not divisible by chunk size"; + goto err; + } + + if (dm_set_target_max_io_len(ti, uc->chunk_size)) { + ti->error = "Failed to set max io len"; + goto err; + } + + ti->private = uc; + return 0; +err: + cleanup_unstripe(uc, ti); + return -EINVAL; +} + +static void unstripe_dtr(struct dm_target *ti) +{ + struct unstripe_c *uc = ti->private; + + cleanup_unstripe(uc, ti); +} + +static sector_t map_to_core(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + sector_t sector = bio->bi_iter.bi_sector; + sector_t tmp_sector = sector; + + /* Shift us up to the right "row" on the stripe */ + if (uc->chunk_shift) + tmp_sector >>= uc->chunk_shift; + else + sector_div(tmp_sector, uc->chunk_size); + + sector += uc->unstripe_width * tmp_sector; + + /* Account for what stripe we're operating on */ + return sector + uc->unstripe_offset; +} + +static int unstripe_map(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + + bio_set_dev(bio, uc->dev->bdev); + bio->bi_iter.bi_sector = map_to_core(ti, bio) + uc->physical_start; + + return DM_MAPIO_REMAPPED; +} + +static void unstripe_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct unstripe_c *uc = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu %d %s %llu", + uc->stripes, (unsigned long long)uc->chunk_size, uc->unstripe, + uc->dev->name, (unsigned long long)uc->physical_start); + break; + + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int unstripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct unstripe_c *uc = ti->private; + + return fn(ti, uc->dev, uc->physical_start, ti->len, data); +} + +static void unstripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct unstripe_c *uc = ti->private; + + limits->chunk_sectors = uc->chunk_size; +} + +static struct target_type unstripe_target = { + .name = "unstriped", + .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, + .module = THIS_MODULE, + .ctr = unstripe_ctr, + .dtr = unstripe_dtr, + .map = unstripe_map, + .status = unstripe_status, + .iterate_devices = unstripe_iterate_devices, + .io_hints = unstripe_io_hints, +}; + +static int __init dm_unstripe_init(void) +{ + return dm_register_target(&unstripe_target); +} + +static void __exit dm_unstripe_exit(void) +{ + dm_unregister_target(&unstripe_target); +} + +module_init(dm_unstripe_init); +module_exit(dm_unstripe_exit); + +MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_ALIAS("dm-unstriped"); +MODULE_AUTHOR("Scott Bauer "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000..0304e36af --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,823 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + */ + +#include "dm-verity-fec.h" +#include + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) + ((char *)io + io->v->ti->per_io_data_size - sizeof(struct dm_verity_fec_io)); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf. + */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned int *offset, struct dm_buffer **buf) +{ + u64 position, block, rem; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = div64_u64_rem(position, v->fec->io_size, &rem); + *offset = (unsigned int)rem; + + res = dm_bufio_read(v->fec->bufio, block, buf); + if (IS_ERR(res)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned int i, unsigned int j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned int block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned int n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= v->fec->io_size) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + dm_bufio_release(buf); + + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + + return r; +} + +/* + * Locate data block erasures using verity hashes. + */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_req(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io), true))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned int block_offset, + int *neras) +{ + bool is_zero; + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[HASH_MAX_DIGESTSIZE]; + unsigned int n, k; + + if (neras) + *neras = 0; + + if (WARN_ON(v->digest_size > sizeof(want_digest))) + return -EINVAL; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (IS_ERR(bbuf)) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + goto done; + + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. + */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned int n; + + if (!fio->rs) + fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO); + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) + fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO); + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned int n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned int pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_req(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io), true); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block = block - v->hash_start + v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + goto done; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + +done: + fio->level--; + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned int n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, &f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], &f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], &f->extra_pool); + + mempool_free(fio->output, &f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; + fio->level = 0; +} + +/* + * Append feature arguments and values to the status table. + */ +unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, + char *result, unsigned int maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_exit(&f->rs_pool); + mempool_exit(&f->prealloc_pool); + mempool_exit(&f->extra_pool); + mempool_exit(&f->output_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs_gfp(8, 0x11d, 0, 1, v->fec->roots, gfp_mask); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned int *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. + */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + u64 hash_blocks, fec_blocks; + int ret; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. + */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) + f->io_size = 1 << v->data_dev_block_bits; + else + f->io_size = v->fec->roots << SECTOR_SHIFT; + + f->bufio = dm_bufio_client_create(f->dev->bdev, + f->io_size, + 1, 0, NULL, NULL, 0); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); + + fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); + if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL, 0); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (ret) { + ti->error = "Cannot allocate RS pool"; + return ret; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (ret) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return ret; + } + + ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache); + if (ret) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return ret; + } + + /* Preallocate an output buffer for each thread */ + ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), + 1 << v->data_dev_block_bits); + if (ret) { + ti->error = "Cannot allocate FEC output pool"; + return ret; + } + + /* Reserve space for our per-bio data */ + ti->per_io_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000..8454070d2 --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm-verity.h" +#include + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + size_t io_size; /* IO size for roots */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t rs_pool; /* mempool for fio->rs */ + mempool_t prealloc_pool; /* mempool for preallocated buffers */ + mempool_t extra_pool; /* mempool for extra buffers */ + mempool_t output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned int nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; + unsigned int level; /* recursion level */ +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, + char *result, unsigned int maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned int *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned int verity_fec_status_table(struct dm_verity *v, + unsigned int sz, char *result, + unsigned int maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned int *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity-loadpin.c b/drivers/md/dm-verity-loadpin.c new file mode 100644 index 000000000..0666699b6 --- /dev/null +++ b/drivers/md/dm-verity-loadpin.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include "dm.h" +#include "dm-core.h" +#include "dm-verity.h" + +#define DM_MSG_PREFIX "verity-loadpin" + +LIST_HEAD(dm_verity_loadpin_trusted_root_digests); + +static bool is_trusted_verity_target(struct dm_target *ti) +{ + int verity_mode; + u8 *root_digest; + unsigned int digest_size; + struct dm_verity_loadpin_trusted_root_digest *trd; + bool trusted = false; + + if (!dm_is_verity_target(ti)) + return false; + + verity_mode = dm_verity_get_mode(ti); + + if ((verity_mode != DM_VERITY_MODE_EIO) && + (verity_mode != DM_VERITY_MODE_RESTART) && + (verity_mode != DM_VERITY_MODE_PANIC)) + return false; + + if (dm_verity_get_root_digest(ti, &root_digest, &digest_size)) + return false; + + list_for_each_entry(trd, &dm_verity_loadpin_trusted_root_digests, node) { + if ((trd->len == digest_size) && + !memcmp(trd->data, root_digest, digest_size)) { + trusted = true; + break; + } + } + + kfree(root_digest); + + return trusted; +} + +/* + * Determines whether the file system of a superblock is located on + * a verity device that is trusted by LoadPin. + */ +bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev) +{ + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + int srcu_idx; + bool trusted = false; + + if (bdev == NULL) + return false; + + if (list_empty(&dm_verity_loadpin_trusted_root_digests)) + return false; + + md = dm_get_md(bdev->bd_dev); + if (!md) + return false; + + table = dm_get_live_table(md, &srcu_idx); + + if (table->num_targets != 1) + goto out; + + ti = dm_table_get_target(table, 0); + + if (is_trusted_verity_target(ti)) + trusted = true; + +out: + dm_put_live_table(md, srcu_idx); + dm_put(md); + + return trusted; +} diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c new file mode 100644 index 000000000..24df610a2 --- /dev/null +++ b/drivers/md/dm-verity-target.c @@ -0,0 +1,1530 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * Author: Mikulas Patocka + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set + * default prefetch value. Data are read in "prefetch_cluster" chunks from the + * hash device. Setting this greatly improves performance when data and hash + * are on the same disk on different partitions on devices with poor random + * access behavior. + */ + +#include "dm-verity.h" +#include "dm-verity-fec.h" +#include "dm-verity-verify-sig.h" +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "verity" + +#define DM_VERITY_ENV_LENGTH 42 +#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" + +#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 + +#define DM_VERITY_MAX_CORRUPTED_ERRS 100 + +#define DM_VERITY_OPT_LOGGING "ignore_corruption" +#define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_PANIC "panic_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" +#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" +#define DM_VERITY_OPT_TASKLET_VERIFY "try_verify_in_tasklet" + +#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \ + DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) + +static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; + +module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); + +static DEFINE_STATIC_KEY_FALSE(use_tasklet_enabled); + +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned int n_blocks; +}; + +/* + * Auxiliary structure appended to each dm-bufio buffer. If the value + * hash_verified is nonzero, hash of the block has been verified. + * + * The variable hash_verified is set to 0 when allocating the buffer, then + * it can be changed to 1 and it is never reset to 0 again. + * + * There is no lock around this value, a race condition can at worst cause + * that multiple processes verify the hash of the same buffer simultaneously + * and write 1 to hash_verified simultaneously. + * This condition is harmless, so we don't need locking. + */ +struct buffer_aux { + int hash_verified; +}; + +/* + * Initialize struct buffer_aux for a freshly created buffer. + */ +static void dm_bufio_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + + aux->hash_verified = 0; +} + +/* + * Translate input sector number to the sector number on the target device. + */ +static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) +{ + return v->data_start + dm_target_offset(v->ti, bi_sector); +} + +/* + * Return hash position of a specified block at a specified tree level + * (0 is the lowest level). + * The lowest "hash_per_block_bits"-bits of the result denote hash position + * inside a hash block. The remaining bits denote location of the hash block. + */ +static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int level) +{ + return block >> (level * v->hash_per_block_bits); +} + +static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, + const u8 *data, size_t len, + struct crypto_wait *wait) +{ + struct scatterlist sg; + + if (likely(!is_vmalloc_addr(data))) { + sg_init_one(&sg, data, len); + ahash_request_set_crypt(req, &sg, NULL, len); + return crypto_wait_req(crypto_ahash_update(req), wait); + } else { + do { + int r; + size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data)); + flush_kernel_vmap_range((void *)data, this_step); + sg_init_table(&sg, 1); + sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data)); + ahash_request_set_crypt(req, &sg, NULL, this_step); + r = crypto_wait_req(crypto_ahash_update(req), wait); + if (unlikely(r)) + return r; + data += this_step; + len -= this_step; + } while (len); + return 0; + } +} + +/* + * Wrapper for crypto_ahash_init, which handles verity salting. + */ +static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, + struct crypto_wait *wait, bool may_sleep) +{ + int r; + + ahash_request_set_tfm(req, v->tfm); + ahash_request_set_callback(req, + may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, + crypto_req_done, (void *)wait); + crypto_init_wait(wait); + + r = crypto_wait_req(crypto_ahash_init(req), wait); + + if (unlikely(r < 0)) { + if (r != -ENOMEM) + DMERR("crypto_ahash_init failed: %d", r); + return r; + } + + if (likely(v->salt_size && (v->version >= 1))) + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, + u8 *digest, struct crypto_wait *wait) +{ + int r; + + if (unlikely(v->salt_size && (!v->version))) { + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + + if (r < 0) { + DMERR("verity_hash_final failed updating salt: %d", r); + goto out; + } + } + + ahash_request_set_crypt(req, NULL, digest, 0); + r = crypto_wait_req(crypto_ahash_final(req), wait); +out: + return r; +} + +int verity_hash(struct dm_verity *v, struct ahash_request *req, + const u8 *data, size_t len, u8 *digest, bool may_sleep) +{ + int r; + struct crypto_wait wait; + + r = verity_hash_init(v, req, &wait, may_sleep); + if (unlikely(r < 0)) + goto out; + + r = verity_hash_update(v, req, data, len, &wait); + if (unlikely(r < 0)) + goto out; + + r = verity_hash_final(v, req, digest, &wait); + +out: + return r; +} + +static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, + sector_t *hash_block, unsigned int *offset) +{ + sector_t position = verity_position_at_level(v, block, level); + unsigned int idx; + + *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); + + if (!offset) + return; + + idx = position & ((1 << v->hash_per_block_bits) - 1); + if (!v->version) + *offset = idx * v->digest_size; + else + *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); +} + +/* + * Handle verification errors. + */ +static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, + unsigned long long block) +{ + char verity_env[DM_VERITY_ENV_LENGTH]; + char *envp[] = { verity_env, NULL }; + const char *type_str = ""; + struct mapped_device *md = dm_table_get_md(v->ti->table); + + /* Corruption should be visible in device status in all modes */ + v->hash_failed = true; + + if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS) + goto out; + + v->corrupted_errs++; + + switch (type) { + case DM_VERITY_BLOCK_TYPE_DATA: + type_str = "data"; + break; + case DM_VERITY_BLOCK_TYPE_METADATA: + type_str = "metadata"; + break; + default: + BUG(); + } + + DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name, + type_str, block); + + if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS) + DMERR("%s: reached maximum errors", v->data_dev->name); + + snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu", + DM_VERITY_ENV_VAR_NAME, type, block); + + kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp); + +out: + if (v->mode == DM_VERITY_MODE_LOGGING) + return 0; + + if (v->mode == DM_VERITY_MODE_RESTART) + kernel_restart("dm-verity device corrupted"); + + if (v->mode == DM_VERITY_MODE_PANIC) + panic("dm-verity device corrupted"); + + return 1; +} + +/* + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). + * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified + * against current value of verity_io_want_digest(v, io). + */ +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) +{ + struct dm_buffer *buf; + struct buffer_aux *aux; + u8 *data; + int r; + sector_t hash_block; + unsigned int offset; + + verity_hash_at_level(v, block, level, &hash_block, &offset); + + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + data = dm_bufio_get(v->bufio, hash_block, &buf); + if (data == NULL) { + /* + * In tasklet and the hash was not in the bufio cache. + * Return early and resume execution from a work-queue + * to read the hash from disk. + */ + return -EAGAIN; + } + } else + data = dm_bufio_read(v->bufio, hash_block, &buf); + + if (IS_ERR(data)) + return PTR_ERR(data); + + aux = dm_bufio_get_aux_data(buf); + + if (!aux->hash_verified) { + if (skip_unverified) { + r = 1; + goto release_ret_r; + } + + r = verity_hash(v, verity_io_hash_req(v, io), + data, 1 << v->hash_dev_block_bits, + verity_io_real_digest(v, io), !io->in_tasklet); + if (unlikely(r < 0)) + goto release_ret_r; + + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) == 0)) + aux->hash_verified = 1; + else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. + */ + r = -EAGAIN; + goto release_ret_r; + } + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; + goto release_ret_r; + } + } + + data += offset; + memcpy(want_digest, data, v->digest_size); + r = 0; + +release_ret_r: + dm_bufio_release(buf); + return r; +} + +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero) +{ + int r = 0, i; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + goto out; + } + + memcpy(digest, v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + goto out; + } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; + + return r; +} + +/* + * Calculates the digest for the given bio + */ +static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, struct crypto_wait *wait) +{ + unsigned int todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + struct scatterlist sg; + struct ahash_request *req = verity_io_hash_req(v, io); + + do { + int r; + unsigned int len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + sg_init_table(&sg, 1); + + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + /* + * Operating on a single page at a time looks suboptimal + * until you consider the typical block size is 4,096B. + * Going through this loops twice should be very rare. + */ + sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); + ahash_request_set_crypt(req, &sg, NULL, len); + r = crypto_wait_req(crypto_ahash_update(req), wait); + + if (unlikely(r < 0)) { + DMERR("verity_for_io_block crypto op failed: %d", r); + return r; + } + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. + */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned int todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + + do { + int r; + u8 *page; + unsigned int len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = bvec_kmap_local(&bv); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page, len); + kunmap_local(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; +} + +/* + * Moves the bio iter one data block forward. + */ +static inline void verity_bv_skip_block(struct dm_verity *v, + struct dm_verity_io *io, + struct bvec_iter *iter) +{ + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + + bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits); +} + +/* + * Verify one "dm_verity_io" structure. + */ +static int verity_verify_io(struct dm_verity_io *io) +{ + bool is_zero; + struct dm_verity *v = io->v; +#if defined(CONFIG_DM_VERITY_FEC) + struct bvec_iter start; +#endif + struct bvec_iter iter_copy; + struct bvec_iter *iter; + struct crypto_wait wait; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + unsigned int b; + + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + /* + * Copy the iterator in case we need to restart + * verification in a work-queue. + */ + iter_copy = io->iter; + iter = &iter_copy; + } else + iter = &io->iter; + + for (b = 0; b < io->n_blocks; b++) { + int r; + sector_t cur_block = io->block + b; + struct ahash_request *req = verity_io_hash_req(v, io); + + if (v->validated_blocks && bio->bi_status == BLK_STS_OK && + likely(test_bit(cur_block, v->validated_blocks))) { + verity_bv_skip_block(v, io, iter); + continue; + } + + r = verity_hash_for_block(v, io, cur_block, + verity_io_want_digest(v, io), + &is_zero); + if (unlikely(r < 0)) + return r; + + if (is_zero) { + /* + * If we expect a zero block, don't validate, just + * return zeros. + */ + r = verity_for_bv_block(v, io, iter, + verity_bv_zero); + if (unlikely(r < 0)) + return r; + + continue; + } + + r = verity_hash_init(v, req, &wait, !io->in_tasklet); + if (unlikely(r < 0)) + return r; + +#if defined(CONFIG_DM_VERITY_FEC) + if (verity_fec_is_enabled(v)) + start = *iter; +#endif + r = verity_for_io_block(v, io, iter, &wait); + if (unlikely(r < 0)) + return r; + + r = verity_hash_final(v, req, verity_io_real_digest(v, io), + &wait); + if (unlikely(r < 0)) + return r; + + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) { + if (v->validated_blocks) + set_bit(cur_block, v->validated_blocks); + continue; + } else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. + */ + return -EAGAIN; +#if defined(CONFIG_DM_VERITY_FEC) + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + cur_block, NULL, &start) == 0) { + continue; +#endif + } else { + if (bio->bi_status) { + /* + * Error correction failed; Just return error + */ + return -EIO; + } + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + cur_block)) + return -EIO; + } + } + + return 0; +} + +/* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +/* + * End one "io" structure with a given error. + */ +static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) +{ + struct dm_verity *v = io->v; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + + bio->bi_end_io = io->orig_bi_end_io; + bio->bi_status = status; + + if (!static_branch_unlikely(&use_tasklet_enabled) || !io->in_tasklet) + verity_fec_finish_io(io); + + bio_endio(bio); +} + +static void verity_work(struct work_struct *w) +{ + struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + + io->in_tasklet = false; + + verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); +} + +static void verity_tasklet(unsigned long data) +{ + struct dm_verity_io *io = (struct dm_verity_io *)data; + int err; + + io->in_tasklet = true; + err = verity_verify_io(io); + if (err == -EAGAIN || err == -ENOMEM) { + /* fallback to retrying with work-queue */ + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + return; + } + + verity_finish_io(io, errno_to_blk_status(err)); +} + +static void verity_end_io(struct bio *bio) +{ + struct dm_verity_io *io = bio->bi_private; + + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || + verity_is_system_shutting_down() || + (bio->bi_opf & REQ_RAHEAD))) { + verity_finish_io(io, bio->bi_status); + return; + } + + if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) { + tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io); + tasklet_schedule(&io->tasklet); + } else { + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + } +} + +/* + * Prefetch buffers for the specified io. + * The root buffer is not prefetched, it is assumed that it will be cached + * all the time. + */ +static void verity_prefetch_io(struct work_struct *work) +{ + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; + int i; + + for (i = v->levels - 2; i >= 0; i--) { + sector_t hash_block_start; + sector_t hash_block_end; + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); + if (!i) { + unsigned int cluster = READ_ONCE(dm_verity_prefetch_cluster); + + cluster >>= v->data_dev_block_bits; + if (unlikely(!cluster)) + goto no_prefetch_cluster; + + if (unlikely(cluster & (cluster - 1))) + cluster = 1 << __fls(cluster); + + hash_block_start &= ~(sector_t)(cluster - 1); + hash_block_end |= cluster - 1; + if (unlikely(hash_block_end >= v->hash_blocks)) + hash_block_end = v->hash_blocks - 1; + } +no_prefetch_cluster: + dm_bufio_prefetch(v->bufio, hash_block_start, + hash_block_end - hash_block_start + 1); + } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + sector_t block = io->block; + unsigned int n_blocks = io->n_blocks; + struct dm_verity_prefetch_work *pw; + + if (v->validated_blocks) { + while (n_blocks && test_bit(block, v->validated_blocks)) { + block++; + n_blocks--; + } + while (n_blocks && test_bit(block + n_blocks - 1, + v->validated_blocks)) + n_blocks--; + if (!n_blocks) + return; + } + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = block; + pw->n_blocks = n_blocks; + queue_work(v->verify_wq, &pw->work); +} + +/* + * Bio map function. It allocates dm_verity_io structure and bio vector and + * fills them. Then it issues prefetches and the I/O. + */ +static int verity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_verity *v = ti->private; + struct dm_verity_io *io; + + bio_set_dev(bio, v->data_dev->bdev); + bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); + + if (((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) & + ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { + DMERR_LIMIT("unaligned io"); + return DM_MAPIO_KILL; + } + + if (bio_end_sector(bio) >> + (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { + DMERR_LIMIT("io out of range"); + return DM_MAPIO_KILL; + } + + if (bio_data_dir(bio) == WRITE) + return DM_MAPIO_KILL; + + io = dm_per_bio_data(bio, ti->per_io_data_size); + io->v = v; + io->orig_bi_end_io = bio->bi_end_io; + io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); + io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; + + bio->bi_end_io = verity_end_io; + bio->bi_private = io; + io->iter = bio->bi_iter; + + verity_fec_init_io(io); + + verity_submit_prefetch(v, io); + + submit_bio_noacct(bio); + + return DM_MAPIO_SUBMITTED; +} + +/* + * Status: V (valid) or C (corruption found) + */ +static void verity_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dm_verity *v = ti->private; + unsigned int args = 0; + unsigned int sz = 0; + unsigned int x; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%c", v->hash_failed ? 'C' : 'V'); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %s %s %u %u %llu %llu %s ", + v->version, + v->data_dev->name, + v->hash_dev->name, + 1 << v->data_dev_block_bits, + 1 << v->hash_dev_block_bits, + (unsigned long long)v->data_blocks, + (unsigned long long)v->hash_start, + v->alg_name + ); + for (x = 0; x < v->digest_size; x++) + DMEMIT("%02x", v->root_digest[x]); + DMEMIT(" "); + if (!v->salt_size) + DMEMIT("-"); + else + for (x = 0; x < v->salt_size; x++) + DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; + if (v->validated_blocks) + args++; + if (v->use_tasklet) + args++; + if (v->signature_key_desc) + args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS; + if (!args) + return; + DMEMIT(" %u", args); + if (v->mode != DM_VERITY_MODE_EIO) { + DMEMIT(" "); + switch (v->mode) { + case DM_VERITY_MODE_LOGGING: + DMEMIT(DM_VERITY_OPT_LOGGING); + break; + case DM_VERITY_MODE_RESTART: + DMEMIT(DM_VERITY_OPT_RESTART); + break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_PANIC); + break; + default: + BUG(); + } + } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); + if (v->validated_blocks) + DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); + if (v->use_tasklet) + DMEMIT(" " DM_VERITY_OPT_TASKLET_VERIFY); + sz = verity_fec_status_table(v, sz, result, maxlen); + if (v->signature_key_desc) + DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY + " %s", v->signature_key_desc); + break; + + case STATUSTYPE_IMA: + DMEMIT_TARGET_NAME_VERSION(ti->type); + DMEMIT(",hash_failed=%c", v->hash_failed ? 'C' : 'V'); + DMEMIT(",verity_version=%u", v->version); + DMEMIT(",data_device_name=%s", v->data_dev->name); + DMEMIT(",hash_device_name=%s", v->hash_dev->name); + DMEMIT(",verity_algorithm=%s", v->alg_name); + + DMEMIT(",root_digest="); + for (x = 0; x < v->digest_size; x++) + DMEMIT("%02x", v->root_digest[x]); + + DMEMIT(",salt="); + if (!v->salt_size) + DMEMIT("-"); + else + for (x = 0; x < v->salt_size; x++) + DMEMIT("%02x", v->salt[x]); + + DMEMIT(",ignore_zero_blocks=%c", v->zero_digest ? 'y' : 'n'); + DMEMIT(",check_at_most_once=%c", v->validated_blocks ? 'y' : 'n'); + if (v->signature_key_desc) + DMEMIT(",root_hash_sig_key_desc=%s", v->signature_key_desc); + + if (v->mode != DM_VERITY_MODE_EIO) { + DMEMIT(",verity_mode="); + switch (v->mode) { + case DM_VERITY_MODE_LOGGING: + DMEMIT(DM_VERITY_OPT_LOGGING); + break; + case DM_VERITY_MODE_RESTART: + DMEMIT(DM_VERITY_OPT_RESTART); + break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_PANIC); + break; + default: + DMEMIT("invalid"); + } + } + DMEMIT(";"); + break; + } +} + +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dm_verity *v = ti->private; + + *bdev = v->data_dev->bdev; + + if (v->data_start || ti->len != bdev_nr_sectors(v->data_dev->bdev)) + return 1; + return 0; +} + +static int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_verity *v = ti->private; + + return fn(ti, v->data_dev, v->data_start, ti->len, data); +} + +static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_verity *v = ti->private; + + if (limits->logical_block_size < 1 << v->data_dev_block_bits) + limits->logical_block_size = 1 << v->data_dev_block_bits; + + if (limits->physical_block_size < 1 << v->data_dev_block_bits) + limits->physical_block_size = 1 << v->data_dev_block_bits; + + blk_limits_io_min(limits, limits->logical_block_size); +} + +static void verity_dtr(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + + if (v->verify_wq) + destroy_workqueue(v->verify_wq); + + if (v->bufio) + dm_bufio_client_destroy(v->bufio); + + kvfree(v->validated_blocks); + kfree(v->salt); + kfree(v->root_digest); + kfree(v->zero_digest); + + if (v->tfm) + crypto_free_ahash(v->tfm); + + kfree(v->alg_name); + + if (v->hash_dev) + dm_put_device(ti, v->hash_dev); + + if (v->data_dev) + dm_put_device(ti, v->data_dev); + + verity_fec_dtr(v); + + kfree(v->signature_key_desc); + + if (v->use_tasklet) + static_branch_dec(&use_tasklet_enabled); + + kfree(v); +} + +static int verity_alloc_most_once(struct dm_verity *v) +{ + struct dm_target *ti = v->ti; + + /* the bitset can only handle INT_MAX blocks */ + if (v->data_blocks > INT_MAX) { + ti->error = "device too large to use check_at_most_once"; + return -E2BIG; + } + + v->validated_blocks = kvcalloc(BITS_TO_LONGS(v->data_blocks), + sizeof(unsigned long), + GFP_KERNEL); + if (!v->validated_blocks) { + ti->error = "failed to allocate bitset for check_at_most_once"; + return -ENOMEM; + } + + return 0; +} + +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct ahash_request *req; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + req = kmalloc(v->ahash_reqsize, GFP_KERNEL); + + if (!req) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest, true); + +out: + kfree(req); + kfree(zero_data); + + return r; +} + +static inline bool verity_is_verity_mode(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING) || + !strcasecmp(arg_name, DM_VERITY_OPT_RESTART) || + !strcasecmp(arg_name, DM_VERITY_OPT_PANIC)); +} + +static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name) +{ + if (v->mode) + return -EINVAL; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) + v->mode = DM_VERITY_MODE_LOGGING; + else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) + v->mode = DM_VERITY_MODE_RESTART; + else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) + v->mode = DM_VERITY_MODE_PANIC; + + return 0; +} + +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + struct dm_verity_sig_opts *verify_args, + bool only_modifier_opts) +{ + int r = 0; + unsigned int argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (verity_is_verity_mode(arg_name)) { + if (only_modifier_opts) + continue; + r = verity_parse_verity_mode(v, arg_name); + if (r) { + ti->error = "Conflicting error handling parameters"; + return r; + } + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + if (only_modifier_opts) + continue; + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero digest"; + return r; + } + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) { + if (only_modifier_opts) + continue; + r = verity_alloc_most_once(v); + if (r) + return r; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_TASKLET_VERIFY)) { + v->use_tasklet = true; + static_branch_inc(&use_tasklet_enabled); + continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; + + } else if (verity_verify_is_sig_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; + r = verity_verify_sig_parse_opt_args(as, v, + verify_args, + &argc, arg_name); + if (r) + return r; + continue; + + } else if (only_modifier_opts) { + /* + * Ignore unrecognized opt, could easily be an extra + * argument to an option whose parsing was skipped. + * Normal parsing (@only_modifier_opts=false) will + * properly parse all options (and their extra args). + */ + continue; + } + + DMERR("Unrecognized verity feature request: %s", arg_name); + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + +/* + * Target parameters: + * The current format is version 1. + * Vsn 0 is compatible with original Chromium OS releases. + * + * + * + * + * + * + * + * + * Hex string or "-" if no salt. + */ +static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_verity *v; + struct dm_verity_sig_opts verify_args = {0}; + struct dm_arg_set as; + unsigned int num; + unsigned int wq_flags; + unsigned long long num_ll; + int r; + int i; + sector_t hash_position; + char dummy; + char *root_hash_digest_to_validate; + + v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); + if (!v) { + ti->error = "Cannot allocate verity structure"; + return -ENOMEM; + } + ti->private = v; + v->ti = ti; + + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + ti->error = "Device must be readonly"; + r = -EINVAL; + goto bad; + } + + if (argc < 10) { + ti->error = "Not enough arguments"; + r = -EINVAL; + goto bad; + } + + /* Parse optional parameters that modify primary args */ + if (argc > 10) { + as.argc = argc - 10; + as.argv = argv + 10; + r = verity_parse_opt_args(&as, v, &verify_args, true); + if (r < 0) + goto bad; + } + + if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || + num > 1) { + ti->error = "Invalid version"; + r = -EINVAL; + goto bad; + } + v->version = num; + + r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + if (r) { + ti->error = "Hash device lookup failed"; + goto bad; + } + + if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->data_dev->bdev) || + num > PAGE_SIZE) { + ti->error = "Invalid data device block size"; + r = -EINVAL; + goto bad; + } + v->data_dev_block_bits = __ffs(num); + + if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->hash_dev->bdev) || + num > INT_MAX) { + ti->error = "Invalid hash device block size"; + r = -EINVAL; + goto bad; + } + v->hash_dev_block_bits = __ffs(num); + + if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || + (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) { + ti->error = "Invalid data blocks"; + r = -EINVAL; + goto bad; + } + v->data_blocks = num_ll; + + if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { + ti->error = "Data device is too small"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || + (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) + >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) { + ti->error = "Invalid hash start"; + r = -EINVAL; + goto bad; + } + v->hash_start = num_ll; + + v->alg_name = kstrdup(argv[7], GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + r = -ENOMEM; + goto bad; + } + + v->tfm = crypto_alloc_ahash(v->alg_name, 0, + v->use_tasklet ? CRYPTO_ALG_ASYNC : 0); + if (IS_ERR(v->tfm)) { + ti->error = "Cannot initialize hash function"; + r = PTR_ERR(v->tfm); + v->tfm = NULL; + goto bad; + } + + /* + * dm-verity performance can vary greatly depending on which hash + * algorithm implementation is used. Help people debug performance + * problems by logging the ->cra_driver_name. + */ + DMINFO("%s using implementation \"%s\"", v->alg_name, + crypto_hash_alg_common(v->tfm)->base.cra_driver_name); + + v->digest_size = crypto_ahash_digestsize(v->tfm); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + r = -EINVAL; + goto bad; + } + v->ahash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(v->tfm); + + v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); + if (!v->root_digest) { + ti->error = "Cannot allocate root digest"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[8]) != v->digest_size * 2 || + hex2bin(v->root_digest, argv[8], v->digest_size)) { + ti->error = "Invalid root digest"; + r = -EINVAL; + goto bad; + } + root_hash_digest_to_validate = argv[8]; + + if (strcmp(argv[9], "-")) { + v->salt_size = strlen(argv[9]) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[9]) != v->salt_size * 2 || + hex2bin(v->salt, argv[9], v->salt_size)) { + ti->error = "Invalid salt"; + r = -EINVAL; + goto bad; + } + } + + argv += 10; + argc -= 10; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + r = verity_parse_opt_args(&as, v, &verify_args, false); + if (r < 0) + goto bad; + } + + /* Root hash signature is a optional parameter*/ + r = verity_verify_root_hash(root_hash_digest_to_validate, + strlen(root_hash_digest_to_validate), + verify_args.sig, + verify_args.sig_size); + if (r < 0) { + ti->error = "Root hash verification failed"; + goto bad; + } + v->hash_per_block_bits = + __fls((1 << v->hash_dev_block_bits) / v->digest_size); + + v->levels = 0; + if (v->data_blocks) + while (v->hash_per_block_bits * v->levels < 64 && + (unsigned long long)(v->data_blocks - 1) >> + (v->hash_per_block_bits * v->levels)) + v->levels++; + + if (v->levels > DM_VERITY_MAX_LEVELS) { + ti->error = "Too many tree levels"; + r = -E2BIG; + goto bad; + } + + hash_position = v->hash_start; + for (i = v->levels - 1; i >= 0; i--) { + sector_t s; + v->hash_level_block[i] = hash_position; + s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1) + >> ((i + 1) * v->hash_per_block_bits); + if (hash_position + s < hash_position) { + ti->error = "Hash device offset overflow"; + r = -E2BIG; + goto bad; + } + hash_position += s; + } + v->hash_blocks = hash_position; + + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, + 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), + dm_bufio_alloc_callback, NULL, + v->use_tasklet ? DM_BUFIO_CLIENT_NO_SLEEP : 0); + if (IS_ERR(v->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + r = PTR_ERR(v->bufio); + v->bufio = NULL; + goto bad; + } + + if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + ti->error = "Hash device is too small"; + r = -E2BIG; + goto bad; + } + + /* WQ_UNBOUND greatly improves performance when running on ramdisk */ + wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND; + /* + * Using WQ_HIGHPRI improves throughput and completion latency by + * reducing wait times when reading from a dm-verity device. + * + * Also as required for the "try_verify_in_tasklet" feature: WQ_HIGHPRI + * allows verify_wq to preempt softirq since verification in tasklet + * will fall-back to using it for error handling (or if the bufio cache + * doesn't have required hashes). + */ + wq_flags |= WQ_HIGHPRI; + v->verify_wq = alloc_workqueue("kverityd", wq_flags, num_online_cpus()); + if (!v->verify_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ti->per_io_data_size = sizeof(struct dm_verity_io) + + v->ahash_reqsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_io_data_size = roundup(ti->per_io_data_size, + __alignof__(struct dm_verity_io)); + + verity_verify_sig_opts_cleanup(&verify_args); + + return 0; + +bad: + + verity_verify_sig_opts_cleanup(&verify_args); + verity_dtr(ti); + + return r; +} + +/* + * Check whether a DM target is a verity target. + */ +bool dm_is_verity_target(struct dm_target *ti) +{ + return ti->type->module == THIS_MODULE; +} + +/* + * Get the verity mode (error behavior) of a verity target. + * + * Returns the verity mode of the target, or -EINVAL if 'ti' is not a verity + * target. + */ +int dm_verity_get_mode(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + + if (!dm_is_verity_target(ti)) + return -EINVAL; + + return v->mode; +} + +/* + * Get the root digest of a verity target. + * + * Returns a copy of the root digest, the caller is responsible for + * freeing the memory of the digest. + */ +int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned int *digest_size) +{ + struct dm_verity *v = ti->private; + + if (!dm_is_verity_target(ti)) + return -EINVAL; + + *root_digest = kmemdup(v->root_digest, v->digest_size, GFP_KERNEL); + if (*root_digest == NULL) + return -ENOMEM; + + *digest_size = v->digest_size; + + return 0; +} + +static struct target_type verity_target = { + .name = "verity", + .features = DM_TARGET_IMMUTABLE, + .version = {1, 9, 0}, + .module = THIS_MODULE, + .ctr = verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .prepare_ioctl = verity_prepare_ioctl, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_verity_init(void) +{ + int r; + + r = dm_register_target(&verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_verity_exit(void) +{ + dm_unregister_target(&verity_target); +} + +module_init(dm_verity_init); +module_exit(dm_verity_exit); + +MODULE_AUTHOR("Mikulas Patocka "); +MODULE_AUTHOR("Mandeep Baines "); +MODULE_AUTHOR("Will Drewry "); +MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c new file mode 100644 index 000000000..db61a1f43 --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Microsoft Corporation. + * + * Author: Jaskaran Singh Khurana + * + */ +#include +#include +#include +#include +#include "dm-verity.h" +#include "dm-verity-verify-sig.h" + +#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s + +static bool require_signatures; +module_param(require_signatures, bool, 0444); +MODULE_PARM_DESC(require_signatures, + "Verify the roothash of dm-verity hash tree"); + +#define DM_VERITY_IS_SIG_FORCE_ENABLED() \ + (require_signatures != false) + +bool verity_verify_is_sig_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, + DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY)); +} + +static int verity_verify_get_sig_from_key(const char *key_desc, + struct dm_verity_sig_opts *sig_opts) +{ + struct key *key; + const struct user_key_payload *ukp; + int ret = 0; + + key = request_key(&key_type_user, + key_desc, NULL); + if (IS_ERR(key)) + return PTR_ERR(key); + + down_read(&key->sem); + + ukp = user_key_payload_locked(key); + if (!ukp) { + ret = -EKEYREVOKED; + goto end; + } + + sig_opts->sig = kmalloc(ukp->datalen, GFP_KERNEL); + if (!sig_opts->sig) { + ret = -ENOMEM; + goto end; + } + sig_opts->sig_size = ukp->datalen; + + memcpy(sig_opts->sig, ukp->data, sig_opts->sig_size); + +end: + up_read(&key->sem); + key_put(key); + + return ret; +} + +int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, + const char *arg_name) +{ + struct dm_target *ti = v->ti; + int ret = 0; + const char *sig_key = NULL; + + if (!*argc) { + ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); + return -EINVAL; + } + + sig_key = dm_shift_arg(as); + (*argc)--; + + ret = verity_verify_get_sig_from_key(sig_key, sig_opts); + if (ret < 0) + ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + + v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); + if (!v->signature_key_desc) + return -ENOMEM; + + return ret; +} + +/* + * verify_verify_roothash - Verify the root hash of the verity hash device + * using builtin trusted keys. + * + * @root_hash: For verity, the roothash/data to be verified. + * @root_hash_len: Size of the roothash/data to be verified. + * @sig_data: The trusted signature that verifies the roothash/data. + * @sig_len: Size of the signature. + * + */ +int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, + const void *sig_data, size_t sig_len) +{ + int ret; + + if (!root_hash || root_hash_len == 0) + return -EINVAL; + + if (!sig_data || sig_len == 0) { + if (DM_VERITY_IS_SIG_FORCE_ENABLED()) + return -ENOKEY; + else + return 0; + } + + ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, + sig_len, +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + VERIFY_USE_SECONDARY_KEYRING, +#else + NULL, +#endif + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); + + return ret; +} + +void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +{ + kfree(sig_opts->sig); + sig_opts->sig = NULL; + sig_opts->sig_size = 0; +} diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h new file mode 100644 index 000000000..3987c7141 --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Microsoft Corporation. + * + * Author: Jaskaran Singh Khurana + * + */ +#ifndef DM_VERITY_SIG_VERIFICATION_H +#define DM_VERITY_SIG_VERIFICATION_H + +#define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification" +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc" + +struct dm_verity_sig_opts { + unsigned int sig_size; + u8 *sig; +}; + +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG + +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2 + +int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len); +bool verity_verify_is_sig_opt_arg(const char *arg_name); + +int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name); + +void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); + +#else + +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 + +static inline int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len) +{ + return 0; +} + +static inline bool verity_verify_is_sig_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +{ +} + +#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */ +#endif /* DM_VERITY_SIG_VERIFICATION_H */ diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000..f9d522c87 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + */ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include +#include +#include +#include + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART, + DM_VERITY_MODE_PANIC +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity_fec; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_ahash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ + unsigned int salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + bool hash_failed:1; /* set if hash of any block failed */ + bool use_tasklet:1; /* try to verify in tasklet before work-queue */ + unsigned int digest_size; /* digest size for the current hash algorithm */ + unsigned int ahash_reqsize;/* the size of temporary space for crypto */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ + unsigned long *validated_blocks; /* bitset blocks validated */ + + char *signature_key_desc; /* signature keyring reference */ +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned int n_blocks; + bool in_tasklet; + + struct bvec_iter iter; + + struct work_struct work; + struct tasklet_struct tasklet; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_req[v->ahash_reqsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_req(), verity_io_real_digest() + * and verity_io_want_digest(). + */ +}; + +static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct ahash_request *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->ahash_reqsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; +} + +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + +extern int verity_hash(struct dm_verity *v, struct ahash_request *req, + const u8 *data, size_t len, u8 *digest, bool may_sleep); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero); + +extern bool dm_is_verity_target(struct dm_target *ti); +extern int dm_verity_get_mode(struct dm_target *ti); +extern int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, + unsigned int *digest_size); + +#endif /* DM_VERITY_H */ diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c new file mode 100644 index 000000000..c6ff43a8f --- /dev/null +++ b/drivers/md/dm-writecache.c @@ -0,0 +1,2775 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm-io-tracker.h" + +#define DM_MSG_PREFIX "writecache" + +#define HIGH_WATERMARK 50 +#define LOW_WATERMARK 45 +#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) +#define ENDIO_LATENCY 16 +#define WRITEBACK_LATENCY 64 +#define AUTOCOMMIT_BLOCKS_SSD 65536 +#define AUTOCOMMIT_BLOCKS_PMEM 64 +#define AUTOCOMMIT_MSEC 1000 +#define MAX_AGE_DIV 16 +#define MAX_AGE_UNSPECIFIED -1UL +#define PAUSE_WRITEBACK (HZ * 3) + +#define BITMAP_GRANULARITY 65536 +#if BITMAP_GRANULARITY < PAGE_SIZE +#undef BITMAP_GRANULARITY +#define BITMAP_GRANULARITY PAGE_SIZE +#endif + +#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX) +#define DM_WRITECACHE_HAS_PMEM +#endif + +#ifdef DM_WRITECACHE_HAS_PMEM +#define pmem_assign(dest, src) \ +do { \ + typeof(dest) uniq = (src); \ + memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ +} while (0) +#else +#define pmem_assign(dest, src) ((dest) = (src)) +#endif + +#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) +#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS +#endif + +#define MEMORY_SUPERBLOCK_MAGIC 0x23489321 +#define MEMORY_SUPERBLOCK_VERSION 1 + +struct wc_memory_entry { + __le64 original_sector; + __le64 seq_count; +}; + +struct wc_memory_superblock { + union { + struct { + __le32 magic; + __le32 version; + __le32 block_size; + __le32 pad; + __le64 n_blocks; + __le64 seq_count; + }; + __le64 padding[8]; + }; + struct wc_memory_entry entries[]; +}; + +struct wc_entry { + struct rb_node rb_node; + struct list_head lru; + unsigned short wc_list_contiguous; + bool write_in_progress +#if BITS_PER_LONG == 64 + : 1 +#endif + ; + unsigned long index +#if BITS_PER_LONG == 64 + : 47 +#endif + ; + unsigned long age; +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + uint64_t original_sector; + uint64_t seq_count; +#endif +}; + +#ifdef DM_WRITECACHE_HAS_PMEM +#define WC_MODE_PMEM(wc) ((wc)->pmem_mode) +#define WC_MODE_FUA(wc) ((wc)->writeback_fua) +#else +#define WC_MODE_PMEM(wc) false +#define WC_MODE_FUA(wc) false +#endif +#define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) + +struct dm_writecache { + struct mutex lock; + struct list_head lru; + union { + struct list_head freelist; + struct { + struct rb_root freetree; + struct wc_entry *current_free; + }; + }; + struct rb_root tree; + + size_t freelist_size; + size_t writeback_size; + size_t freelist_high_watermark; + size_t freelist_low_watermark; + unsigned long max_age; + unsigned long pause; + + unsigned int uncommitted_blocks; + unsigned int autocommit_blocks; + unsigned int max_writeback_jobs; + + int error; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + struct wait_queue_head freelist_wait; + + struct timer_list max_age_timer; + + atomic_t bio_in_progress[2]; + struct wait_queue_head bio_in_progress_wait[2]; + + struct dm_target *ti; + struct dm_dev *dev; + struct dm_dev *ssd_dev; + sector_t start_sector; + void *memory_map; + uint64_t memory_map_size; + size_t metadata_sectors; + size_t n_blocks; + uint64_t seq_count; + sector_t data_device_sectors; + void *block_start; + struct wc_entry *entries; + unsigned int block_size; + unsigned char block_size_bits; + + bool pmem_mode:1; + bool writeback_fua:1; + + bool overwrote_committed:1; + bool memory_vmapped:1; + + bool start_sector_set:1; + bool high_wm_percent_set:1; + bool low_wm_percent_set:1; + bool max_writeback_jobs_set:1; + bool autocommit_blocks_set:1; + bool autocommit_time_set:1; + bool max_age_set:1; + bool writeback_fua_set:1; + bool flush_on_suspend:1; + bool cleaner:1; + bool cleaner_set:1; + bool metadata_only:1; + bool pause_set:1; + + unsigned int high_wm_percent_value; + unsigned int low_wm_percent_value; + unsigned int autocommit_time_value; + unsigned int max_age_value; + unsigned int pause_value; + + unsigned int writeback_all; + struct workqueue_struct *writeback_wq; + struct work_struct writeback_work; + struct work_struct flush_work; + + struct dm_io_tracker iot; + + struct dm_io_client *dm_io; + + raw_spinlock_t endio_list_lock; + struct list_head endio_list; + struct task_struct *endio_thread; + + struct task_struct *flush_thread; + struct bio_list flush_list; + + struct dm_kcopyd_client *dm_kcopyd; + unsigned long *dirty_bitmap; + unsigned int dirty_bitmap_size; + + struct bio_set bio_set; + mempool_t copy_pool; + + struct { + unsigned long long reads; + unsigned long long read_hits; + unsigned long long writes; + unsigned long long write_hits_uncommitted; + unsigned long long write_hits_committed; + unsigned long long writes_around; + unsigned long long writes_allocate; + unsigned long long writes_blocked_on_freelist; + unsigned long long flushes; + unsigned long long discards; + } stats; +}; + +#define WB_LIST_INLINE 16 + +struct writeback_struct { + struct list_head endio_entry; + struct dm_writecache *wc; + struct wc_entry **wc_list; + unsigned int wc_list_n; + struct wc_entry *wc_list_inline[WB_LIST_INLINE]; + struct bio bio; +}; + +struct copy_struct { + struct list_head endio_entry; + struct dm_writecache *wc; + struct wc_entry *e; + unsigned int n_entries; + int error; +}; + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, + "A percentage of time allocated for data copying"); + +static void wc_lock(struct dm_writecache *wc) +{ + mutex_lock(&wc->lock); +} + +static void wc_unlock(struct dm_writecache *wc) +{ + mutex_unlock(&wc->lock); +} + +#ifdef DM_WRITECACHE_HAS_PMEM +static int persistent_memory_claim(struct dm_writecache *wc) +{ + int r; + loff_t s; + long p, da; + pfn_t pfn; + int id; + struct page **pages; + sector_t offset; + + wc->memory_vmapped = false; + + s = wc->memory_map_size; + p = s >> PAGE_SHIFT; + if (!p) { + r = -EINVAL; + goto err1; + } + if (p != s >> PAGE_SHIFT) { + r = -EOVERFLOW; + goto err1; + } + + offset = get_start_sect(wc->ssd_dev->bdev); + if (offset & (PAGE_SIZE / 512 - 1)) { + r = -EINVAL; + goto err1; + } + offset >>= PAGE_SHIFT - 9; + + id = dax_read_lock(); + + da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, + &wc->memory_map, &pfn); + if (da < 0) { + wc->memory_map = NULL; + r = da; + goto err2; + } + if (!pfn_t_has_page(pfn)) { + wc->memory_map = NULL; + r = -EOPNOTSUPP; + goto err2; + } + if (da != p) { + long i; + wc->memory_map = NULL; + pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + r = -ENOMEM; + goto err2; + } + i = 0; + do { + long daa; + daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, + p - i, DAX_ACCESS, NULL, &pfn); + if (daa <= 0) { + r = daa ? daa : -EINVAL; + goto err3; + } + if (!pfn_t_has_page(pfn)) { + r = -EOPNOTSUPP; + goto err3; + } + while (daa-- && i < p) { + pages[i++] = pfn_t_to_page(pfn); + pfn.val++; + if (!(i & 15)) + cond_resched(); + } + } while (i < p); + wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); + if (!wc->memory_map) { + r = -ENOMEM; + goto err3; + } + kvfree(pages); + wc->memory_vmapped = true; + } + + dax_read_unlock(id); + + wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; + wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; + + return 0; +err3: + kvfree(pages); +err2: + dax_read_unlock(id); +err1: + return r; +} +#else +static int persistent_memory_claim(struct dm_writecache *wc) +{ + return -EOPNOTSUPP; +} +#endif + +static void persistent_memory_release(struct dm_writecache *wc) +{ + if (wc->memory_vmapped) + vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); +} + +static struct page *persistent_memory_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + else + return virt_to_page(addr); +} + +static unsigned int persistent_memory_page_offset(void *addr) +{ + return (unsigned long)addr & (PAGE_SIZE - 1); +} + +static void persistent_memory_flush_cache(void *ptr, size_t size) +{ + if (is_vmalloc_addr(ptr)) + flush_kernel_vmap_range(ptr, size); +} + +static void persistent_memory_invalidate_cache(void *ptr, size_t size) +{ + if (is_vmalloc_addr(ptr)) + invalidate_kernel_vmap_range(ptr, size); +} + +static struct wc_memory_superblock *sb(struct dm_writecache *wc) +{ + return wc->memory_map; +} + +static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) +{ + return &sb(wc)->entries[e->index]; +} + +static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) +{ + return (char *)wc->block_start + (e->index << wc->block_size_bits); +} + +static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) +{ + return wc->start_sector + wc->metadata_sectors + + ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); +} + +static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) +{ +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + return e->original_sector; +#else + return le64_to_cpu(memory_entry(wc, e)->original_sector); +#endif +} + +static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) +{ +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + return e->seq_count; +#else + return le64_to_cpu(memory_entry(wc, e)->seq_count); +#endif +} + +static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) +{ +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + e->seq_count = -1; +#endif + pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); +} + +static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, + uint64_t original_sector, uint64_t seq_count) +{ + struct wc_memory_entry me; +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + e->original_sector = original_sector; + e->seq_count = seq_count; +#endif + me.original_sector = cpu_to_le64(original_sector); + me.seq_count = cpu_to_le64(seq_count); + pmem_assign(*memory_entry(wc, e), me); +} + +#define writecache_error(wc, err, msg, arg...) \ +do { \ + if (!cmpxchg(&(wc)->error, 0, err)) \ + DMERR(msg, ##arg); \ + wake_up(&(wc)->freelist_wait); \ +} while (0) + +#define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) + +static void writecache_flush_all_metadata(struct dm_writecache *wc) +{ + if (!WC_MODE_PMEM(wc)) + memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); +} + +static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) +{ + if (!WC_MODE_PMEM(wc)) + __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, + wc->dirty_bitmap); +} + +static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); + +struct io_notify { + struct dm_writecache *wc; + struct completion c; + atomic_t count; +}; + +static void writecache_notify_io(unsigned long error, void *context) +{ + struct io_notify *endio = context; + + if (unlikely(error != 0)) + writecache_error(endio->wc, -EIO, "error writing metadata"); + BUG_ON(atomic_read(&endio->count) <= 0); + if (atomic_dec_and_test(&endio->count)) + complete(&endio->c); +} + +static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) +{ + wait_event(wc->bio_in_progress_wait[direction], + !atomic_read(&wc->bio_in_progress[direction])); +} + +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) +{ + struct dm_io_region region; + struct dm_io_request req; + struct io_notify endio = { + wc, + COMPLETION_INITIALIZER_ONSTACK(endio.c), + ATOMIC_INIT(1), + }; + unsigned int bitmap_bits = wc->dirty_bitmap_size * 8; + unsigned int i = 0; + + while (1) { + unsigned int j; + i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); + if (unlikely(i == bitmap_bits)) + break; + j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); + + region.bdev = wc->ssd_dev->bdev; + region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT); + region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); + + if (unlikely(region.sector >= wc->metadata_sectors)) + break; + if (unlikely(region.sector + region.count > wc->metadata_sectors)) + region.count = wc->metadata_sectors - region.sector; + + region.sector += wc->start_sector; + atomic_inc(&endio.count); + req.bi_opf = REQ_OP_WRITE | REQ_SYNC; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; + req.client = wc->dm_io; + req.notify.fn = writecache_notify_io; + req.notify.context = &endio; + + /* writing via async dm-io (implied by notify.fn above) won't return an error */ + (void) dm_io(&req, 1, ®ion, NULL); + i = j; + } + + writecache_notify_io(0, &endio); + wait_for_completion_io(&endio.c); + + if (wait_for_ios) + writecache_wait_for_ios(wc, WRITE); + + writecache_disk_flush(wc, wc->ssd_dev); + + memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); +} + +static void ssd_commit_superblock(struct dm_writecache *wc) +{ + int r; + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = wc->ssd_dev->bdev; + region.sector = 0; + region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; + + if (unlikely(region.sector + region.count > wc->metadata_sectors)) + region.count = wc->metadata_sectors - region.sector; + + region.sector += wc->start_sector; + + req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map; + req.client = wc->dm_io; + req.notify.fn = NULL; + req.notify.context = NULL; + + r = dm_io(&req, 1, ®ion, NULL); + if (unlikely(r)) + writecache_error(wc, r, "error writing superblock"); +} + +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) +{ + if (WC_MODE_PMEM(wc)) + pmem_wmb(); + else + ssd_commit_flushed(wc, wait_for_ios); +} + +static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) +{ + int r; + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = dev->bdev; + region.sector = 0; + region.count = 0; + req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + req.mem.type = DM_IO_KMEM; + req.mem.ptr.addr = NULL; + req.client = wc->dm_io; + req.notify.fn = NULL; + + r = dm_io(&req, 1, ®ion, NULL); + if (unlikely(r)) + writecache_error(wc, r, "error flushing metadata: %d", r); +} + +#define WFE_RETURN_FOLLOWING 1 +#define WFE_LOWEST_SEQ 2 + +static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, + uint64_t block, int flags) +{ + struct wc_entry *e; + struct rb_node *node = wc->tree.rb_node; + + if (unlikely(!node)) + return NULL; + + while (1) { + e = container_of(node, struct wc_entry, rb_node); + if (read_original_sector(wc, e) == block) + break; + + node = (read_original_sector(wc, e) >= block ? + e->rb_node.rb_left : e->rb_node.rb_right); + if (unlikely(!node)) { + if (!(flags & WFE_RETURN_FOLLOWING)) + return NULL; + if (read_original_sector(wc, e) >= block) { + return e; + } else { + node = rb_next(&e->rb_node); + if (unlikely(!node)) + return NULL; + e = container_of(node, struct wc_entry, rb_node); + return e; + } + } + } + + while (1) { + struct wc_entry *e2; + if (flags & WFE_LOWEST_SEQ) + node = rb_prev(&e->rb_node); + else + node = rb_next(&e->rb_node); + if (unlikely(!node)) + return e; + e2 = container_of(node, struct wc_entry, rb_node); + if (read_original_sector(wc, e2) != block) + return e; + e = e2; + } +} + +static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) +{ + struct wc_entry *e; + struct rb_node **node = &wc->tree.rb_node, *parent = NULL; + + while (*node) { + e = container_of(*node, struct wc_entry, rb_node); + parent = &e->rb_node; + if (read_original_sector(wc, e) > read_original_sector(wc, ins)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + rb_link_node(&ins->rb_node, parent, node); + rb_insert_color(&ins->rb_node, &wc->tree); + list_add(&ins->lru, &wc->lru); + ins->age = jiffies; +} + +static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) +{ + list_del(&e->lru); + rb_erase(&e->rb_node, &wc->tree); +} + +static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) +{ + if (WC_MODE_SORT_FREELIST(wc)) { + struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; + if (unlikely(!*node)) + wc->current_free = e; + while (*node) { + parent = *node; + if (&e->rb_node < *node) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + rb_link_node(&e->rb_node, parent, node); + rb_insert_color(&e->rb_node, &wc->freetree); + } else { + list_add_tail(&e->lru, &wc->freelist); + } + wc->freelist_size++; +} + +static inline void writecache_verify_watermark(struct dm_writecache *wc) +{ + if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) + queue_work(wc->writeback_wq, &wc->writeback_work); +} + +static void writecache_max_age_timer(struct timer_list *t) +{ + struct dm_writecache *wc = from_timer(wc, t, max_age_timer); + + if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { + queue_work(wc->writeback_wq, &wc->writeback_work); + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + } +} + +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) +{ + struct wc_entry *e; + + if (WC_MODE_SORT_FREELIST(wc)) { + struct rb_node *next; + if (unlikely(!wc->current_free)) + return NULL; + e = wc->current_free; + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; + next = rb_next(&e->rb_node); + rb_erase(&e->rb_node, &wc->freetree); + if (unlikely(!next)) + next = rb_first(&wc->freetree); + wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL; + } else { + if (unlikely(list_empty(&wc->freelist))) + return NULL; + e = container_of(wc->freelist.next, struct wc_entry, lru); + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; + list_del(&e->lru); + } + wc->freelist_size--; + + writecache_verify_watermark(wc); + + return e; +} + +static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) +{ + writecache_unlink(wc, e); + writecache_add_to_freelist(wc, e); + clear_seq_count(wc, e); + writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); + if (unlikely(waitqueue_active(&wc->freelist_wait))) + wake_up(&wc->freelist_wait); +} + +static void writecache_wait_on_freelist(struct dm_writecache *wc) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); + wc_unlock(wc); + io_schedule(); + finish_wait(&wc->freelist_wait, &wait); + wc_lock(wc); +} + +static void writecache_poison_lists(struct dm_writecache *wc) +{ + /* + * Catch incorrect access to these values while the device is suspended. + */ + memset(&wc->tree, -1, sizeof wc->tree); + wc->lru.next = LIST_POISON1; + wc->lru.prev = LIST_POISON2; + wc->freelist.next = LIST_POISON1; + wc->freelist.prev = LIST_POISON2; +} + +static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) +{ + writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); + if (WC_MODE_PMEM(wc)) + writecache_flush_region(wc, memory_data(wc, e), wc->block_size); +} + +static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) +{ + return read_seq_count(wc, e) < wc->seq_count; +} + +static void writecache_flush(struct dm_writecache *wc) +{ + struct wc_entry *e, *e2; + bool need_flush_after_free; + + wc->uncommitted_blocks = 0; + del_timer(&wc->autocommit_timer); + + if (list_empty(&wc->lru)) + return; + + e = container_of(wc->lru.next, struct wc_entry, lru); + if (writecache_entry_is_committed(wc, e)) { + if (wc->overwrote_committed) { + writecache_wait_for_ios(wc, WRITE); + writecache_disk_flush(wc, wc->ssd_dev); + wc->overwrote_committed = false; + } + return; + } + while (1) { + writecache_flush_entry(wc, e); + if (unlikely(e->lru.next == &wc->lru)) + break; + e2 = container_of(e->lru.next, struct wc_entry, lru); + if (writecache_entry_is_committed(wc, e2)) + break; + e = e2; + cond_resched(); + } + writecache_commit_flushed(wc, true); + + wc->seq_count++; + pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); + if (WC_MODE_PMEM(wc)) + writecache_commit_flushed(wc, false); + else + ssd_commit_superblock(wc); + + wc->overwrote_committed = false; + + need_flush_after_free = false; + while (1) { + /* Free another committed entry with lower seq-count */ + struct rb_node *rb_node = rb_prev(&e->rb_node); + + if (rb_node) { + e2 = container_of(rb_node, struct wc_entry, rb_node); + if (read_original_sector(wc, e2) == read_original_sector(wc, e) && + likely(!e2->write_in_progress)) { + writecache_free_entry(wc, e2); + need_flush_after_free = true; + } + } + if (unlikely(e->lru.prev == &wc->lru)) + break; + e = container_of(e->lru.prev, struct wc_entry, lru); + cond_resched(); + } + + if (need_flush_after_free) + writecache_commit_flushed(wc, false); +} + +static void writecache_flush_work(struct work_struct *work) +{ + struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); + + wc_lock(wc); + writecache_flush(wc); + wc_unlock(wc); +} + +static void writecache_autocommit_timer(struct timer_list *t) +{ + struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); + if (!writecache_has_error(wc)) + queue_work(wc->writeback_wq, &wc->flush_work); +} + +static void writecache_schedule_autocommit(struct dm_writecache *wc) +{ + if (!timer_pending(&wc->autocommit_timer)) + mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); +} + +static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) +{ + struct wc_entry *e; + bool discarded_something = false; + + e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); + if (unlikely(!e)) + return; + + while (read_original_sector(wc, e) < end) { + struct rb_node *node = rb_next(&e->rb_node); + + if (likely(!e->write_in_progress)) { + if (!discarded_something) { + if (!WC_MODE_PMEM(wc)) { + writecache_wait_for_ios(wc, READ); + writecache_wait_for_ios(wc, WRITE); + } + discarded_something = true; + } + if (!writecache_entry_is_committed(wc, e)) + wc->uncommitted_blocks--; + writecache_free_entry(wc, e); + } + + if (unlikely(!node)) + break; + + e = container_of(node, struct wc_entry, rb_node); + } + + if (discarded_something) + writecache_commit_flushed(wc, false); +} + +static bool writecache_wait_for_writeback(struct dm_writecache *wc) +{ + if (wc->writeback_size) { + writecache_wait_on_freelist(wc); + return true; + } + return false; +} + +static void writecache_suspend(struct dm_target *ti) +{ + struct dm_writecache *wc = ti->private; + bool flush_on_suspend; + + del_timer_sync(&wc->autocommit_timer); + del_timer_sync(&wc->max_age_timer); + + wc_lock(wc); + writecache_flush(wc); + flush_on_suspend = wc->flush_on_suspend; + if (flush_on_suspend) { + wc->flush_on_suspend = false; + wc->writeback_all++; + queue_work(wc->writeback_wq, &wc->writeback_work); + } + wc_unlock(wc); + + drain_workqueue(wc->writeback_wq); + + wc_lock(wc); + if (flush_on_suspend) + wc->writeback_all--; + while (writecache_wait_for_writeback(wc)); + + if (WC_MODE_PMEM(wc)) + persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); + + writecache_poison_lists(wc); + + wc_unlock(wc); +} + +static int writecache_alloc_entries(struct dm_writecache *wc) +{ + size_t b; + + if (wc->entries) + return 0; + wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); + if (!wc->entries) + return -ENOMEM; + for (b = 0; b < wc->n_blocks; b++) { + struct wc_entry *e = &wc->entries[b]; + e->index = b; + e->write_in_progress = false; + cond_resched(); + } + + return 0; +} + +static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) +{ + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = wc->ssd_dev->bdev; + region.sector = wc->start_sector; + region.count = n_sectors; + req.bi_opf = REQ_OP_READ | REQ_SYNC; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map; + req.client = wc->dm_io; + req.notify.fn = NULL; + + return dm_io(&req, 1, ®ion, NULL); +} + +static void writecache_resume(struct dm_target *ti) +{ + struct dm_writecache *wc = ti->private; + size_t b; + bool need_flush = false; + __le64 sb_seq_count; + int r; + + wc_lock(wc); + + wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); + + if (WC_MODE_PMEM(wc)) { + persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); + } else { + r = writecache_read_metadata(wc, wc->metadata_sectors); + if (r) { + size_t sb_entries_offset; + writecache_error(wc, r, "unable to read metadata: %d", r); + sb_entries_offset = offsetof(struct wc_memory_superblock, entries); + memset((char *)wc->memory_map + sb_entries_offset, -1, + (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); + } + } + + wc->tree = RB_ROOT; + INIT_LIST_HEAD(&wc->lru); + if (WC_MODE_SORT_FREELIST(wc)) { + wc->freetree = RB_ROOT; + wc->current_free = NULL; + } else { + INIT_LIST_HEAD(&wc->freelist); + } + wc->freelist_size = 0; + + r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, + sizeof(uint64_t)); + if (r) { + writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); + sb_seq_count = cpu_to_le64(0); + } + wc->seq_count = le64_to_cpu(sb_seq_count); + +#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS + for (b = 0; b < wc->n_blocks; b++) { + struct wc_entry *e = &wc->entries[b]; + struct wc_memory_entry wme; + if (writecache_has_error(wc)) { + e->original_sector = -1; + e->seq_count = -1; + continue; + } + r = copy_mc_to_kernel(&wme, memory_entry(wc, e), + sizeof(struct wc_memory_entry)); + if (r) { + writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", + (unsigned long)b, r); + e->original_sector = -1; + e->seq_count = -1; + } else { + e->original_sector = le64_to_cpu(wme.original_sector); + e->seq_count = le64_to_cpu(wme.seq_count); + } + cond_resched(); + } +#endif + for (b = 0; b < wc->n_blocks; b++) { + struct wc_entry *e = &wc->entries[b]; + if (!writecache_entry_is_committed(wc, e)) { + if (read_seq_count(wc, e) != -1) { +erase_this: + clear_seq_count(wc, e); + need_flush = true; + } + writecache_add_to_freelist(wc, e); + } else { + struct wc_entry *old; + + old = writecache_find_entry(wc, read_original_sector(wc, e), 0); + if (!old) { + writecache_insert_entry(wc, e); + } else { + if (read_seq_count(wc, old) == read_seq_count(wc, e)) { + writecache_error(wc, -EINVAL, + "two identical entries, position %llu, sector %llu, sequence %llu", + (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), + (unsigned long long)read_seq_count(wc, e)); + } + if (read_seq_count(wc, old) > read_seq_count(wc, e)) { + goto erase_this; + } else { + writecache_free_entry(wc, old); + writecache_insert_entry(wc, e); + need_flush = true; + } + } + } + cond_resched(); + } + + if (need_flush) { + writecache_flush_all_metadata(wc); + writecache_commit_flushed(wc, false); + } + + writecache_verify_watermark(wc); + + if (wc->max_age != MAX_AGE_UNSPECIFIED) + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + + wc_unlock(wc); +} + +static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + if (dm_suspended(wc->ti)) { + wc_unlock(wc); + return -EBUSY; + } + if (writecache_has_error(wc)) { + wc_unlock(wc); + return -EIO; + } + + writecache_flush(wc); + wc->writeback_all++; + queue_work(wc->writeback_wq, &wc->writeback_work); + wc_unlock(wc); + + flush_workqueue(wc->writeback_wq); + + wc_lock(wc); + wc->writeback_all--; + if (writecache_has_error(wc)) { + wc_unlock(wc); + return -EIO; + } + wc_unlock(wc); + + return 0; +} + +static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + wc->flush_on_suspend = true; + wc_unlock(wc); + + return 0; +} + +static void activate_cleaner(struct dm_writecache *wc) +{ + wc->flush_on_suspend = true; + wc->cleaner = true; + wc->freelist_high_watermark = wc->n_blocks; + wc->freelist_low_watermark = wc->n_blocks; +} + +static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + activate_cleaner(wc); + if (!dm_suspended(wc->ti)) + writecache_verify_watermark(wc); + wc_unlock(wc); + + return 0; +} + +static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + memset(&wc->stats, 0, sizeof wc->stats); + wc_unlock(wc); + + return 0; +} + +static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + int r = -EINVAL; + struct dm_writecache *wc = ti->private; + + if (!strcasecmp(argv[0], "flush")) + r = process_flush_mesg(argc, argv, wc); + else if (!strcasecmp(argv[0], "flush_on_suspend")) + r = process_flush_on_suspend_mesg(argc, argv, wc); + else if (!strcasecmp(argv[0], "cleaner")) + r = process_cleaner_mesg(argc, argv, wc); + else if (!strcasecmp(argv[0], "clear_stats")) + r = process_clear_stats_mesg(argc, argv, wc); + else + DMERR("unrecognised message received: %s", argv[0]); + + return r; +} + +static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) +{ + /* + * clflushopt performs better with block size 1024, 2048, 4096 + * non-temporal stores perform better with block size 512 + * + * block size 512 1024 2048 4096 + * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s + * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s + * + * We see that movnti performs better for 512-byte blocks, and + * clflushopt performs better for 1024-byte and larger blocks. So, we + * prefer clflushopt for sizes >= 768. + * + * NOTE: this happens to be the case now (with dm-writecache's single + * threaded model) but re-evaluate this once memcpy_flushcache() is + * enabled to use movdir64b which might invalidate this performance + * advantage seen with cache-allocating-writes plus flushing. + */ +#ifdef CONFIG_X86 + if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && + likely(boot_cpu_data.x86_clflush_size == 64) && + likely(size >= 768)) { + do { + memcpy((void *)dest, (void *)source, 64); + clflushopt((void *)dest); + dest += 64; + source += 64; + size -= 64; + } while (size >= 64); + return; + } +#endif + memcpy_flushcache(dest, source, size); +} + +static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) +{ + void *buf; + unsigned int size; + int rw = bio_data_dir(bio); + unsigned int remaining_size = wc->block_size; + + do { + struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); + buf = bvec_kmap_local(&bv); + size = bv.bv_len; + if (unlikely(size > remaining_size)) + size = remaining_size; + + if (rw == READ) { + int r; + r = copy_mc_to_kernel(buf, data, size); + flush_dcache_page(bio_page(bio)); + if (unlikely(r)) { + writecache_error(wc, r, "hardware memory error when reading data: %d", r); + bio->bi_status = BLK_STS_IOERR; + } + } else { + flush_dcache_page(bio_page(bio)); + memcpy_flushcache_optimized(data, buf, size); + } + + kunmap_local(buf); + + data = (char *)data + size; + remaining_size -= size; + bio_advance(bio, size); + } while (unlikely(remaining_size)); +} + +static int writecache_flush_thread(void *data) +{ + struct dm_writecache *wc = data; + + while (1) { + struct bio *bio; + + wc_lock(wc); + bio = bio_list_pop(&wc->flush_list); + if (!bio) { + set_current_state(TASK_INTERRUPTIBLE); + wc_unlock(wc); + + if (unlikely(kthread_should_stop())) { + set_current_state(TASK_RUNNING); + break; + } + + schedule(); + continue; + } + + if (bio_op(bio) == REQ_OP_DISCARD) { + writecache_discard(wc, bio->bi_iter.bi_sector, + bio_end_sector(bio)); + wc_unlock(wc); + bio_set_dev(bio, wc->dev->bdev); + submit_bio_noacct(bio); + } else { + writecache_flush(wc); + wc_unlock(wc); + if (writecache_has_error(wc)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + } + + return 0; +} + +static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) +{ + if (bio_list_empty(&wc->flush_list)) + wake_up_process(wc->flush_thread); + bio_list_add(&wc->flush_list, bio); +} + +enum wc_map_op { + WC_MAP_SUBMIT, + WC_MAP_REMAP, + WC_MAP_REMAP_ORIGIN, + WC_MAP_RETURN, + WC_MAP_ERROR, +}; + +static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e) +{ + if (e) { + sector_t next_boundary = + read_original_sector(wc, e) - bio->bi_iter.bi_sector; + if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) + dm_accept_partial_bio(bio, next_boundary); + } +} + +static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) +{ + enum wc_map_op map_op; + struct wc_entry *e; + +read_next_block: + wc->stats.reads++; + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); + if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { + wc->stats.read_hits++; + if (WC_MODE_PMEM(wc)) { + bio_copy_block(wc, bio, memory_data(wc, e)); + if (bio->bi_iter.bi_size) + goto read_next_block; + map_op = WC_MAP_SUBMIT; + } else { + dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); + bio_set_dev(bio, wc->ssd_dev->bdev); + bio->bi_iter.bi_sector = cache_sector(wc, e); + if (!writecache_entry_is_committed(wc, e)) + writecache_wait_for_ios(wc, WRITE); + map_op = WC_MAP_REMAP; + } + } else { + writecache_map_remap_origin(wc, bio, e); + wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + map_op = WC_MAP_REMAP_ORIGIN; + } + + return map_op; +} + +static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e, bool search_used) +{ + unsigned int bio_size = wc->block_size; + sector_t start_cache_sec = cache_sector(wc, e); + sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); + + while (bio_size < bio->bi_iter.bi_size) { + if (!search_used) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + } else { + struct wc_entry *f; + struct rb_node *next = rb_next(&e->rb_node); + if (!next) + break; + f = container_of(next, struct wc_entry, rb_node); + if (f != e + 1) + break; + if (read_original_sector(wc, f) != + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (unlikely(f->write_in_progress)) + break; + if (writecache_entry_is_committed(wc, f)) + wc->overwrote_committed = true; + e = f; + } + bio_size += wc->block_size; + current_cache_sec += wc->block_size >> SECTOR_SHIFT; + } + + bio_set_dev(bio, wc->ssd_dev->bdev); + bio->bi_iter.bi_sector = start_cache_sec; + dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); + + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { + wc->uncommitted_blocks = 0; + queue_work(wc->writeback_wq, &wc->flush_work); + } else { + writecache_schedule_autocommit(wc); + } +} + +static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) +{ + struct wc_entry *e; + + do { + bool found_entry = false; + bool search_used = false; + if (writecache_has_error(wc)) { + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + return WC_MAP_ERROR; + } + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); + if (e) { + if (!writecache_entry_is_committed(wc, e)) { + wc->stats.write_hits_uncommitted++; + search_used = true; + goto bio_copy; + } + wc->stats.write_hits_committed++; + if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { + wc->overwrote_committed = true; + search_used = true; + goto bio_copy; + } + found_entry = true; + } else { + if (unlikely(wc->cleaner) || + (wc->metadata_only && !(bio->bi_opf & REQ_META))) + goto direct_write; + } + e = writecache_pop_from_freelist(wc, (sector_t)-1); + if (unlikely(!e)) { + if (!WC_MODE_PMEM(wc) && !found_entry) { +direct_write: + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); + writecache_map_remap_origin(wc, bio, e); + wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + return WC_MAP_REMAP_ORIGIN; + } + wc->stats.writes_blocked_on_freelist++; + writecache_wait_on_freelist(wc); + continue; + } + write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); + writecache_insert_entry(wc, e); + wc->uncommitted_blocks++; + wc->stats.writes_allocate++; +bio_copy: + if (WC_MODE_PMEM(wc)) { + bio_copy_block(wc, bio, memory_data(wc, e)); + wc->stats.writes++; + } else { + writecache_bio_copy_ssd(wc, bio, e, search_used); + return WC_MAP_REMAP; + } + } while (bio->bi_iter.bi_size); + + if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) + writecache_flush(wc); + else + writecache_schedule_autocommit(wc); + + return WC_MAP_SUBMIT; +} + +static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio) +{ + if (writecache_has_error(wc)) + return WC_MAP_ERROR; + + if (WC_MODE_PMEM(wc)) { + wc->stats.flushes++; + writecache_flush(wc); + if (writecache_has_error(wc)) + return WC_MAP_ERROR; + else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) + return WC_MAP_REMAP_ORIGIN; + return WC_MAP_SUBMIT; + } + /* SSD: */ + if (dm_bio_get_target_bio_nr(bio)) + return WC_MAP_REMAP_ORIGIN; + wc->stats.flushes++; + writecache_offload_bio(wc, bio); + return WC_MAP_RETURN; +} + +static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) +{ + wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; + + if (writecache_has_error(wc)) + return WC_MAP_ERROR; + + if (WC_MODE_PMEM(wc)) { + writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); + return WC_MAP_REMAP_ORIGIN; + } + /* SSD: */ + writecache_offload_bio(wc, bio); + return WC_MAP_RETURN; +} + +static int writecache_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_writecache *wc = ti->private; + enum wc_map_op map_op; + + bio->bi_private = NULL; + + wc_lock(wc); + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + map_op = writecache_map_flush(wc, bio); + goto done; + } + + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) & + (wc->block_size / 512 - 1)) != 0)) { + DMERR("I/O is not aligned, sector %llu, size %u, block size %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, wc->block_size); + map_op = WC_MAP_ERROR; + goto done; + } + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + map_op = writecache_map_discard(wc, bio); + goto done; + } + + if (bio_data_dir(bio) == READ) + map_op = writecache_map_read(wc, bio); + else + map_op = writecache_map_write(wc, bio); +done: + switch (map_op) { + case WC_MAP_REMAP_ORIGIN: + if (likely(wc->pause != 0)) { + if (bio_op(bio) == REQ_OP_WRITE) { + dm_iot_io_begin(&wc->iot, 1); + bio->bi_private = (void *)2; + } + } + bio_set_dev(bio, wc->dev->bdev); + wc_unlock(wc); + return DM_MAPIO_REMAPPED; + + case WC_MAP_REMAP: + /* make sure that writecache_end_io decrements bio_in_progress: */ + bio->bi_private = (void *)1; + atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); + wc_unlock(wc); + return DM_MAPIO_REMAPPED; + + case WC_MAP_SUBMIT: + wc_unlock(wc); + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + + case WC_MAP_RETURN: + wc_unlock(wc); + return DM_MAPIO_SUBMITTED; + + case WC_MAP_ERROR: + wc_unlock(wc); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + + default: + BUG(); + wc_unlock(wc); + return DM_MAPIO_KILL; + } +} + +static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) +{ + struct dm_writecache *wc = ti->private; + + if (bio->bi_private == (void *)1) { + int dir = bio_data_dir(bio); + if (atomic_dec_and_test(&wc->bio_in_progress[dir])) + if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) + wake_up(&wc->bio_in_progress_wait[dir]); + } else if (bio->bi_private == (void *)2) { + dm_iot_io_end(&wc->iot, 1); + } + return 0; +} + +static int writecache_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_writecache *wc = ti->private; + + return fn(ti, wc->dev, 0, ti->len, data); +} + +static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_writecache *wc = ti->private; + + if (limits->logical_block_size < wc->block_size) + limits->logical_block_size = wc->block_size; + + if (limits->physical_block_size < wc->block_size) + limits->physical_block_size = wc->block_size; + + if (limits->io_min < wc->block_size) + limits->io_min = wc->block_size; +} + + +static void writecache_writeback_endio(struct bio *bio) +{ + struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); + struct dm_writecache *wc = wb->wc; + unsigned long flags; + + raw_spin_lock_irqsave(&wc->endio_list_lock, flags); + if (unlikely(list_empty(&wc->endio_list))) + wake_up_process(wc->endio_thread); + list_add_tail(&wb->endio_entry, &wc->endio_list); + raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); +} + +static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) +{ + struct copy_struct *c = ptr; + struct dm_writecache *wc = c->wc; + + c->error = likely(!(read_err | write_err)) ? 0 : -EIO; + + raw_spin_lock_irq(&wc->endio_list_lock); + if (unlikely(list_empty(&wc->endio_list))) + wake_up_process(wc->endio_thread); + list_add_tail(&c->endio_entry, &wc->endio_list); + raw_spin_unlock_irq(&wc->endio_list_lock); +} + +static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) +{ + unsigned int i; + struct writeback_struct *wb; + struct wc_entry *e; + unsigned long n_walked = 0; + + do { + wb = list_entry(list->next, struct writeback_struct, endio_entry); + list_del(&wb->endio_entry); + + if (unlikely(wb->bio.bi_status != BLK_STS_OK)) + writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), + "write error %d", wb->bio.bi_status); + i = 0; + do { + e = wb->wc_list[i]; + BUG_ON(!e->write_in_progress); + e->write_in_progress = false; + INIT_LIST_HEAD(&e->lru); + if (!writecache_has_error(wc)) + writecache_free_entry(wc, e); + BUG_ON(!wc->writeback_size); + wc->writeback_size--; + n_walked++; + if (unlikely(n_walked >= ENDIO_LATENCY)) { + writecache_commit_flushed(wc, false); + wc_unlock(wc); + wc_lock(wc); + n_walked = 0; + } + } while (++i < wb->wc_list_n); + + if (wb->wc_list != wb->wc_list_inline) + kfree(wb->wc_list); + bio_put(&wb->bio); + } while (!list_empty(list)); +} + +static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) +{ + struct copy_struct *c; + struct wc_entry *e; + + do { + c = list_entry(list->next, struct copy_struct, endio_entry); + list_del(&c->endio_entry); + + if (unlikely(c->error)) + writecache_error(wc, c->error, "copy error"); + + e = c->e; + do { + BUG_ON(!e->write_in_progress); + e->write_in_progress = false; + INIT_LIST_HEAD(&e->lru); + if (!writecache_has_error(wc)) + writecache_free_entry(wc, e); + + BUG_ON(!wc->writeback_size); + wc->writeback_size--; + e++; + } while (--c->n_entries); + mempool_free(c, &wc->copy_pool); + } while (!list_empty(list)); +} + +static int writecache_endio_thread(void *data) +{ + struct dm_writecache *wc = data; + + while (1) { + struct list_head list; + + raw_spin_lock_irq(&wc->endio_list_lock); + if (!list_empty(&wc->endio_list)) + goto pop_from_list; + set_current_state(TASK_INTERRUPTIBLE); + raw_spin_unlock_irq(&wc->endio_list_lock); + + if (unlikely(kthread_should_stop())) { + set_current_state(TASK_RUNNING); + break; + } + + schedule(); + + continue; + +pop_from_list: + list = wc->endio_list; + list.next->prev = list.prev->next = &list; + INIT_LIST_HEAD(&wc->endio_list); + raw_spin_unlock_irq(&wc->endio_list_lock); + + if (!WC_MODE_FUA(wc)) + writecache_disk_flush(wc, wc->dev); + + wc_lock(wc); + + if (WC_MODE_PMEM(wc)) { + __writecache_endio_pmem(wc, &list); + } else { + __writecache_endio_ssd(wc, &list); + writecache_wait_for_ios(wc, READ); + } + + writecache_commit_flushed(wc, false); + + wc_unlock(wc); + } + + return 0; +} + +static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) +{ + struct dm_writecache *wc = wb->wc; + unsigned int block_size = wc->block_size; + void *address = memory_data(wc, e); + + persistent_memory_flush_cache(address, block_size); + + if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) + return true; + + return bio_add_page(&wb->bio, persistent_memory_page(address), + block_size, persistent_memory_page_offset(address)) != 0; +} + +struct writeback_list { + struct list_head list; + size_t size; +}; + +static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) +{ + if (unlikely(wc->max_writeback_jobs)) { + if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { + wc_lock(wc); + while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) + writecache_wait_on_freelist(wc); + wc_unlock(wc); + } + } + cond_resched(); +} + +static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) +{ + struct wc_entry *e, *f; + struct bio *bio; + struct writeback_struct *wb; + unsigned int max_pages; + + while (wbl->size) { + wbl->size--; + e = container_of(wbl->list.prev, struct wc_entry, lru); + list_del(&e->lru); + + max_pages = e->wc_list_contiguous; + + bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE, + GFP_NOIO, &wc->bio_set); + wb = container_of(bio, struct writeback_struct, bio); + wb->wc = wc; + bio->bi_end_io = writecache_writeback_endio; + bio->bi_iter.bi_sector = read_original_sector(wc, e); + if (max_pages <= WB_LIST_INLINE || + unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), + GFP_NOIO | __GFP_NORETRY | + __GFP_NOMEMALLOC | __GFP_NOWARN)))) { + wb->wc_list = wb->wc_list_inline; + max_pages = WB_LIST_INLINE; + } + + BUG_ON(!wc_add_block(wb, e)); + + wb->wc_list[0] = e; + wb->wc_list_n = 1; + + while (wbl->size && wb->wc_list_n < max_pages) { + f = container_of(wbl->list.prev, struct wc_entry, lru); + if (read_original_sector(wc, f) != + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (!wc_add_block(wb, f)) + break; + wbl->size--; + list_del(&f->lru); + wb->wc_list[wb->wc_list_n++] = f; + e = f; + } + if (WC_MODE_FUA(wc)) + bio->bi_opf |= REQ_FUA; + if (writecache_has_error(wc)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } else if (unlikely(!bio_sectors(bio))) { + bio->bi_status = BLK_STS_OK; + bio_endio(bio); + } else { + submit_bio(bio); + } + + __writeback_throttle(wc, wbl); + } +} + +static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) +{ + struct wc_entry *e, *f; + struct dm_io_region from, to; + struct copy_struct *c; + + while (wbl->size) { + unsigned int n_sectors; + + wbl->size--; + e = container_of(wbl->list.prev, struct wc_entry, lru); + list_del(&e->lru); + + n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); + + from.bdev = wc->ssd_dev->bdev; + from.sector = cache_sector(wc, e); + from.count = n_sectors; + to.bdev = wc->dev->bdev; + to.sector = read_original_sector(wc, e); + to.count = n_sectors; + + c = mempool_alloc(&wc->copy_pool, GFP_NOIO); + c->wc = wc; + c->e = e; + c->n_entries = e->wc_list_contiguous; + + while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { + wbl->size--; + f = container_of(wbl->list.prev, struct wc_entry, lru); + BUG_ON(f != e + 1); + list_del(&f->lru); + e = f; + } + + if (unlikely(to.sector + to.count > wc->data_device_sectors)) { + if (to.sector >= wc->data_device_sectors) { + writecache_copy_endio(0, 0, c); + continue; + } + from.count = to.count = wc->data_device_sectors - to.sector; + } + + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); + + __writeback_throttle(wc, wbl); + } +} + +static void writecache_writeback(struct work_struct *work) +{ + struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); + struct blk_plug plug; + struct wc_entry *f, *g, *e = NULL; + struct rb_node *node, *next_node; + struct list_head skipped; + struct writeback_list wbl; + unsigned long n_walked; + + if (!WC_MODE_PMEM(wc)) { + /* Wait for any active kcopyd work on behalf of ssd writeback */ + dm_kcopyd_client_flush(wc->dm_kcopyd); + } + + if (likely(wc->pause != 0)) { + while (1) { + unsigned long idle; + if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || + unlikely(dm_suspended(wc->ti))) + break; + idle = dm_iot_idle_time(&wc->iot); + if (idle >= wc->pause) + break; + idle = wc->pause - idle; + if (idle > HZ) + idle = HZ; + schedule_timeout_idle(idle); + } + } + + wc_lock(wc); +restart: + if (writecache_has_error(wc)) { + wc_unlock(wc); + return; + } + + if (unlikely(wc->writeback_all)) { + if (writecache_wait_for_writeback(wc)) + goto restart; + } + + if (wc->overwrote_committed) { + writecache_wait_for_ios(wc, WRITE); + } + + n_walked = 0; + INIT_LIST_HEAD(&skipped); + INIT_LIST_HEAD(&wbl.list); + wbl.size = 0; + while (!list_empty(&wc->lru) && + (wc->writeback_all || + wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || + (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= + wc->max_age - wc->max_age / MAX_AGE_DIV))) { + + n_walked++; + if (unlikely(n_walked > WRITEBACK_LATENCY) && + likely(!wc->writeback_all)) { + if (likely(!dm_suspended(wc->ti))) + queue_work(wc->writeback_wq, &wc->writeback_work); + break; + } + + if (unlikely(wc->writeback_all)) { + if (unlikely(!e)) { + writecache_flush(wc); + e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); + } else + e = g; + } else + e = container_of(wc->lru.prev, struct wc_entry, lru); + BUG_ON(e->write_in_progress); + if (unlikely(!writecache_entry_is_committed(wc, e))) { + writecache_flush(wc); + } + node = rb_prev(&e->rb_node); + if (node) { + f = container_of(node, struct wc_entry, rb_node); + if (unlikely(read_original_sector(wc, f) == + read_original_sector(wc, e))) { + BUG_ON(!f->write_in_progress); + list_move(&e->lru, &skipped); + cond_resched(); + continue; + } + } + wc->writeback_size++; + list_move(&e->lru, &wbl.list); + wbl.size++; + e->write_in_progress = true; + e->wc_list_contiguous = 1; + + f = e; + + while (1) { + next_node = rb_next(&f->rb_node); + if (unlikely(!next_node)) + break; + g = container_of(next_node, struct wc_entry, rb_node); + if (unlikely(read_original_sector(wc, g) == + read_original_sector(wc, f))) { + f = g; + continue; + } + if (read_original_sector(wc, g) != + read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (unlikely(g->write_in_progress)) + break; + if (unlikely(!writecache_entry_is_committed(wc, g))) + break; + + if (!WC_MODE_PMEM(wc)) { + if (g != f + 1) + break; + } + + n_walked++; + //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) + // break; + + wc->writeback_size++; + list_move(&g->lru, &wbl.list); + wbl.size++; + g->write_in_progress = true; + g->wc_list_contiguous = BIO_MAX_VECS; + f = g; + e->wc_list_contiguous++; + if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { + if (unlikely(wc->writeback_all)) { + next_node = rb_next(&f->rb_node); + if (likely(next_node)) + g = container_of(next_node, struct wc_entry, rb_node); + } + break; + } + } + cond_resched(); + } + + if (!list_empty(&skipped)) { + list_splice_tail(&skipped, &wc->lru); + /* + * If we didn't do any progress, we must wait until some + * writeback finishes to avoid burning CPU in a loop + */ + if (unlikely(!wbl.size)) + writecache_wait_for_writeback(wc); + } + + wc_unlock(wc); + + blk_start_plug(&plug); + + if (WC_MODE_PMEM(wc)) + __writecache_writeback_pmem(wc, &wbl); + else + __writecache_writeback_ssd(wc, &wbl); + + blk_finish_plug(&plug); + + if (unlikely(wc->writeback_all)) { + wc_lock(wc); + while (writecache_wait_for_writeback(wc)); + wc_unlock(wc); + } +} + +static int calculate_memory_size(uint64_t device_size, unsigned int block_size, + size_t *n_blocks_p, size_t *n_metadata_blocks_p) +{ + uint64_t n_blocks, offset; + struct wc_entry e; + + n_blocks = device_size; + do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); + + while (1) { + if (!n_blocks) + return -ENOSPC; + /* Verify the following entries[n_blocks] won't overflow */ + if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / + sizeof(struct wc_memory_entry))) + return -EFBIG; + offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); + offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); + if (offset + n_blocks * block_size <= device_size) + break; + n_blocks--; + } + + /* check if the bit field overflows */ + e.index = n_blocks; + if (e.index != n_blocks) + return -EFBIG; + + if (n_blocks_p) + *n_blocks_p = n_blocks; + if (n_metadata_blocks_p) + *n_metadata_blocks_p = offset >> __ffs(block_size); + return 0; +} + +static int init_memory(struct dm_writecache *wc) +{ + size_t b; + int r; + + r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); + if (r) + return r; + + r = writecache_alloc_entries(wc); + if (r) + return r; + + for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) + pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); + pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); + pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); + pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); + + for (b = 0; b < wc->n_blocks; b++) { + write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); + cond_resched(); + } + + writecache_flush_all_metadata(wc); + writecache_commit_flushed(wc, false); + pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); + writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); + writecache_commit_flushed(wc, false); + + return 0; +} + +static void writecache_dtr(struct dm_target *ti) +{ + struct dm_writecache *wc = ti->private; + + if (!wc) + return; + + if (wc->endio_thread) + kthread_stop(wc->endio_thread); + + if (wc->flush_thread) + kthread_stop(wc->flush_thread); + + bioset_exit(&wc->bio_set); + + mempool_exit(&wc->copy_pool); + + if (wc->writeback_wq) + destroy_workqueue(wc->writeback_wq); + + if (wc->dev) + dm_put_device(ti, wc->dev); + + if (wc->ssd_dev) + dm_put_device(ti, wc->ssd_dev); + + vfree(wc->entries); + + if (wc->memory_map) { + if (WC_MODE_PMEM(wc)) + persistent_memory_release(wc); + else + vfree(wc->memory_map); + } + + if (wc->dm_kcopyd) + dm_kcopyd_client_destroy(wc->dm_kcopyd); + + if (wc->dm_io) + dm_io_client_destroy(wc->dm_io); + + vfree(wc->dirty_bitmap); + + kfree(wc); +} + +static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_writecache *wc; + struct dm_arg_set as; + const char *string; + unsigned int opt_params; + size_t offset, data_size; + int i, r; + char dummy; + int high_wm_percent = HIGH_WATERMARK; + int low_wm_percent = LOW_WATERMARK; + uint64_t x; + struct wc_memory_superblock s; + + static struct dm_arg _args[] = { + {0, 18, "Invalid number of feature args"}, + }; + + as.argc = argc; + as.argv = argv; + + wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL); + if (!wc) { + ti->error = "Cannot allocate writecache structure"; + r = -ENOMEM; + goto bad; + } + ti->private = wc; + wc->ti = ti; + + mutex_init(&wc->lock); + wc->max_age = MAX_AGE_UNSPECIFIED; + writecache_poison_lists(wc); + init_waitqueue_head(&wc->freelist_wait); + timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); + timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); + + for (i = 0; i < 2; i++) { + atomic_set(&wc->bio_in_progress[i], 0); + init_waitqueue_head(&wc->bio_in_progress_wait[i]); + } + + wc->dm_io = dm_io_client_create(); + if (IS_ERR(wc->dm_io)) { + r = PTR_ERR(wc->dm_io); + ti->error = "Unable to allocate dm-io client"; + wc->dm_io = NULL; + goto bad; + } + + wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); + if (!wc->writeback_wq) { + r = -ENOMEM; + ti->error = "Could not allocate writeback workqueue"; + goto bad; + } + INIT_WORK(&wc->writeback_work, writecache_writeback); + INIT_WORK(&wc->flush_work, writecache_flush_work); + + dm_iot_init(&wc->iot); + + raw_spin_lock_init(&wc->endio_list_lock); + INIT_LIST_HEAD(&wc->endio_list); + wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio"); + if (IS_ERR(wc->endio_thread)) { + r = PTR_ERR(wc->endio_thread); + wc->endio_thread = NULL; + ti->error = "Couldn't spawn endio thread"; + goto bad; + } + + /* + * Parse the mode (pmem or ssd) + */ + string = dm_shift_arg(&as); + if (!string) + goto bad_arguments; + + if (!strcasecmp(string, "s")) { + wc->pmem_mode = false; + } else if (!strcasecmp(string, "p")) { +#ifdef DM_WRITECACHE_HAS_PMEM + wc->pmem_mode = true; + wc->writeback_fua = true; +#else + /* + * If the architecture doesn't support persistent memory or + * the kernel doesn't support any DAX drivers, this driver can + * only be used in SSD-only mode. + */ + r = -EOPNOTSUPP; + ti->error = "Persistent memory or DAX not supported on this system"; + goto bad; +#endif + } else { + goto bad_arguments; + } + + if (WC_MODE_PMEM(wc)) { + r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, + offsetof(struct writeback_struct, bio), + BIOSET_NEED_BVECS); + if (r) { + ti->error = "Could not allocate bio set"; + goto bad; + } + } else { + wc->pause = PAUSE_WRITEBACK; + r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); + if (r) { + ti->error = "Could not allocate mempool"; + goto bad; + } + } + + /* + * Parse the origin data device + */ + string = dm_shift_arg(&as); + if (!string) + goto bad_arguments; + r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); + if (r) { + ti->error = "Origin data device lookup failed"; + goto bad; + } + + /* + * Parse cache data device (be it pmem or ssd) + */ + string = dm_shift_arg(&as); + if (!string) + goto bad_arguments; + + r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); + if (r) { + ti->error = "Cache data device lookup failed"; + goto bad; + } + wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev); + + /* + * Parse the cache block size + */ + string = dm_shift_arg(&as); + if (!string) + goto bad_arguments; + if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || + wc->block_size < 512 || wc->block_size > PAGE_SIZE || + (wc->block_size & (wc->block_size - 1))) { + r = -EINVAL; + ti->error = "Invalid block size"; + goto bad; + } + if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || + wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { + r = -EINVAL; + ti->error = "Block size is smaller than device logical block size"; + goto bad; + } + wc->block_size_bits = __ffs(wc->block_size); + + wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; + wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; + wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); + + /* + * Parse optional arguments + */ + r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (r) + goto bad; + + while (opt_params) { + string = dm_shift_arg(&as), opt_params--; + if (!strcasecmp(string, "start_sector") && opt_params >= 1) { + unsigned long long start_sector; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) + goto invalid_optional; + wc->start_sector = start_sector; + wc->start_sector_set = true; + if (wc->start_sector != start_sector || + wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) + goto invalid_optional; + } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) + goto invalid_optional; + if (high_wm_percent < 0 || high_wm_percent > 100) + goto invalid_optional; + wc->high_wm_percent_value = high_wm_percent; + wc->high_wm_percent_set = true; + } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) + goto invalid_optional; + if (low_wm_percent < 0 || low_wm_percent > 100) + goto invalid_optional; + wc->low_wm_percent_value = low_wm_percent; + wc->low_wm_percent_set = true; + } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) + goto invalid_optional; + wc->max_writeback_jobs_set = true; + } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) + goto invalid_optional; + wc->autocommit_blocks_set = true; + } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { + unsigned int autocommit_msecs; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) + goto invalid_optional; + if (autocommit_msecs > 3600000) + goto invalid_optional; + wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); + wc->autocommit_time_value = autocommit_msecs; + wc->autocommit_time_set = true; + } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { + unsigned int max_age_msecs; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) + goto invalid_optional; + if (max_age_msecs > 86400000) + goto invalid_optional; + wc->max_age = msecs_to_jiffies(max_age_msecs); + wc->max_age_set = true; + wc->max_age_value = max_age_msecs; + } else if (!strcasecmp(string, "cleaner")) { + wc->cleaner_set = true; + wc->cleaner = true; + } else if (!strcasecmp(string, "fua")) { + if (WC_MODE_PMEM(wc)) { + wc->writeback_fua = true; + wc->writeback_fua_set = true; + } else goto invalid_optional; + } else if (!strcasecmp(string, "nofua")) { + if (WC_MODE_PMEM(wc)) { + wc->writeback_fua = false; + wc->writeback_fua_set = true; + } else goto invalid_optional; + } else if (!strcasecmp(string, "metadata_only")) { + wc->metadata_only = true; + } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { + unsigned int pause_msecs; + if (WC_MODE_PMEM(wc)) + goto invalid_optional; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) + goto invalid_optional; + if (pause_msecs > 60000) + goto invalid_optional; + wc->pause = msecs_to_jiffies(pause_msecs); + wc->pause_set = true; + wc->pause_value = pause_msecs; + } else { +invalid_optional: + r = -EINVAL; + ti->error = "Invalid optional argument"; + goto bad; + } + } + + if (high_wm_percent < low_wm_percent) { + r = -EINVAL; + ti->error = "High watermark must be greater than or equal to low watermark"; + goto bad; + } + + if (WC_MODE_PMEM(wc)) { + if (!dax_synchronous(wc->ssd_dev->dax_dev)) { + r = -EOPNOTSUPP; + ti->error = "Asynchronous persistent memory not supported as pmem cache"; + goto bad; + } + + r = persistent_memory_claim(wc); + if (r) { + ti->error = "Unable to map persistent memory for cache"; + goto bad; + } + } else { + size_t n_blocks, n_metadata_blocks; + uint64_t n_bitmap_bits; + + wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; + + bio_list_init(&wc->flush_list); + wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush"); + if (IS_ERR(wc->flush_thread)) { + r = PTR_ERR(wc->flush_thread); + wc->flush_thread = NULL; + ti->error = "Couldn't spawn flush thread"; + goto bad; + } + + r = calculate_memory_size(wc->memory_map_size, wc->block_size, + &n_blocks, &n_metadata_blocks); + if (r) { + ti->error = "Invalid device size"; + goto bad; + } + + n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; + /* this is limitation of test_bit functions */ + if (n_bitmap_bits > 1U << 31) { + r = -EFBIG; + ti->error = "Invalid device size"; + goto bad; + } + + wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); + if (!wc->memory_map) { + r = -ENOMEM; + ti->error = "Unable to allocate memory for metadata"; + goto bad; + } + + wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(wc->dm_kcopyd)) { + r = PTR_ERR(wc->dm_kcopyd); + ti->error = "Unable to allocate dm-kcopyd client"; + wc->dm_kcopyd = NULL; + goto bad; + } + + wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); + wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / + BITS_PER_LONG * sizeof(unsigned long); + wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); + if (!wc->dirty_bitmap) { + r = -ENOMEM; + ti->error = "Unable to allocate dirty bitmap"; + goto bad; + } + + r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); + if (r) { + ti->error = "Unable to read first block of metadata"; + goto bad; + } + } + + r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); + if (r) { + ti->error = "Hardware memory error when reading superblock"; + goto bad; + } + if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { + r = init_memory(wc); + if (r) { + ti->error = "Unable to initialize device"; + goto bad; + } + r = copy_mc_to_kernel(&s, sb(wc), + sizeof(struct wc_memory_superblock)); + if (r) { + ti->error = "Hardware memory error when reading superblock"; + goto bad; + } + } + + if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { + ti->error = "Invalid magic in the superblock"; + r = -EINVAL; + goto bad; + } + + if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { + ti->error = "Invalid version in the superblock"; + r = -EINVAL; + goto bad; + } + + if (le32_to_cpu(s.block_size) != wc->block_size) { + ti->error = "Block size does not match superblock"; + r = -EINVAL; + goto bad; + } + + wc->n_blocks = le64_to_cpu(s.n_blocks); + + offset = wc->n_blocks * sizeof(struct wc_memory_entry); + if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) { +overflow: + ti->error = "Overflow in size calculation"; + r = -EINVAL; + goto bad; + } + offset += sizeof(struct wc_memory_superblock); + if (offset < sizeof(struct wc_memory_superblock)) + goto overflow; + offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); + data_size = wc->n_blocks * (size_t)wc->block_size; + if (!offset || (data_size / wc->block_size != wc->n_blocks) || + (offset + data_size < offset)) + goto overflow; + if (offset + data_size > wc->memory_map_size) { + ti->error = "Memory area is too small"; + r = -EINVAL; + goto bad; + } + + wc->metadata_sectors = offset >> SECTOR_SHIFT; + wc->block_start = (char *)sb(wc) + offset; + + x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); + x += 50; + do_div(x, 100); + wc->freelist_high_watermark = x; + x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); + x += 50; + do_div(x, 100); + wc->freelist_low_watermark = x; + + if (wc->cleaner) + activate_cleaner(wc); + + r = writecache_alloc_entries(wc); + if (r) { + ti->error = "Cannot allocate memory"; + goto bad; + } + + ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; + ti->flush_supported = true; + ti->num_discard_bios = 1; + + if (WC_MODE_PMEM(wc)) + persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); + + return 0; + +bad_arguments: + r = -EINVAL; + ti->error = "Bad arguments"; +bad: + writecache_dtr(ti); + return r; +} + +static void writecache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dm_writecache *wc = ti->private; + unsigned int extra_args; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", + writecache_has_error(wc), + (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size, + (unsigned long long)wc->writeback_size, + wc->stats.reads, + wc->stats.read_hits, + wc->stats.writes, + wc->stats.write_hits_uncommitted, + wc->stats.write_hits_committed, + wc->stats.writes_around, + wc->stats.writes_allocate, + wc->stats.writes_blocked_on_freelist, + wc->stats.flushes, + wc->stats.discards); + break; + case STATUSTYPE_TABLE: + DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', + wc->dev->name, wc->ssd_dev->name, wc->block_size); + extra_args = 0; + if (wc->start_sector_set) + extra_args += 2; + if (wc->high_wm_percent_set) + extra_args += 2; + if (wc->low_wm_percent_set) + extra_args += 2; + if (wc->max_writeback_jobs_set) + extra_args += 2; + if (wc->autocommit_blocks_set) + extra_args += 2; + if (wc->autocommit_time_set) + extra_args += 2; + if (wc->max_age_set) + extra_args += 2; + if (wc->cleaner_set) + extra_args++; + if (wc->writeback_fua_set) + extra_args++; + if (wc->metadata_only) + extra_args++; + if (wc->pause_set) + extra_args += 2; + + DMEMIT("%u", extra_args); + if (wc->start_sector_set) + DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); + if (wc->high_wm_percent_set) + DMEMIT(" high_watermark %u", wc->high_wm_percent_value); + if (wc->low_wm_percent_set) + DMEMIT(" low_watermark %u", wc->low_wm_percent_value); + if (wc->max_writeback_jobs_set) + DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); + if (wc->autocommit_blocks_set) + DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); + if (wc->autocommit_time_set) + DMEMIT(" autocommit_time %u", wc->autocommit_time_value); + if (wc->max_age_set) + DMEMIT(" max_age %u", wc->max_age_value); + if (wc->cleaner_set) + DMEMIT(" cleaner"); + if (wc->writeback_fua_set) + DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); + if (wc->metadata_only) + DMEMIT(" metadata_only"); + if (wc->pause_set) + DMEMIT(" pause_writeback %u", wc->pause_value); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static struct target_type writecache_target = { + .name = "writecache", + .version = {1, 6, 0}, + .module = THIS_MODULE, + .ctr = writecache_ctr, + .dtr = writecache_dtr, + .status = writecache_status, + .postsuspend = writecache_suspend, + .resume = writecache_resume, + .message = writecache_message, + .map = writecache_map, + .end_io = writecache_end_io, + .iterate_devices = writecache_iterate_devices, + .io_hints = writecache_io_hints, +}; + +static int __init dm_writecache_init(void) +{ + int r; + + r = dm_register_target(&writecache_target); + if (r < 0) { + DMERR("register failed %d", r); + return r; + } + + return 0; +} + +static void __exit dm_writecache_exit(void) +{ + dm_unregister_target(&writecache_target); +} + +module_init(dm_writecache_init); +module_exit(dm_writecache_exit); + +MODULE_DESCRIPTION(DM_NAME " writecache target"); +MODULE_AUTHOR("Mikulas Patocka "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c new file mode 100644 index 000000000..faa1dbffc --- /dev/null +++ b/drivers/md/dm-zero.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2003 Jana Saout + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include + +#define DM_MSG_PREFIX "zero" + +/* + * Construct a dummy mapping that only returns zeros + */ +static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + if (argc != 0) { + ti->error = "No arguments required"; + return -EINVAL; + } + + /* + * Silently drop discards, avoiding -EOPNOTSUPP. + */ + ti->num_discard_bios = 1; + + return 0; +} + +/* + * Return zeros only on reads + */ +static int zero_map(struct dm_target *ti, struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bio->bi_opf & REQ_RAHEAD) { + /* readahead of null bytes only wastes buffer cache */ + return DM_MAPIO_KILL; + } + zero_fill_bio(bio); + break; + case REQ_OP_WRITE: + /* writes get silently dropped */ + break; + default: + return DM_MAPIO_KILL; + } + + bio_endio(bio); + + /* accepted bio, don't make new request */ + return DM_MAPIO_SUBMITTED; +} + +static struct target_type zero_target = { + .name = "zero", + .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, + .module = THIS_MODULE, + .ctr = zero_ctr, + .map = zero_map, +}; + +static int __init dm_zero_init(void) +{ + int r = dm_register_target(&zero_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_zero_exit(void) +{ + dm_unregister_target(&zero_target); +} + +module_init(dm_zero_init) +module_exit(dm_zero_exit) + +MODULE_AUTHOR("Jana Saout "); +MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c new file mode 100644 index 000000000..3dafc0e8b --- /dev/null +++ b/drivers/md/dm-zone.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include +#include +#include +#include + +#include "dm-core.h" + +#define DM_MSG_PREFIX "zone" + +#define DM_ZONE_INVALID_WP_OFST UINT_MAX + +/* + * For internal zone reports bypassing the top BIO submission path. + */ +static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + report_zones_cb cb, void *data) +{ + struct gendisk *disk = md->disk; + int ret; + struct dm_report_zones_args args = { + .next_sector = sector, + .orig_data = data, + .orig_cb = cb, + }; + + do { + struct dm_target *tgt; + + tgt = dm_table_find_target(t, args.next_sector); + if (WARN_ON_ONCE(!tgt->type->report_zones)) + return -EIO; + + args.tgt = tgt; + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); + if (ret < 0) + return ret; + } while (args.zone_idx < nr_zones && + args.next_sector < get_capacity(disk)); + + return args.zone_idx; +} + +/* + * User facing dm device block device report zone operation. This calls the + * report_zones operation for each target of a device table. This operation is + * generally implemented by targets using dm_report_zones(). + */ +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct mapped_device *md = disk->private_data; + struct dm_table *map; + int srcu_idx, ret; + + if (dm_suspended_md(md)) + return -EAGAIN; + + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct dm_report_zones_args *args = data; + sector_t sector_diff = args->tgt->begin - args->start; + + /* + * Ignore zones beyond the target range. + */ + if (zone->start >= args->start + args->tgt->len) + return 0; + + /* + * Remap the start sector and write pointer position of the zone + * to match its position in the target range. + */ + zone->start += sector_diff; + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = zone->start + zone->len; + else if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->wp = zone->start; + else + zone->wp += sector_diff; + } + + args->next_sector = zone->start + zone->len; + return args->orig_cb(zone, args->zone_idx++, args->orig_data); +} + +/* + * Helper for drivers of zoned targets to implement struct target_type + * report_zones operation. + */ +int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + /* + * Set the target mapping start sector first so that + * dm_report_zones_cb() can correctly remap zone information. + */ + args->start = start; + + return blkdev_report_zones(bdev, sector, nr_zones, + dm_report_zones_cb, args); +} +EXPORT_SYMBOL_GPL(dm_report_zones); + +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + struct request_queue *q = md->queue; + + if (!blk_queue_is_zoned(q)) + return false; + + switch (bio_op(bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE: + return !op_is_flush(bio->bi_opf) && bio_sectors(bio); + default: + return false; + } +} + +void dm_cleanup_zoned_dev(struct mapped_device *md) +{ + if (md->disk) { + kfree(md->disk->conv_zones_bitmap); + md->disk->conv_zones_bitmap = NULL; + kfree(md->disk->seq_zones_wlock); + md->disk->seq_zones_wlock = NULL; + } + + kvfree(md->zwp_offset); + md->zwp_offset = NULL; + md->nr_zones = 0; +} + +static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. Use 0 as for an empty zone. + */ + return 0; + } +} + +static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct mapped_device *md = data; + struct gendisk *disk = md->disk; + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!disk->conv_zones_bitmap) { + disk->conv_zones_bitmap = + kcalloc(BITS_TO_LONGS(disk->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!disk->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, disk->conv_zones_bitmap); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!disk->seq_zones_wlock) { + disk->seq_zones_wlock = + kcalloc(BITS_TO_LONGS(disk->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!disk->seq_zones_wlock) + return -ENOMEM; + } + if (!md->zwp_offset) { + md->zwp_offset = + kvcalloc(disk->nr_zones, sizeof(unsigned int), + GFP_KERNEL); + if (!md->zwp_offset) + return -ENOMEM; + } + md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); + + break; + default: + DMERR("Invalid zone type 0x%x at sectors %llu", + (int)zone->type, zone->start); + return -ENODEV; + } + + return 0; +} + +/* + * Revalidate the zones of a mapped device to initialize resource necessary + * for zone append emulation. Note that we cannot simply use the block layer + * blk_revalidate_disk_zones() function here as the mapped device is suspended + * (this is called from __bind() context). + */ +static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +{ + struct gendisk *disk = md->disk; + unsigned int noio_flag; + int ret; + + /* + * Check if something changed. If yes, cleanup the current resources + * and reallocate everything. + */ + if (!disk->nr_zones || disk->nr_zones != md->nr_zones) + dm_cleanup_zoned_dev(md); + if (md->nr_zones) + return 0; + + /* + * Scan all zones to initialize everything. Ensure that all vmalloc + * operations in this context are done as if GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, + dm_zone_revalidate_cb, md); + memalloc_noio_restore(noio_flag); + if (ret < 0) + goto err; + if (ret != disk->nr_zones) { + ret = -EIO; + goto err; + } + + md->nr_zones = disk->nr_zones; + + return 0; + +err: + DMERR("Revalidate zones failed %d", ret); + dm_cleanup_zoned_dev(md); + return ret; +} + +static int device_not_zone_append_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !bdev_is_zoned(dev->bdev); +} + +static bool dm_table_supports_zone_append(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->emulate_zone_append) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL)) + return false; + } + + return true; +} + +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +{ + struct mapped_device *md = t->md; + + /* + * For a zoned target, the number of zones should be updated for the + * correct value to be exposed in sysfs queue/nr_zones. + */ + WARN_ON_ONCE(queue_is_mq(q)); + md->disk->nr_zones = bdev_nr_zones(md->disk->part0); + + /* Check if zone append is natively supported */ + if (dm_table_supports_zone_append(t)) { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + dm_cleanup_zoned_dev(md); + return 0; + } + + /* + * Mark the mapped device as needing zone append emulation and + * initialize the emulation resources once the capacity is set. + */ + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!get_capacity(md->disk)) + return 0; + + return dm_revalidate_zones(md, t); +} + +static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + unsigned int *wp_offset = data; + + *wp_offset = dm_get_zone_wp_offset(zone); + + return 0; +} + +static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, + unsigned int *wp_ofst) +{ + sector_t sector = zno * bdev_zone_sectors(md->disk->part0); + unsigned int noio_flag; + struct dm_table *t; + int srcu_idx, ret; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return -EIO; + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, sector, 1, + dm_update_zone_wp_offset_cb, wp_ofst); + memalloc_noio_restore(noio_flag); + + dm_put_live_table(md, srcu_idx); + + if (ret != 1) + return -EIO; + + return 0; +} + +struct orig_bio_details { + enum req_op op; + unsigned int nr_sectors; +}; + +/* + * First phase of BIO mapping for targets with zone append emulation: + * check all BIO that change a zone writer pointer and change zone + * append operations into regular write operations. + */ +static bool dm_zone_map_bio_begin(struct mapped_device *md, + unsigned int zno, struct bio *clone) +{ + sector_t zsectors = bdev_zone_sectors(md->disk->part0); + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* + * If the target zone is in an error state, recover by inspecting the + * zone to get its current write pointer position. Note that since the + * target zone is already locked, a BIO issuing context should never + * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. + */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { + if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) + return false; + WRITE_ONCE(md->zwp_offset[zno], zwp_offset); + } + + switch (bio_op(clone)) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + return true; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE: + /* Writes must be aligned to the zone write pointer */ + if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) + return false; + break; + case REQ_OP_ZONE_APPEND: + /* + * Change zone append operations into a non-mergeable regular + * writes directed at the current write pointer position of the + * target zone. + */ + clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | + (clone->bi_opf & (~REQ_OP_MASK)); + clone->bi_iter.bi_sector += zwp_offset; + break; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return false; + } + + /* Cannot write to a full zone */ + if (zwp_offset >= zsectors) + return false; + + return true; +} + +/* + * Second phase of BIO mapping for targets with zone append emulation: + * update the zone write pointer offset array to account for the additional + * data written to a zone. Note that at this point, the remapped clone BIO + * may already have completed, so we do not touch it. + */ +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, + struct orig_bio_details *orig_bio_details, + unsigned int nr_sectors) +{ + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* The clone BIO may already have been completed and failed */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) + return BLK_STS_IOERR; + + /* Update the zone wp offset */ + switch (orig_bio_details->op) { + case REQ_OP_ZONE_RESET: + WRITE_ONCE(md->zwp_offset[zno], 0); + return BLK_STS_OK; + case REQ_OP_ZONE_FINISH: + WRITE_ONCE(md->zwp_offset[zno], + bdev_zone_sectors(md->disk->part0)); + return BLK_STS_OK; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE: + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + case REQ_OP_ZONE_APPEND: + /* + * Check that the target did not truncate the write operation + * emulating a zone append. + */ + if (nr_sectors != orig_bio_details->nr_sectors) { + DMWARN_LIMIT("Truncated write for zone append"); + return BLK_STS_IOERR; + } + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return BLK_STS_IOERR; + } +} + +static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, + struct bio *clone) +{ + if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) + return; + + wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); + bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, + struct bio *clone) +{ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); + clear_bit_unlock(zno, disk->seq_zones_wlock); + smp_mb__after_atomic(); + wake_up_bit(disk->seq_zones_wlock, zno); + + bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static bool dm_need_zone_wp_tracking(struct bio *bio) +{ + /* + * Special processing is not needed for operations that do not need the + * zone write lock, that is, all operations that target conventional + * zones and all operations that do not modify directly a sequential + * zone write pointer. + */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + switch (bio_op(bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_APPEND: + return bio_zone_is_seq(bio); + default: + return false; + } +} + +/* + * Special IO mapping for targets needing zone append emulation. + */ +int dm_zone_map_bio(struct dm_target_io *tio) +{ + struct dm_io *io = tio->io; + struct dm_target *ti = tio->ti; + struct mapped_device *md = io->md; + struct bio *clone = &tio->clone; + struct orig_bio_details orig_bio_details; + unsigned int zno; + blk_status_t sts; + int r; + + /* + * IOs that do not change a zone write pointer do not need + * any additional special processing. + */ + if (!dm_need_zone_wp_tracking(clone)) + return ti->type->map(ti, clone); + + /* Lock the target zone */ + zno = bio_zone_no(clone); + dm_zone_lock(md->disk, zno, clone); + + orig_bio_details.nr_sectors = bio_sectors(clone); + orig_bio_details.op = bio_op(clone); + + /* + * Check that the bio and the target zone write pointer offset are + * both valid, and if the bio is a zone append, remap it to a write. + */ + if (!dm_zone_map_bio_begin(md, zno, clone)) { + dm_zone_unlock(md->disk, zno, clone); + return DM_MAPIO_KILL; + } + + /* Let the target do its work */ + r = ti->type->map(ti, clone); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* + * The target submitted the clone BIO. The target zone will + * be unlocked on completion of the clone. + */ + sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, + *tio->len_ptr); + break; + case DM_MAPIO_REMAPPED: + /* + * The target only remapped the clone BIO. In case of error, + * unlock the target zone here as the clone will not be + * submitted. + */ + sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, + *tio->len_ptr); + if (sts != BLK_STS_OK) + dm_zone_unlock(md->disk, zno, clone); + break; + case DM_MAPIO_REQUEUE: + case DM_MAPIO_KILL: + default: + dm_zone_unlock(md->disk, zno, clone); + sts = BLK_STS_IOERR; + break; + } + + if (sts != BLK_STS_OK) + return DM_MAPIO_KILL; + + return r; +} + +/* + * IO completion callback called from clone_endio(). + */ +void dm_zone_endio(struct dm_io *io, struct bio *clone) +{ + struct mapped_device *md = io->md; + struct gendisk *disk = md->disk; + struct bio *orig_bio = io->orig_bio; + unsigned int zwp_offset; + unsigned int zno; + + /* + * For targets that do not emulate zone append, we only need to + * handle native zone-append bios. + */ + if (!dm_emulate_zone_append(md)) { + /* + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. + */ + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = + (sector_t)bdev_zone_sectors(disk->part0) - 1; + + orig_bio->bi_iter.bi_sector += + clone->bi_iter.bi_sector & mask; + } + + return; + } + + /* + * For targets that do emulate zone append, if the clone BIO does not + * own the target zone write lock, we have nothing to do. + */ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + zno = bio_zone_no(orig_bio); + + if (clone->bi_status != BLK_STS_OK) { + /* + * BIOs that modify a zone write pointer may leave the zone + * in an unknown state in case of failure (e.g. the write + * pointer was only partially advanced). In this case, set + * the target zone write pointer as invalid unless it is + * already being updated. + */ + WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); + } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { + /* + * Get the written sector for zone append operation that were + * emulated using regular write operations. + */ + zwp_offset = READ_ONCE(md->zwp_offset[zno]); + if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) + WRITE_ONCE(md->zwp_offset[zno], + DM_ZONE_INVALID_WP_OFST); + else + orig_bio->bi_iter.bi_sector += + zwp_offset - bio_sectors(orig_bio); + } + + dm_zone_unlock(disk, zno, clone); +} diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c new file mode 100644 index 000000000..0548b5d92 --- /dev/null +++ b/drivers/md/dm-zoned-metadata.c @@ -0,0 +1,3046 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include "dm-zoned.h" + +#include +#include +#include + +#define DM_MSG_PREFIX "zoned metadata" + +/* + * Metadata version. + */ +#define DMZ_META_VER 2 + +/* + * On-disk super block magic. + */ +#define DMZ_MAGIC ((((unsigned int)('D')) << 24) | \ + (((unsigned int)('Z')) << 16) | \ + (((unsigned int)('B')) << 8) | \ + ((unsigned int)('D'))) + +/* + * On disk super block. + * This uses only 512 B but uses on disk a full 4KB block. This block is + * followed on disk by the mapping table of chunks to zones and the bitmap + * blocks indicating zone block validity. + * The overall resulting metadata format is: + * (1) Super block (1 block) + * (2) Chunk mapping table (nr_map_blocks) + * (3) Bitmap blocks (nr_bitmap_blocks) + * All metadata blocks are stored in conventional zones, starting from + * the first conventional zone found on disk. + */ +struct dmz_super { + /* Magic number */ + __le32 magic; /* 4 */ + + /* Metadata version number */ + __le32 version; /* 8 */ + + /* Generation number */ + __le64 gen; /* 16 */ + + /* This block number */ + __le64 sb_block; /* 24 */ + + /* The number of metadata blocks, including this super block */ + __le32 nr_meta_blocks; /* 28 */ + + /* The number of sequential zones reserved for reclaim */ + __le32 nr_reserved_seq; /* 32 */ + + /* The number of entries in the mapping table */ + __le32 nr_chunks; /* 36 */ + + /* The number of blocks used for the chunk mapping table */ + __le32 nr_map_blocks; /* 40 */ + + /* The number of blocks used for the block bitmaps */ + __le32 nr_bitmap_blocks; /* 44 */ + + /* Checksum */ + __le32 crc; /* 48 */ + + /* DM-Zoned label */ + u8 dmz_label[32]; /* 80 */ + + /* DM-Zoned UUID */ + u8 dmz_uuid[16]; /* 96 */ + + /* Device UUID */ + u8 dev_uuid[16]; /* 112 */ + + /* Padding to full 512B sector */ + u8 reserved[400]; /* 512 */ +}; + +/* + * Chunk mapping entry: entries are indexed by chunk number + * and give the zone ID (dzone_id) mapping the chunk on disk. + * This zone may be sequential or random. If it is a sequential + * zone, a second zone (bzone_id) used as a write buffer may + * also be specified. This second zone will always be a randomly + * writeable zone. + */ +struct dmz_map { + __le32 dzone_id; + __le32 bzone_id; +}; + +/* + * Chunk mapping table metadata: 512 8-bytes entries per 4KB block. + */ +#define DMZ_MAP_ENTRIES (DMZ_BLOCK_SIZE / sizeof(struct dmz_map)) +#define DMZ_MAP_ENTRIES_SHIFT (ilog2(DMZ_MAP_ENTRIES)) +#define DMZ_MAP_ENTRIES_MASK (DMZ_MAP_ENTRIES - 1) +#define DMZ_MAP_UNMAPPED UINT_MAX + +/* + * Meta data block descriptor (for cached metadata blocks). + */ +struct dmz_mblock { + struct rb_node node; + struct list_head link; + sector_t no; + unsigned int ref; + unsigned long state; + struct page *page; + void *data; +}; + +/* + * Metadata block state flags. + */ +enum { + DMZ_META_DIRTY, + DMZ_META_READING, + DMZ_META_WRITING, + DMZ_META_ERROR, +}; + +/* + * Super block information (one per metadata set). + */ +struct dmz_sb { + sector_t block; + struct dmz_dev *dev; + struct dmz_mblock *mblk; + struct dmz_super *sb; + struct dm_zone *zone; +}; + +/* + * In-memory metadata. + */ +struct dmz_metadata { + struct dmz_dev *dev; + unsigned int nr_devs; + + char devname[BDEVNAME_SIZE]; + char label[BDEVNAME_SIZE]; + uuid_t uuid; + + sector_t zone_bitmap_size; + unsigned int zone_nr_bitmap_blocks; + unsigned int zone_bits_per_mblk; + + sector_t zone_nr_blocks; + sector_t zone_nr_blocks_shift; + + sector_t zone_nr_sectors; + sector_t zone_nr_sectors_shift; + + unsigned int nr_bitmap_blocks; + unsigned int nr_map_blocks; + + unsigned int nr_zones; + unsigned int nr_useable_zones; + unsigned int nr_meta_blocks; + unsigned int nr_meta_zones; + unsigned int nr_data_zones; + unsigned int nr_cache_zones; + unsigned int nr_rnd_zones; + unsigned int nr_reserved_seq; + unsigned int nr_chunks; + + /* Zone information array */ + struct xarray zones; + + struct dmz_sb sb[2]; + unsigned int mblk_primary; + unsigned int sb_version; + u64 sb_gen; + unsigned int min_nr_mblks; + unsigned int max_nr_mblks; + atomic_t nr_mblks; + struct rw_semaphore mblk_sem; + struct mutex mblk_flush_lock; + spinlock_t mblk_lock; + struct rb_root mblk_rbtree; + struct list_head mblk_lru_list; + struct list_head mblk_dirty_list; + struct shrinker mblk_shrinker; + + /* Zone allocation management */ + struct mutex map_lock; + struct dmz_mblock **map_mblk; + + unsigned int nr_cache; + atomic_t unmap_nr_cache; + struct list_head unmap_cache_list; + struct list_head map_cache_list; + + atomic_t nr_reserved_seq_zones; + struct list_head reserved_seq_zones_list; + + wait_queue_head_t free_wq; +}; + +#define dmz_zmd_info(zmd, format, args...) \ + DMINFO("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_err(zmd, format, args...) \ + DMERR("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_warn(zmd, format, args...) \ + DMWARN("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_debug(zmd, format, args...) \ + DMDEBUG("(%s): " format, (zmd)->label, ## args) +/* + * Various accessors + */ +static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + if (WARN_ON(!zone)) + return 0; + + return zone->id - zone->dev->zone_offset; +} + +sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_sectors_shift; +} + +sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks; +} + +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors; +} + +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors_shift; +} + +unsigned int dmz_nr_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_zones; +} + +unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) +{ + return zmd->nr_chunks; +} + +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx) +{ + return zmd->dev[idx].nr_rnd; +} + +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx) +{ + return atomic_read(&zmd->dev[idx].unmap_nr_rnd); +} + +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_cache; +} + +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_cache); +} + +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx) +{ + return zmd->dev[idx].nr_seq; +} + +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx) +{ + return atomic_read(&zmd->dev[idx].unmap_nr_seq); +} + +static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) +{ + return xa_load(&zmd->zones, zone_id); +} + +static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, + unsigned int zone_id, struct dmz_dev *dev) +{ + struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); + + if (!zone) + return ERR_PTR(-ENOMEM); + + if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) { + kfree(zone); + return ERR_PTR(-EBUSY); + } + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = zone_id; + zone->chunk = DMZ_MAP_UNMAPPED; + zone->dev = dev; + + return zone; +} + +const char *dmz_metadata_label(struct dmz_metadata *zmd) +{ + return (const char *)zmd->label; +} + +bool dmz_check_dev(struct dmz_metadata *zmd) +{ + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (!dmz_check_bdev(&zmd->dev[i])) + return false; + } + return true; +} + +bool dmz_dev_is_dying(struct dmz_metadata *zmd) +{ + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (dmz_bdev_is_dying(&zmd->dev[i])) + return true; + } + return false; +} + +/* + * Lock/unlock mapping table. + * The map lock also protects all the zone lists. + */ +void dmz_lock_map(struct dmz_metadata *zmd) +{ + mutex_lock(&zmd->map_lock); +} + +void dmz_unlock_map(struct dmz_metadata *zmd) +{ + mutex_unlock(&zmd->map_lock); +} + +/* + * Lock/unlock metadata access. This is a "read" lock on a semaphore + * that prevents metadata flush from running while metadata are being + * modified. The actual metadata write mutual exclusion is achieved with + * the map lock and zone state management (active and reclaim state are + * mutually exclusive). + */ +void dmz_lock_metadata(struct dmz_metadata *zmd) +{ + down_read(&zmd->mblk_sem); +} + +void dmz_unlock_metadata(struct dmz_metadata *zmd) +{ + up_read(&zmd->mblk_sem); +} + +/* + * Lock/unlock flush: prevent concurrent executions + * of dmz_flush_metadata as well as metadata modification in reclaim + * while flush is being executed. + */ +void dmz_lock_flush(struct dmz_metadata *zmd) +{ + mutex_lock(&zmd->mblk_flush_lock); +} + +void dmz_unlock_flush(struct dmz_metadata *zmd) +{ + mutex_unlock(&zmd->mblk_flush_lock); +} + +/* + * Allocate a metadata block. + */ +static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, + sector_t mblk_no) +{ + struct dmz_mblock *mblk = NULL; + + /* See if we can reuse cached blocks */ + if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) { + spin_lock(&zmd->mblk_lock); + mblk = list_first_entry_or_null(&zmd->mblk_lru_list, + struct dmz_mblock, link); + if (mblk) { + list_del_init(&mblk->link); + rb_erase(&mblk->node, &zmd->mblk_rbtree); + mblk->no = mblk_no; + } + spin_unlock(&zmd->mblk_lock); + if (mblk) + return mblk; + } + + /* Allocate a new block */ + mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO); + if (!mblk) + return NULL; + + mblk->page = alloc_page(GFP_NOIO); + if (!mblk->page) { + kfree(mblk); + return NULL; + } + + RB_CLEAR_NODE(&mblk->node); + INIT_LIST_HEAD(&mblk->link); + mblk->ref = 0; + mblk->state = 0; + mblk->no = mblk_no; + mblk->data = page_address(mblk->page); + + atomic_inc(&zmd->nr_mblks); + + return mblk; +} + +/* + * Free a metadata block. + */ +static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) +{ + __free_pages(mblk->page, 0); + kfree(mblk); + + atomic_dec(&zmd->nr_mblks); +} + +/* + * Insert a metadata block in the rbtree. + */ +static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) +{ + struct rb_root *root = &zmd->mblk_rbtree; + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct dmz_mblock *b; + + /* Figure out where to put the new node */ + while (*new) { + b = container_of(*new, struct dmz_mblock, node); + parent = *new; + new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right); + } + + /* Add new node and rebalance tree */ + rb_link_node(&mblk->node, parent, new); + rb_insert_color(&mblk->node, root); +} + +/* + * Lookup a metadata block in the rbtree. If the block is found, increment + * its reference count. + */ +static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, + sector_t mblk_no) +{ + struct rb_root *root = &zmd->mblk_rbtree; + struct rb_node *node = root->rb_node; + struct dmz_mblock *mblk; + + while (node) { + mblk = container_of(node, struct dmz_mblock, node); + if (mblk->no == mblk_no) { + /* + * If this is the first reference to the block, + * remove it from the LRU list. + */ + mblk->ref++; + if (mblk->ref == 1 && + !test_bit(DMZ_META_DIRTY, &mblk->state)) + list_del_init(&mblk->link); + return mblk; + } + node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; + } + + return NULL; +} + +/* + * Metadata block BIO end callback. + */ +static void dmz_mblock_bio_end_io(struct bio *bio) +{ + struct dmz_mblock *mblk = bio->bi_private; + int flag; + + if (bio->bi_status) + set_bit(DMZ_META_ERROR, &mblk->state); + + if (bio_op(bio) == REQ_OP_WRITE) + flag = DMZ_META_WRITING; + else + flag = DMZ_META_READING; + + clear_bit_unlock(flag, &mblk->state); + smp_mb__after_atomic(); + wake_up_bit(&mblk->state, flag); + + bio_put(bio); +} + +/* + * Read an uncached metadata block from disk and add it to the cache. + */ +static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, + sector_t mblk_no) +{ + struct dmz_mblock *mblk, *m; + sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; + struct bio *bio; + + if (dmz_bdev_is_dying(dev)) + return ERR_PTR(-EIO); + + /* Get a new block and a BIO to read it */ + mblk = dmz_alloc_mblock(zmd, mblk_no); + if (!mblk) + return ERR_PTR(-ENOMEM); + + bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO, + GFP_NOIO); + + spin_lock(&zmd->mblk_lock); + + /* + * Make sure that another context did not start reading + * the block already. + */ + m = dmz_get_mblock_fast(zmd, mblk_no); + if (m) { + spin_unlock(&zmd->mblk_lock); + dmz_free_mblock(zmd, mblk); + bio_put(bio); + return m; + } + + mblk->ref++; + set_bit(DMZ_META_READING, &mblk->state); + dmz_insert_mblock(zmd, mblk); + + spin_unlock(&zmd->mblk_lock); + + /* Submit read BIO */ + bio->bi_iter.bi_sector = dmz_blk2sect(block); + bio->bi_private = mblk; + bio->bi_end_io = dmz_mblock_bio_end_io; + bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + submit_bio(bio); + + return mblk; +} + +/* + * Free metadata blocks. + */ +static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, + unsigned long limit) +{ + struct dmz_mblock *mblk; + unsigned long count = 0; + + if (!zmd->max_nr_mblks) + return 0; + + while (!list_empty(&zmd->mblk_lru_list) && + atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks && + count < limit) { + mblk = list_first_entry(&zmd->mblk_lru_list, + struct dmz_mblock, link); + list_del_init(&mblk->link); + rb_erase(&mblk->node, &zmd->mblk_rbtree); + dmz_free_mblock(zmd, mblk); + count++; + } + + return count; +} + +/* + * For mblock shrinker: get the number of unused metadata blocks in the cache. + */ +static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + + return atomic_read(&zmd->nr_mblks); +} + +/* + * For mblock shrinker: scan unused metadata blocks and shrink the cache. + */ +static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + unsigned long count; + + spin_lock(&zmd->mblk_lock); + count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan); + spin_unlock(&zmd->mblk_lock); + + return count ? count : SHRINK_STOP; +} + +/* + * Release a metadata block. + */ +static void dmz_release_mblock(struct dmz_metadata *zmd, + struct dmz_mblock *mblk) +{ + + if (!mblk) + return; + + spin_lock(&zmd->mblk_lock); + + mblk->ref--; + if (mblk->ref == 0) { + if (test_bit(DMZ_META_ERROR, &mblk->state)) { + rb_erase(&mblk->node, &zmd->mblk_rbtree); + dmz_free_mblock(zmd, mblk); + } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) { + list_add_tail(&mblk->link, &zmd->mblk_lru_list); + dmz_shrink_mblock_cache(zmd, 1); + } + } + + spin_unlock(&zmd->mblk_lock); +} + +/* + * Get a metadata block from the rbtree. If the block + * is not present, read it from disk. + */ +static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, + sector_t mblk_no) +{ + struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; + + /* Check rbtree */ + spin_lock(&zmd->mblk_lock); + mblk = dmz_get_mblock_fast(zmd, mblk_no); + spin_unlock(&zmd->mblk_lock); + + if (!mblk) { + /* Cache miss: read the block from disk */ + mblk = dmz_get_mblock_slow(zmd, mblk_no); + if (IS_ERR(mblk)) + return mblk; + } + + /* Wait for on-going read I/O and check for error */ + wait_on_bit_io(&mblk->state, DMZ_META_READING, + TASK_UNINTERRUPTIBLE); + if (test_bit(DMZ_META_ERROR, &mblk->state)) { + dmz_release_mblock(zmd, mblk); + dmz_check_bdev(dev); + return ERR_PTR(-EIO); + } + + return mblk; +} + +/* + * Mark a metadata block dirty. + */ +static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) +{ + spin_lock(&zmd->mblk_lock); + if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state)) + list_add_tail(&mblk->link, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); +} + +/* + * Issue a metadata block write BIO. + */ +static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, + unsigned int set) +{ + struct dmz_dev *dev = zmd->sb[set].dev; + sector_t block = zmd->sb[set].block + mblk->no; + struct bio *bio; + + if (dmz_bdev_is_dying(dev)) + return -EIO; + + bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO, + GFP_NOIO); + + set_bit(DMZ_META_WRITING, &mblk->state); + + bio->bi_iter.bi_sector = dmz_blk2sect(block); + bio->bi_private = mblk; + bio->bi_end_io = dmz_mblock_bio_end_io; + bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + submit_bio(bio); + + return 0; +} + +/* + * Read/write a metadata block. + */ +static int dmz_rdwr_block(struct dmz_dev *dev, enum req_op op, + sector_t block, struct page *page) +{ + struct bio *bio; + int ret; + + if (WARN_ON(!dev)) + return -EIO; + + if (dmz_bdev_is_dying(dev)) + return -EIO; + + bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOIO); + bio->bi_iter.bi_sector = dmz_blk2sect(block); + bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); + ret = submit_bio_wait(bio); + bio_put(bio); + + if (ret) + dmz_check_bdev(dev); + return ret; +} + +/* + * Write super block of the specified metadata set. + */ +static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) +{ + struct dmz_mblock *mblk = zmd->sb[set].mblk; + struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; + sector_t sb_block; + u64 sb_gen = zmd->sb_gen + 1; + int ret; + + sb->magic = cpu_to_le32(DMZ_MAGIC); + + sb->version = cpu_to_le32(zmd->sb_version); + if (zmd->sb_version > 1) { + BUILD_BUG_ON(UUID_SIZE != 16); + export_uuid(sb->dmz_uuid, &zmd->uuid); + memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE); + export_uuid(sb->dev_uuid, &dev->uuid); + } + + sb->gen = cpu_to_le64(sb_gen); + + /* + * The metadata always references the absolute block address, + * ie relative to the entire block range, not the per-device + * block address. + */ + sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift; + sb->sb_block = cpu_to_le64(sb_block); + sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); + sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); + sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); + + sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks); + sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks); + + sb->crc = 0; + sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); + + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, + mblk->page); + if (ret == 0) + ret = blkdev_issue_flush(dev->bdev); + + return ret; +} + +/* + * Write dirty metadata blocks to the specified set. + */ +static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, + struct list_head *write_list, + unsigned int set) +{ + struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[set].dev; + struct blk_plug plug; + int ret = 0, nr_mblks_submitted = 0; + + /* Issue writes */ + blk_start_plug(&plug); + list_for_each_entry(mblk, write_list, link) { + ret = dmz_write_mblock(zmd, mblk, set); + if (ret) + break; + nr_mblks_submitted++; + } + blk_finish_plug(&plug); + + /* Wait for completion */ + list_for_each_entry(mblk, write_list, link) { + if (!nr_mblks_submitted) + break; + wait_on_bit_io(&mblk->state, DMZ_META_WRITING, + TASK_UNINTERRUPTIBLE); + if (test_bit(DMZ_META_ERROR, &mblk->state)) { + clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(dev); + ret = -EIO; + } + nr_mblks_submitted--; + } + + /* Flush drive cache (this will also sync data) */ + if (ret == 0) + ret = blkdev_issue_flush(dev->bdev); + + return ret; +} + +/* + * Log dirty metadata blocks. + */ +static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd, + struct list_head *write_list) +{ + unsigned int log_set = zmd->mblk_primary ^ 0x1; + int ret; + + /* Write dirty blocks to the log */ + ret = dmz_write_dirty_mblocks(zmd, write_list, log_set); + if (ret) + return ret; + + /* + * No error so far: now validate the log by updating the + * log index super block generation. + */ + ret = dmz_write_sb(zmd, log_set); + if (ret) + return ret; + + return 0; +} + +/* + * Flush dirty metadata blocks. + */ +int dmz_flush_metadata(struct dmz_metadata *zmd) +{ + struct dmz_mblock *mblk; + struct list_head write_list; + struct dmz_dev *dev; + int ret; + + if (WARN_ON(!zmd)) + return 0; + + INIT_LIST_HEAD(&write_list); + + /* + * Make sure that metadata blocks are stable before logging: take + * the write lock on the metadata semaphore to prevent target BIOs + * from modifying metadata. + */ + down_write(&zmd->mblk_sem); + dev = zmd->sb[zmd->mblk_primary].dev; + + /* + * This is called from the target flush work and reclaim work. + * Concurrent execution is not allowed. + */ + dmz_lock_flush(zmd); + + if (dmz_bdev_is_dying(dev)) { + ret = -EIO; + goto out; + } + + /* Get dirty blocks */ + spin_lock(&zmd->mblk_lock); + list_splice_init(&zmd->mblk_dirty_list, &write_list); + spin_unlock(&zmd->mblk_lock); + + /* If there are no dirty metadata blocks, just flush the device cache */ + if (list_empty(&write_list)) { + ret = blkdev_issue_flush(dev->bdev); + goto err; + } + + /* + * The primary metadata set is still clean. Keep it this way until + * all updates are successful in the secondary set. That is, use + * the secondary set as a log. + */ + ret = dmz_log_dirty_mblocks(zmd, &write_list); + if (ret) + goto err; + + /* + * The log is on disk. It is now safe to update in place + * in the primary metadata set. + */ + ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); + if (ret) + goto err; + + ret = dmz_write_sb(zmd, zmd->mblk_primary); + if (ret) + goto err; + + while (!list_empty(&write_list)) { + mblk = list_first_entry(&write_list, struct dmz_mblock, link); + list_del_init(&mblk->link); + + spin_lock(&zmd->mblk_lock); + clear_bit(DMZ_META_DIRTY, &mblk->state); + if (mblk->ref == 0) + list_add_tail(&mblk->link, &zmd->mblk_lru_list); + spin_unlock(&zmd->mblk_lock); + } + + zmd->sb_gen++; +out: + dmz_unlock_flush(zmd); + up_write(&zmd->mblk_sem); + + return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(dev)) + ret = -EIO; + goto out; +} + +/* + * Check super block. + */ +static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, + bool tertiary) +{ + struct dmz_super *sb = dsb->sb; + struct dmz_dev *dev = dsb->dev; + unsigned int nr_meta_zones, nr_data_zones; + u32 crc, stored_crc; + u64 gen, sb_block; + + if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { + dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", + DMZ_MAGIC, le32_to_cpu(sb->magic)); + return -ENXIO; + } + + zmd->sb_version = le32_to_cpu(sb->version); + if (zmd->sb_version > DMZ_META_VER) { + dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", + DMZ_META_VER, zmd->sb_version); + return -EINVAL; + } + if (zmd->sb_version < 2 && tertiary) { + dmz_dev_err(dev, "Tertiary superblocks are not supported"); + return -EINVAL; + } + + gen = le64_to_cpu(sb->gen); + stored_crc = le32_to_cpu(sb->crc); + sb->crc = 0; + crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE); + if (crc != stored_crc) { + dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)", + crc, stored_crc); + return -ENXIO; + } + + sb_block = le64_to_cpu(sb->sb_block); + if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) { + dmz_dev_err(dev, "Invalid superblock position (is %llu expected %llu)", + sb_block, (u64)dsb->zone->id << zmd->zone_nr_blocks_shift); + return -EINVAL; + } + if (zmd->sb_version > 1) { + uuid_t sb_uuid; + + import_uuid(&sb_uuid, sb->dmz_uuid); + if (uuid_is_null(&sb_uuid)) { + dmz_dev_err(dev, "NULL DM-Zoned uuid"); + return -ENXIO; + } else if (uuid_is_null(&zmd->uuid)) { + uuid_copy(&zmd->uuid, &sb_uuid); + } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) { + dmz_dev_err(dev, "mismatching DM-Zoned uuid, is %pUl expected %pUl", + &sb_uuid, &zmd->uuid); + return -ENXIO; + } + if (!strlen(zmd->label)) + memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE); + else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) { + dmz_dev_err(dev, "mismatching DM-Zoned label, is %s expected %s", + sb->dmz_label, zmd->label); + return -ENXIO; + } + import_uuid(&dev->uuid, sb->dev_uuid); + if (uuid_is_null(&dev->uuid)) { + dmz_dev_err(dev, "NULL device uuid"); + return -ENXIO; + } + + if (tertiary) { + /* + * Generation number should be 0, but it doesn't + * really matter if it isn't. + */ + if (gen != 0) + dmz_dev_warn(dev, "Invalid generation %llu", + gen); + return 0; + } + } + + nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) + >> zmd->zone_nr_blocks_shift; + if (!nr_meta_zones || + (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) || + (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) { + dmz_dev_err(dev, "Invalid number of metadata blocks"); + return -ENXIO; + } + + if (!le32_to_cpu(sb->nr_reserved_seq) || + le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) { + dmz_dev_err(dev, "Invalid number of reserved sequential zones"); + return -ENXIO; + } + + nr_data_zones = zmd->nr_useable_zones - + (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq)); + if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) { + dmz_dev_err(dev, "Invalid number of chunks %u / %u", + le32_to_cpu(sb->nr_chunks), nr_data_zones); + return -ENXIO; + } + + /* OK */ + zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks); + zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq); + zmd->nr_chunks = le32_to_cpu(sb->nr_chunks); + zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks); + zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks); + zmd->nr_meta_zones = nr_meta_zones; + zmd->nr_data_zones = nr_data_zones; + + return 0; +} + +/* + * Read the first or second super block from disk. + */ +static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) +{ + dmz_zmd_debug(zmd, "read superblock set %d dev %pg block %llu", + set, sb->dev->bdev, sb->block); + + return dmz_rdwr_block(sb->dev, REQ_OP_READ, + sb->block, sb->mblk->page); +} + +/* + * Determine the position of the secondary super blocks on disk. + * This is used only if a corruption of the primary super block + * is detected. + */ +static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) +{ + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; + struct dmz_mblock *mblk; + unsigned int zone_id = zmd->sb[0].zone->id; + int i; + + /* Allocate a block */ + mblk = dmz_alloc_mblock(zmd, 0); + if (!mblk) + return -ENOMEM; + + zmd->sb[1].mblk = mblk; + zmd->sb[1].sb = mblk->data; + + /* Bad first super block: search for the second one */ + zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; + zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); + zmd->sb[1].dev = zmd->sb[0].dev; + for (i = 1; i < zmd->nr_rnd_zones; i++) { + if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) + break; + if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) + return 0; + zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].zone = dmz_get(zmd, zone_id + i); + } + + dmz_free_mblock(zmd, mblk); + zmd->sb[1].mblk = NULL; + zmd->sb[1].zone = NULL; + zmd->sb[1].dev = NULL; + + return -EIO; +} + +/* + * Read a super block from disk. + */ +static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) +{ + struct dmz_mblock *mblk; + int ret; + + /* Allocate a block */ + mblk = dmz_alloc_mblock(zmd, 0); + if (!mblk) + return -ENOMEM; + + sb->mblk = mblk; + sb->sb = mblk->data; + + /* Read super block */ + ret = dmz_read_sb(zmd, sb, set); + if (ret) { + dmz_free_mblock(zmd, mblk); + sb->mblk = NULL; + return ret; + } + + return 0; +} + +/* + * Recover a metadata set. + */ +static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) +{ + unsigned int src_set = dst_set ^ 0x1; + struct page *page; + int i, ret; + + dmz_dev_warn(zmd->sb[dst_set].dev, + "Metadata set %u invalid: recovering", dst_set); + + if (dst_set == 0) + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + else + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); + + page = alloc_page(GFP_NOIO); + if (!page) + return -ENOMEM; + + /* Copy metadata blocks */ + for (i = 1; i < zmd->nr_meta_blocks; i++) { + ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ, + zmd->sb[src_set].block + i, page); + if (ret) + goto out; + ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE, + zmd->sb[dst_set].block + i, page); + if (ret) + goto out; + } + + /* Finalize with the super block */ + if (!zmd->sb[dst_set].mblk) { + zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0); + if (!zmd->sb[dst_set].mblk) { + ret = -ENOMEM; + goto out; + } + zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data; + } + + ret = dmz_write_sb(zmd, dst_set); +out: + __free_pages(page, 0); + + return ret; +} + +/* + * Get super block from disk. + */ +static int dmz_load_sb(struct dmz_metadata *zmd) +{ + bool sb_good[2] = {false, false}; + u64 sb_gen[2] = {0, 0}; + int ret; + + if (!zmd->sb[0].zone) { + dmz_zmd_err(zmd, "Primary super block zone not set"); + return -ENXIO; + } + + /* Read and check the primary super block */ + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = zmd->sb[0].zone->dev; + ret = dmz_get_sb(zmd, &zmd->sb[0], 0); + if (ret) { + dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); + return ret; + } + + ret = dmz_check_sb(zmd, &zmd->sb[0], false); + + /* Read and check secondary super block */ + if (ret == 0) { + sb_good[0] = true; + if (!zmd->sb[1].zone) { + unsigned int zone_id = + zmd->sb[0].zone->id + zmd->nr_meta_zones; + + zmd->sb[1].zone = dmz_get(zmd, zone_id); + } + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; + ret = dmz_get_sb(zmd, &zmd->sb[1], 1); + } else + ret = dmz_lookup_secondary_sb(zmd); + + if (ret) { + dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed"); + return ret; + } + + ret = dmz_check_sb(zmd, &zmd->sb[1], false); + if (ret == 0) + sb_good[1] = true; + + /* Use highest generation sb first */ + if (!sb_good[0] && !sb_good[1]) { + dmz_zmd_err(zmd, "No valid super block found"); + return -EIO; + } + + if (sb_good[0]) + sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); + else { + ret = dmz_recover_mblocks(zmd, 0); + if (ret) { + dmz_dev_err(zmd->sb[0].dev, + "Recovery of superblock 0 failed"); + return -EIO; + } + } + + if (sb_good[1]) + sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); + else { + ret = dmz_recover_mblocks(zmd, 1); + + if (ret) { + dmz_dev_err(zmd->sb[1].dev, + "Recovery of superblock 1 failed"); + return -EIO; + } + } + + if (sb_gen[0] >= sb_gen[1]) { + zmd->sb_gen = sb_gen[0]; + zmd->mblk_primary = 0; + } else { + zmd->sb_gen = sb_gen[1]; + zmd->mblk_primary = 1; + } + + dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, + "Using super block %u (gen %llu)", + zmd->mblk_primary, zmd->sb_gen); + + if (zmd->sb_version > 1) { + int i; + struct dmz_sb *sb; + + sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + for (i = 1; i < zmd->nr_devs; i++) { + sb->block = 0; + sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset); + sb->dev = &zmd->dev[i]; + if (!dmz_is_meta(sb->zone)) { + dmz_dev_err(sb->dev, + "Tertiary super block zone %u not marked as metadata zone", + sb->zone->id); + ret = -EINVAL; + goto out_kfree; + } + ret = dmz_get_sb(zmd, sb, i + 1); + if (ret) { + dmz_dev_err(sb->dev, + "Read tertiary super block failed"); + dmz_free_mblock(zmd, sb->mblk); + goto out_kfree; + } + ret = dmz_check_sb(zmd, sb, true); + dmz_free_mblock(zmd, sb->mblk); + if (ret == -EINVAL) + goto out_kfree; + } + out_kfree: + kfree(sb); + } + return ret; +} + +/* + * Initialize a zone descriptor. + */ +static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) +{ + struct dmz_dev *dev = data; + struct dmz_metadata *zmd = dev->metadata; + int idx = num + dev->zone_offset; + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx, dev); + if (IS_ERR(zone)) + return PTR_ERR(zone); + + if (blkz->len != zmd->zone_nr_sectors) { + if (zmd->sb_version > 1) { + /* Ignore the eventual runt (smaller) zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + return 0; + } else if (blkz->start + blkz->len == dev->capacity) + return 0; + return -ENXIO; + } + + /* + * Devices that have zones with a capacity smaller than the zone size + * (e.g. NVMe zoned namespaces) are not supported. + */ + if (blkz->capacity != blkz->len) + return -ENXIO; + + switch (blkz->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + set_bit(DMZ_RND, &zone->flags); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + set_bit(DMZ_SEQ, &zone->flags); + break; + default: + return -ENXIO; + } + + if (dmz_is_rnd(zone)) + zone->wp_block = 0; + else + zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); + + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); + else { + zmd->nr_useable_zones++; + if (dmz_is_rnd(zone)) { + zmd->nr_rnd_zones++; + if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { + /* Primary super block zone */ + zmd->sb[0].zone = zone; + } + } + if (zmd->nr_devs > 1 && num == 0) { + /* + * Tertiary superblock zones are always at the + * start of the zoned devices, so mark them + * as metadata zone. + */ + set_bit(DMZ_META, &zone->flags); + } + } + return 0; +} + +static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +{ + int idx; + sector_t zone_offset = 0; + + for(idx = 0; idx < dev->nr_zones; idx++) { + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx, dev); + if (IS_ERR(zone)) + return PTR_ERR(zone); + set_bit(DMZ_CACHE, &zone->flags); + zone->wp_block = 0; + zmd->nr_cache_zones++; + zmd->nr_useable_zones++; + if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { + /* Disable runt zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + break; + } + zone_offset += zmd->zone_nr_sectors; + } + return 0; +} + +/* + * Free zones descriptors. + */ +static void dmz_drop_zones(struct dmz_metadata *zmd) +{ + int idx; + + for(idx = 0; idx < zmd->nr_zones; idx++) { + struct dm_zone *zone = xa_load(&zmd->zones, idx); + + kfree(zone); + xa_erase(&zmd->zones, idx); + } + xa_destroy(&zmd->zones); +} + +/* + * Allocate and initialize zone descriptors using the zone + * information from disk. + */ +static int dmz_init_zones(struct dmz_metadata *zmd) +{ + int i, ret; + struct dmz_dev *zoned_dev = &zmd->dev[0]; + + /* Init */ + zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors; + zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); + zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); + zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); + zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3; + zmd->zone_nr_bitmap_blocks = + max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); + zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks, + DMZ_BLOCK_SIZE_BITS); + + /* Allocate zone array */ + zmd->nr_zones = 0; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + dev->metadata = zmd; + zmd->nr_zones += dev->nr_zones; + + atomic_set(&dev->unmap_nr_rnd, 0); + INIT_LIST_HEAD(&dev->unmap_rnd_list); + INIT_LIST_HEAD(&dev->map_rnd_list); + + atomic_set(&dev->unmap_nr_seq, 0); + INIT_LIST_HEAD(&dev->unmap_seq_list); + INIT_LIST_HEAD(&dev->map_seq_list); + } + + if (!zmd->nr_zones) { + DMERR("(%s): No zones found", zmd->devname); + return -ENXIO; + } + xa_init(&zmd->zones); + + DMDEBUG("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); + + if (zmd->nr_devs > 1) { + ret = dmz_emulate_zones(zmd, &zmd->dev[0]); + if (ret < 0) { + DMDEBUG("(%s): Failed to emulate zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + + /* + * Primary superblock zone is always at zone 0 when multiple + * drives are present. + */ + zmd->sb[0].zone = dmz_get(zmd, 0); + + for (i = 1; i < zmd->nr_devs; i++) { + zoned_dev = &zmd->dev[i]; + + ret = blkdev_report_zones(zoned_dev->bdev, 0, + BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); + if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + } + return 0; + } + + /* + * Get zone information and initialize zone descriptors. At the same + * time, determine where the super block should be: first block of the + * first randomly writable zone. + */ + ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); + if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + + return 0; +} + +static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, + void *data) +{ + struct dm_zone *zone = data; + + clear_bit(DMZ_OFFLINE, &zone->flags); + clear_bit(DMZ_READ_ONLY, &zone->flags); + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); + + if (dmz_is_seq(zone)) + zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); + else + zone->wp_block = 0; + return 0; +} + +/* + * Update a zone information. + */ +static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + struct dmz_dev *dev = zone->dev; + unsigned int noio_flag; + int ret; + + if (dev->flags & DMZ_BDEV_REGULAR) + return 0; + + /* + * Get zone information from disk. Since blkdev_report_zones() uses + * GFP_KERNEL by default for memory allocations, set the per-task + * PF_MEMALLOC_NOIO flag so that all allocations are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1, + dmz_update_zone_cb, zone); + memalloc_noio_restore(noio_flag); + + if (ret == 0) + ret = -EIO; + if (ret < 0) { + dmz_dev_err(dev, "Get zone %u report failed", + zone->id); + dmz_check_bdev(dev); + return ret; + } + + return 0; +} + +/* + * Check a zone write pointer position when the zone is marked + * with the sequential write error flag. + */ +static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, + struct dm_zone *zone) +{ + struct dmz_dev *dev = zone->dev; + unsigned int wp = 0; + int ret; + + wp = zone->wp_block; + ret = dmz_update_zone(zmd, zone); + if (ret) + return ret; + + dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)", + zone->id, zone->wp_block, wp); + + if (zone->wp_block < wp) { + dmz_invalidate_blocks(zmd, zone, zone->wp_block, + wp - zone->wp_block); + } + + return 0; +} + +/* + * Reset a zone write pointer. + */ +static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + int ret; + + /* + * Ignore offline zones, read only zones, + * and conventional zones. + */ + if (dmz_is_offline(zone) || + dmz_is_readonly(zone) || + dmz_is_rnd(zone)) + return 0; + + if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { + struct dmz_dev *dev = zone->dev; + + ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, + dmz_start_sect(zmd, zone), + zmd->zone_nr_sectors, GFP_NOIO); + if (ret) { + dmz_dev_err(dev, "Reset zone %u failed %d", + zone->id, ret); + return ret; + } + } + + /* Clear write error bit and rewind write pointer position */ + clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); + zone->wp_block = 0; + + return 0; +} + +static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); + +/* + * Initialize chunk mapping. + */ +static int dmz_load_mapping(struct dmz_metadata *zmd) +{ + struct dm_zone *dzone, *bzone; + struct dmz_mblock *dmap_mblk = NULL; + struct dmz_map *dmap; + unsigned int i = 0, e = 0, chunk = 0; + unsigned int dzone_id; + unsigned int bzone_id; + + /* Metadata block array for the chunk mapping table */ + zmd->map_mblk = kcalloc(zmd->nr_map_blocks, + sizeof(struct dmz_mblk *), GFP_KERNEL); + if (!zmd->map_mblk) + return -ENOMEM; + + /* Get chunk mapping table blocks and initialize zone mapping */ + while (chunk < zmd->nr_chunks) { + if (!dmap_mblk) { + /* Get mapping block */ + dmap_mblk = dmz_get_mblock(zmd, i + 1); + if (IS_ERR(dmap_mblk)) + return PTR_ERR(dmap_mblk); + zmd->map_mblk[i] = dmap_mblk; + dmap = (struct dmz_map *) dmap_mblk->data; + i++; + e = 0; + } + + /* Check data zone */ + dzone_id = le32_to_cpu(dmap[e].dzone_id); + if (dzone_id == DMZ_MAP_UNMAPPED) + goto next; + + if (dzone_id >= zmd->nr_zones) { + dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u", + chunk, dzone_id); + return -EIO; + } + + dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present", + chunk, dzone_id); + return -EIO; + } + set_bit(DMZ_DATA, &dzone->flags); + dzone->chunk = chunk; + dmz_get_zone_weight(zmd, dzone); + + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); + else + list_add_tail(&dzone->link, &dzone->dev->map_seq_list); + + /* Check buffer zone */ + bzone_id = le32_to_cpu(dmap[e].bzone_id); + if (bzone_id == DMZ_MAP_UNMAPPED) + goto next; + + if (bzone_id >= zmd->nr_zones) { + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u", + chunk, bzone_id); + return -EIO; + } + + bzone = dmz_get(zmd, bzone_id); + if (!bzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present", + chunk, bzone_id); + return -EIO; + } + if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", + chunk, bzone_id); + return -EIO; + } + + set_bit(DMZ_DATA, &bzone->flags); + set_bit(DMZ_BUF, &bzone->flags); + bzone->chunk = chunk; + bzone->bzone = dzone; + dzone->bzone = bzone; + dmz_get_zone_weight(zmd, bzone); + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); +next: + chunk++; + e++; + if (e >= DMZ_MAP_ENTRIES) + dmap_mblk = NULL; + } + + /* + * At this point, only meta zones and mapped data zones were + * fully initialized. All remaining zones are unmapped data + * zones. Finish initializing those here. + */ + for (i = 0; i < zmd->nr_zones; i++) { + dzone = dmz_get(zmd, i); + if (!dzone) + continue; + if (dmz_is_meta(dzone)) + continue; + if (dmz_is_offline(dzone)) + continue; + + if (dmz_is_cache(dzone)) + zmd->nr_cache++; + else if (dmz_is_rnd(dzone)) + dzone->dev->nr_rnd++; + else + dzone->dev->nr_seq++; + + if (dmz_is_data(dzone)) { + /* Already initialized */ + continue; + } + + /* Unmapped data zone */ + set_bit(DMZ_DATA, &dzone->flags); + dzone->chunk = DMZ_MAP_UNMAPPED; + if (dmz_is_cache(dzone)) { + list_add_tail(&dzone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(dzone)) { + list_add_tail(&dzone->link, + &dzone->dev->unmap_rnd_list); + atomic_inc(&dzone->dev->unmap_nr_rnd); + } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { + list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); + set_bit(DMZ_RESERVED, &dzone->flags); + atomic_inc(&zmd->nr_reserved_seq_zones); + dzone->dev->nr_seq--; + } else { + list_add_tail(&dzone->link, + &dzone->dev->unmap_seq_list); + atomic_inc(&dzone->dev->unmap_nr_seq); + } + } + + return 0; +} + +/* + * Set a data chunk mapping. + */ +static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, + unsigned int dzone_id, unsigned int bzone_id) +{ + struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; + struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; + int map_idx = chunk & DMZ_MAP_ENTRIES_MASK; + + dmap[map_idx].dzone_id = cpu_to_le32(dzone_id); + dmap[map_idx].bzone_id = cpu_to_le32(bzone_id); + dmz_dirty_mblock(zmd, dmap_mblk); +} + +/* + * The list of mapped zones is maintained in LRU order. + * This rotates a zone at the end of its map list. + */ +static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + if (list_empty(&zone->link)) + return; + + list_del_init(&zone->link); + if (dmz_is_seq(zone)) { + /* LRU rotate sequential zone */ + list_add_tail(&zone->link, &zone->dev->map_seq_list); + } else if (dmz_is_cache(zone)) { + /* LRU rotate cache zone */ + list_add_tail(&zone->link, &zmd->map_cache_list); + } else { + /* LRU rotate random zone */ + list_add_tail(&zone->link, &zone->dev->map_rnd_list); + } +} + +/* + * The list of mapped random zones is maintained + * in LRU order. This rotates a zone at the end of the list. + */ +static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + __dmz_lru_zone(zmd, zone); + if (zone->bzone) + __dmz_lru_zone(zmd, zone->bzone); +} + +/* + * Wait for any zone to be freed. + */ +static void dmz_wait_for_free_zones(struct dmz_metadata *zmd) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE); + dmz_unlock_map(zmd); + dmz_unlock_metadata(zmd); + + io_schedule_timeout(HZ); + + dmz_lock_metadata(zmd); + dmz_lock_map(zmd); + finish_wait(&zmd->free_wq, &wait); +} + +/* + * Lock a zone for reclaim (set the zone RECLAIM bit). + * Returns false if the zone cannot be locked or if it is already locked + * and 1 otherwise. + */ +int dmz_lock_zone_reclaim(struct dm_zone *zone) +{ + /* Active zones cannot be reclaimed */ + if (dmz_is_active(zone)) + return 0; + + return !test_and_set_bit(DMZ_RECLAIM, &zone->flags); +} + +/* + * Clear a zone reclaim flag. + */ +void dmz_unlock_zone_reclaim(struct dm_zone *zone) +{ + WARN_ON(dmz_is_active(zone)); + WARN_ON(!dmz_in_reclaim(zone)); + + clear_bit_unlock(DMZ_RECLAIM, &zone->flags); + smp_mb__after_atomic(); + wake_up_bit(&zone->flags, DMZ_RECLAIM); +} + +/* + * Wait for a zone reclaim to complete. + */ +static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + dmz_unlock_map(zmd); + dmz_unlock_metadata(zmd); + set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); + wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); + clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); + dmz_lock_metadata(zmd); + dmz_lock_map(zmd); +} + +/* + * Select a cache or random write zone for reclaim. + */ +static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int idx, bool idle) +{ + struct dm_zone *dzone = NULL; + struct dm_zone *zone, *maxw_z = NULL; + struct list_head *zone_list; + + /* If we have cache zones select from the cache zone list */ + if (zmd->nr_cache) { + zone_list = &zmd->map_cache_list; + /* Try to relaim random zones, too, when idle */ + if (idle && list_empty(zone_list)) + zone_list = &zmd->dev[idx].map_rnd_list; + } else + zone_list = &zmd->dev[idx].map_rnd_list; + + /* + * Find the buffer zone with the heaviest weight or the first (oldest) + * data zone that can be reclaimed. + */ + list_for_each_entry(zone, zone_list, link) { + if (dmz_is_buf(zone)) { + dzone = zone->bzone; + if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) + continue; + if (!maxw_z || maxw_z->weight < dzone->weight) + maxw_z = dzone; + } else { + dzone = zone; + if (dmz_lock_zone_reclaim(dzone)) + return dzone; + } + } + + if (maxw_z && dmz_lock_zone_reclaim(maxw_z)) + return maxw_z; + + /* + * If we come here, none of the zones inspected could be locked for + * reclaim. Try again, being more aggressive, that is, find the + * first zone that can be reclaimed regardless of its weitght. + */ + list_for_each_entry(zone, zone_list, link) { + if (dmz_is_buf(zone)) { + dzone = zone->bzone; + if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) + continue; + } else + dzone = zone; + if (dmz_lock_zone_reclaim(dzone)) + return dzone; + } + + return NULL; +} + +/* + * Select a buffered sequential zone for reclaim. + */ +static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int idx) +{ + struct dm_zone *zone; + + list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) { + if (!zone->bzone) + continue; + if (dmz_lock_zone_reclaim(zone)) + return zone; + } + + return NULL; +} + +/* + * Select a zone for reclaim. + */ +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle) +{ + struct dm_zone *zone = NULL; + + /* + * Search for a zone candidate to reclaim: 2 cases are possible. + * (1) There is no free sequential zones. Then a random data zone + * cannot be reclaimed. So choose a sequential zone to reclaim so + * that afterward a random zone can be reclaimed. + * (2) At least one free sequential zone is available, then choose + * the oldest random zone (data or buffer) that can be locked. + */ + dmz_lock_map(zmd); + if (list_empty(&zmd->reserved_seq_zones_list)) + zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx); + if (!zone) + zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle); + dmz_unlock_map(zmd); + + return zone; +} + +/* + * Get the zone mapping a chunk, if the chunk is mapped already. + * If no mapping exist and the operation is WRITE, a zone is + * allocated and used to map the chunk. + * The zone returned will be set to the active state. + */ +struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, + unsigned int chunk, enum req_op op) +{ + struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; + struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; + int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; + unsigned int dzone_id; + struct dm_zone *dzone = NULL; + int ret = 0; + int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; + + dmz_lock_map(zmd); +again: + /* Get the chunk mapping */ + dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id); + if (dzone_id == DMZ_MAP_UNMAPPED) { + /* + * Read or discard in unmapped chunks are fine. But for + * writes, we need a mapping, so get one. + */ + if (op != REQ_OP_WRITE) + goto out; + + /* Allocate a random zone */ + dzone = dmz_alloc_zone(zmd, 0, alloc_flags); + if (!dzone) { + if (dmz_dev_is_dying(zmd)) { + dzone = ERR_PTR(-EIO); + goto out; + } + dmz_wait_for_free_zones(zmd); + goto again; + } + + dmz_map_zone(zmd, dzone, chunk); + + } else { + /* The chunk is already mapped: get the mapping zone */ + dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dzone = ERR_PTR(-EIO); + goto out; + } + if (dzone->chunk != chunk) { + dzone = ERR_PTR(-EIO); + goto out; + } + + /* Repair write pointer if the sequential dzone has error */ + if (dmz_seq_write_err(dzone)) { + ret = dmz_handle_seq_write_err(zmd, dzone); + if (ret) { + dzone = ERR_PTR(-EIO); + goto out; + } + clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags); + } + } + + /* + * If the zone is being reclaimed, the chunk mapping may change + * to a different zone. So wait for reclaim and retry. Otherwise, + * activate the zone (this will prevent reclaim from touching it). + */ + if (dmz_in_reclaim(dzone)) { + dmz_wait_for_reclaim(zmd, dzone); + goto again; + } + dmz_activate_zone(dzone); + dmz_lru_zone(zmd, dzone); +out: + dmz_unlock_map(zmd); + + return dzone; +} + +/* + * Write and discard change the block validity of data zones and their buffer + * zones. Check here that valid blocks are still present. If all blocks are + * invalid, the zones can be unmapped on the fly without waiting for reclaim + * to do it. + */ +void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone) +{ + struct dm_zone *bzone; + + dmz_lock_map(zmd); + + bzone = dzone->bzone; + if (bzone) { + if (dmz_weight(bzone)) + dmz_lru_zone(zmd, bzone); + else { + /* Empty buffer zone: reclaim it */ + dmz_unmap_zone(zmd, bzone); + dmz_free_zone(zmd, bzone); + bzone = NULL; + } + } + + /* Deactivate the data zone */ + dmz_deactivate_zone(dzone); + if (dmz_is_active(dzone) || bzone || dmz_weight(dzone)) + dmz_lru_zone(zmd, dzone); + else { + /* Unbuffered inactive empty data zone: reclaim it */ + dmz_unmap_zone(zmd, dzone); + dmz_free_zone(zmd, dzone); + } + + dmz_unlock_map(zmd); +} + +/* + * Allocate and map a random zone to buffer a chunk + * already mapped to a sequential zone. + */ +struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, + struct dm_zone *dzone) +{ + struct dm_zone *bzone; + int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; + + dmz_lock_map(zmd); +again: + bzone = dzone->bzone; + if (bzone) + goto out; + + /* Allocate a random zone */ + bzone = dmz_alloc_zone(zmd, 0, alloc_flags); + if (!bzone) { + if (dmz_dev_is_dying(zmd)) { + bzone = ERR_PTR(-EIO); + goto out; + } + dmz_wait_for_free_zones(zmd); + goto again; + } + + /* Update the chunk mapping */ + dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id); + + set_bit(DMZ_BUF, &bzone->flags); + bzone->chunk = dzone->chunk; + bzone->bzone = dzone; + dzone->bzone = bzone; + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); +out: + dmz_unlock_map(zmd); + + return bzone; +} + +/* + * Get an unmapped (free) zone. + * This must be called with the mapping lock held. + */ +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, + unsigned long flags) +{ + struct list_head *list; + struct dm_zone *zone; + int i; + + /* Schedule reclaim to ensure free zones are available */ + if (!(flags & DMZ_ALLOC_RECLAIM)) { + for (i = 0; i < zmd->nr_devs; i++) + dmz_schedule_reclaim(zmd->dev[i].reclaim); + } + + i = 0; +again: + if (flags & DMZ_ALLOC_CACHE) + list = &zmd->unmap_cache_list; + else if (flags & DMZ_ALLOC_RND) + list = &zmd->dev[dev_idx].unmap_rnd_list; + else + list = &zmd->dev[dev_idx].unmap_seq_list; + + if (list_empty(list)) { + /* + * No free zone: return NULL if this is for not reclaim. + */ + if (!(flags & DMZ_ALLOC_RECLAIM)) + return NULL; + /* + * Try to allocate from other devices + */ + if (i < zmd->nr_devs) { + dev_idx = (dev_idx + 1) % zmd->nr_devs; + i++; + goto again; + } + + /* + * Fallback to the reserved sequential zones + */ + zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list, + struct dm_zone, link); + if (zone) { + list_del_init(&zone->link); + atomic_dec(&zmd->nr_reserved_seq_zones); + } + return zone; + } + + zone = list_first_entry(list, struct dm_zone, link); + list_del_init(&zone->link); + + if (dmz_is_cache(zone)) + atomic_dec(&zmd->unmap_nr_cache); + else if (dmz_is_rnd(zone)) + atomic_dec(&zone->dev->unmap_nr_rnd); + else + atomic_dec(&zone->dev->unmap_nr_seq); + + if (dmz_is_offline(zone)) { + dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); + zone = NULL; + goto again; + } + if (dmz_is_meta(zone)) { + dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id); + zone = NULL; + goto again; + } + return zone; +} + +/* + * Free a zone. + * This must be called with the mapping lock held. + */ +void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + /* If this is a sequential zone, reset it */ + if (dmz_is_seq(zone)) + dmz_reset_zone(zmd, zone); + + /* Return the zone to its type unmap list */ + if (dmz_is_cache(zone)) { + list_add_tail(&zone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(zone)) { + list_add_tail(&zone->link, &zone->dev->unmap_rnd_list); + atomic_inc(&zone->dev->unmap_nr_rnd); + } else if (dmz_is_reserved(zone)) { + list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); + atomic_inc(&zmd->nr_reserved_seq_zones); + } else { + list_add_tail(&zone->link, &zone->dev->unmap_seq_list); + atomic_inc(&zone->dev->unmap_nr_seq); + } + + wake_up_all(&zmd->free_wq); +} + +/* + * Map a chunk to a zone. + * This must be called with the mapping lock held. + */ +void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, + unsigned int chunk) +{ + /* Set the chunk mapping */ + dmz_set_chunk_mapping(zmd, chunk, dzone->id, + DMZ_MAP_UNMAPPED); + dzone->chunk = chunk; + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); + else + list_add_tail(&dzone->link, &dzone->dev->map_seq_list); +} + +/* + * Unmap a zone. + * This must be called with the mapping lock held. + */ +void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int chunk = zone->chunk; + unsigned int dzone_id; + + if (chunk == DMZ_MAP_UNMAPPED) { + /* Already unmapped */ + return; + } + + if (test_and_clear_bit(DMZ_BUF, &zone->flags)) { + /* + * Unmapping the chunk buffer zone: clear only + * the chunk buffer mapping + */ + dzone_id = zone->bzone->id; + zone->bzone->bzone = NULL; + zone->bzone = NULL; + + } else { + /* + * Unmapping the chunk data zone: the zone must + * not be buffered. + */ + if (WARN_ON(zone->bzone)) { + zone->bzone->bzone = NULL; + zone->bzone = NULL; + } + dzone_id = DMZ_MAP_UNMAPPED; + } + + dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED); + + zone->chunk = DMZ_MAP_UNMAPPED; + list_del_init(&zone->link); +} + +/* + * Set @nr_bits bits in @bitmap starting from @bit. + * Return the number of bits changed from 0 to 1. + */ +static unsigned int dmz_set_bits(unsigned long *bitmap, + unsigned int bit, unsigned int nr_bits) +{ + unsigned long *addr; + unsigned int end = bit + nr_bits; + unsigned int n = 0; + + while (bit < end) { + if (((bit & (BITS_PER_LONG - 1)) == 0) && + ((end - bit) >= BITS_PER_LONG)) { + /* Try to set the whole word at once */ + addr = bitmap + BIT_WORD(bit); + if (*addr == 0) { + *addr = ULONG_MAX; + n += BITS_PER_LONG; + bit += BITS_PER_LONG; + continue; + } + } + + if (!test_and_set_bit(bit, bitmap)) + n++; + bit++; + } + + return n; +} + +/* + * Get the bitmap block storing the bit for chunk_block in zone. + */ +static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, + struct dm_zone *zone, + sector_t chunk_block) +{ + sector_t bitmap_block = 1 + zmd->nr_map_blocks + + (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) + + (chunk_block >> DMZ_BLOCK_SHIFT_BITS); + + return dmz_get_mblock(zmd, bitmap_block); +} + +/* + * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone. + */ +int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, + struct dm_zone *to_zone) +{ + struct dmz_mblock *from_mblk, *to_mblk; + sector_t chunk_block = 0; + + /* Get the zones bitmap blocks */ + while (chunk_block < zmd->zone_nr_blocks) { + from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); + if (IS_ERR(from_mblk)) + return PTR_ERR(from_mblk); + to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block); + if (IS_ERR(to_mblk)) { + dmz_release_mblock(zmd, from_mblk); + return PTR_ERR(to_mblk); + } + + memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE); + dmz_dirty_mblock(zmd, to_mblk); + + dmz_release_mblock(zmd, to_mblk); + dmz_release_mblock(zmd, from_mblk); + + chunk_block += zmd->zone_bits_per_mblk; + } + + to_zone->weight = from_zone->weight; + + return 0; +} + +/* + * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone, + * starting from chunk_block. + */ +int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, + struct dm_zone *to_zone, sector_t chunk_block) +{ + unsigned int nr_blocks; + int ret; + + /* Get the zones bitmap blocks */ + while (chunk_block < zmd->zone_nr_blocks) { + /* Get a valid region from the source zone */ + ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); + if (ret <= 0) + return ret; + + nr_blocks = ret; + ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks); + if (ret) + return ret; + + chunk_block += nr_blocks; + } + + return 0; +} + +/* + * Validate all the blocks in the range [block..block+nr_blocks-1]. + */ +int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block, unsigned int nr_blocks) +{ + unsigned int count, bit, nr_bits; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; + struct dmz_mblock *mblk; + unsigned int n = 0; + + dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks", + zone->id, (unsigned long long)chunk_block, + nr_blocks); + + WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); + + while (nr_blocks) { + /* Get bitmap block */ + mblk = dmz_get_bitmap(zmd, zone, chunk_block); + if (IS_ERR(mblk)) + return PTR_ERR(mblk); + + /* Set bits */ + bit = chunk_block & DMZ_BLOCK_MASK_BITS; + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); + + count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); + if (count) { + dmz_dirty_mblock(zmd, mblk); + n += count; + } + dmz_release_mblock(zmd, mblk); + + nr_blocks -= nr_bits; + chunk_block += nr_bits; + } + + if (likely(zone->weight + n <= zone_nr_blocks)) + zone->weight += n; + else { + dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u", + zone->id, zone->weight, + zone_nr_blocks - n); + zone->weight = zone_nr_blocks; + } + + return 0; +} + +/* + * Clear nr_bits bits in bitmap starting from bit. + * Return the number of bits cleared. + */ +static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits) +{ + unsigned long *addr; + int end = bit + nr_bits; + int n = 0; + + while (bit < end) { + if (((bit & (BITS_PER_LONG - 1)) == 0) && + ((end - bit) >= BITS_PER_LONG)) { + /* Try to clear whole word at once */ + addr = bitmap + BIT_WORD(bit); + if (*addr == ULONG_MAX) { + *addr = 0; + n += BITS_PER_LONG; + bit += BITS_PER_LONG; + continue; + } + } + + if (test_and_clear_bit(bit, bitmap)) + n++; + bit++; + } + + return n; +} + +/* + * Invalidate all the blocks in the range [block..block+nr_blocks-1]. + */ +int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block, unsigned int nr_blocks) +{ + unsigned int count, bit, nr_bits; + struct dmz_mblock *mblk; + unsigned int n = 0; + + dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks", + zone->id, (u64)chunk_block, nr_blocks); + + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); + + while (nr_blocks) { + /* Get bitmap block */ + mblk = dmz_get_bitmap(zmd, zone, chunk_block); + if (IS_ERR(mblk)) + return PTR_ERR(mblk); + + /* Clear bits */ + bit = chunk_block & DMZ_BLOCK_MASK_BITS; + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); + + count = dmz_clear_bits((unsigned long *)mblk->data, + bit, nr_bits); + if (count) { + dmz_dirty_mblock(zmd, mblk); + n += count; + } + dmz_release_mblock(zmd, mblk); + + nr_blocks -= nr_bits; + chunk_block += nr_bits; + } + + if (zone->weight >= n) + zone->weight -= n; + else { + dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u", + zone->id, zone->weight, n); + zone->weight = 0; + } + + return 0; +} + +/* + * Get a block bit value. + */ +static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block) +{ + struct dmz_mblock *mblk; + int ret; + + WARN_ON(chunk_block >= zmd->zone_nr_blocks); + + /* Get bitmap block */ + mblk = dmz_get_bitmap(zmd, zone, chunk_block); + if (IS_ERR(mblk)) + return PTR_ERR(mblk); + + /* Get offset */ + ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS, + (unsigned long *) mblk->data) != 0; + + dmz_release_mblock(zmd, mblk); + + return ret; +} + +/* + * Return the number of blocks from chunk_block to the first block with a bit + * value specified by set. Search at most nr_blocks blocks from chunk_block. + */ +static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block, unsigned int nr_blocks, + int set) +{ + struct dmz_mblock *mblk; + unsigned int bit, set_bit, nr_bits; + unsigned int zone_bits = zmd->zone_bits_per_mblk; + unsigned long *bitmap; + int n = 0; + + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); + + while (nr_blocks) { + /* Get bitmap block */ + mblk = dmz_get_bitmap(zmd, zone, chunk_block); + if (IS_ERR(mblk)) + return PTR_ERR(mblk); + + /* Get offset */ + bitmap = (unsigned long *) mblk->data; + bit = chunk_block & DMZ_BLOCK_MASK_BITS; + nr_bits = min(nr_blocks, zone_bits - bit); + if (set) + set_bit = find_next_bit(bitmap, zone_bits, bit); + else + set_bit = find_next_zero_bit(bitmap, zone_bits, bit); + dmz_release_mblock(zmd, mblk); + + n += set_bit - bit; + if (set_bit < zone_bits) + break; + + nr_blocks -= nr_bits; + chunk_block += nr_bits; + } + + return n; +} + +/* + * Test if chunk_block is valid. If it is, the number of consecutive + * valid blocks from chunk_block will be returned. + */ +int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block) +{ + int valid; + + valid = dmz_test_block(zmd, zone, chunk_block); + if (valid <= 0) + return valid; + + /* The block is valid: get the number of valid blocks from block */ + return dmz_to_next_set_block(zmd, zone, chunk_block, + zmd->zone_nr_blocks - chunk_block, 0); +} + +/* + * Find the first valid block from @chunk_block in @zone. + * If such a block is found, its number is returned using + * @chunk_block and the total number of valid blocks from @chunk_block + * is returned. + */ +int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t *chunk_block) +{ + sector_t start_block = *chunk_block; + int ret; + + ret = dmz_to_next_set_block(zmd, zone, start_block, + zmd->zone_nr_blocks - start_block, 1); + if (ret < 0) + return ret; + + start_block += ret; + *chunk_block = start_block; + + return dmz_to_next_set_block(zmd, zone, start_block, + zmd->zone_nr_blocks - start_block, 0); +} + +/* + * Count the number of bits set starting from bit up to bit + nr_bits - 1. + */ +static int dmz_count_bits(void *bitmap, int bit, int nr_bits) +{ + unsigned long *addr; + int end = bit + nr_bits; + int n = 0; + + while (bit < end) { + if (((bit & (BITS_PER_LONG - 1)) == 0) && + ((end - bit) >= BITS_PER_LONG)) { + addr = (unsigned long *)bitmap + BIT_WORD(bit); + if (*addr == ULONG_MAX) { + n += BITS_PER_LONG; + bit += BITS_PER_LONG; + continue; + } + } + + if (test_bit(bit, bitmap)) + n++; + bit++; + } + + return n; +} + +/* + * Get a zone weight. + */ +static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + struct dmz_mblock *mblk; + sector_t chunk_block = 0; + unsigned int bit, nr_bits; + unsigned int nr_blocks = zmd->zone_nr_blocks; + void *bitmap; + int n = 0; + + while (nr_blocks) { + /* Get bitmap block */ + mblk = dmz_get_bitmap(zmd, zone, chunk_block); + if (IS_ERR(mblk)) { + n = 0; + break; + } + + /* Count bits in this block */ + bitmap = mblk->data; + bit = chunk_block & DMZ_BLOCK_MASK_BITS; + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); + n += dmz_count_bits(bitmap, bit, nr_bits); + + dmz_release_mblock(zmd, mblk); + + nr_blocks -= nr_bits; + chunk_block += nr_bits; + } + + zone->weight = n; +} + +/* + * Cleanup the zoned metadata resources. + */ +static void dmz_cleanup_metadata(struct dmz_metadata *zmd) +{ + struct rb_root *root; + struct dmz_mblock *mblk, *next; + int i; + + /* Release zone mapping resources */ + if (zmd->map_mblk) { + for (i = 0; i < zmd->nr_map_blocks; i++) + dmz_release_mblock(zmd, zmd->map_mblk[i]); + kfree(zmd->map_mblk); + zmd->map_mblk = NULL; + } + + /* Release super blocks */ + for (i = 0; i < 2; i++) { + if (zmd->sb[i].mblk) { + dmz_free_mblock(zmd, zmd->sb[i].mblk); + zmd->sb[i].mblk = NULL; + } + } + + /* Free cached blocks */ + while (!list_empty(&zmd->mblk_dirty_list)) { + mblk = list_first_entry(&zmd->mblk_dirty_list, + struct dmz_mblock, link); + dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)", + (u64)mblk->no, mblk->ref); + list_del_init(&mblk->link); + rb_erase(&mblk->node, &zmd->mblk_rbtree); + dmz_free_mblock(zmd, mblk); + } + + while (!list_empty(&zmd->mblk_lru_list)) { + mblk = list_first_entry(&zmd->mblk_lru_list, + struct dmz_mblock, link); + list_del_init(&mblk->link); + rb_erase(&mblk->node, &zmd->mblk_rbtree); + dmz_free_mblock(zmd, mblk); + } + + /* Sanity checks: the mblock rbtree should now be empty */ + root = &zmd->mblk_rbtree; + rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { + dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree", + (u64)mblk->no, mblk->ref); + mblk->ref = 0; + dmz_free_mblock(zmd, mblk); + } + + /* Free the zone descriptors */ + dmz_drop_zones(zmd); + + mutex_destroy(&zmd->mblk_flush_lock); + mutex_destroy(&zmd->map_lock); +} + +static void dmz_print_dev(struct dmz_metadata *zmd, int num) +{ + struct dmz_dev *dev = &zmd->dev[num]; + + if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) + dmz_dev_info(dev, "Regular block device"); + else + dmz_dev_info(dev, "Host-%s zoned block device", + bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? + "aware" : "managed"); + if (zmd->sb_version > 1) { + sector_t sector_offset = + dev->zone_offset << zmd->zone_nr_sectors_shift; + + dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)", + (u64)dev->capacity, (u64)sector_offset); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)", + dev->nr_zones, (u64)zmd->zone_nr_sectors, + (u64)dev->zone_offset); + } else { + dmz_dev_info(dev, " %llu 512-byte logical sectors", + (u64)dev->capacity); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dev->nr_zones, (u64)zmd->zone_nr_sectors); + } +} + +/* + * Initialize the zoned metadata. + */ +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **metadata, + const char *devname) +{ + struct dmz_metadata *zmd; + unsigned int i; + struct dm_zone *zone; + int ret; + + zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL); + if (!zmd) + return -ENOMEM; + + strcpy(zmd->devname, devname); + zmd->dev = dev; + zmd->nr_devs = num_dev; + zmd->mblk_rbtree = RB_ROOT; + init_rwsem(&zmd->mblk_sem); + mutex_init(&zmd->mblk_flush_lock); + spin_lock_init(&zmd->mblk_lock); + INIT_LIST_HEAD(&zmd->mblk_lru_list); + INIT_LIST_HEAD(&zmd->mblk_dirty_list); + + mutex_init(&zmd->map_lock); + + atomic_set(&zmd->unmap_nr_cache, 0); + INIT_LIST_HEAD(&zmd->unmap_cache_list); + INIT_LIST_HEAD(&zmd->map_cache_list); + + atomic_set(&zmd->nr_reserved_seq_zones, 0); + INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); + + init_waitqueue_head(&zmd->free_wq); + + /* Initialize zone descriptors */ + ret = dmz_init_zones(zmd); + if (ret) + goto err; + + /* Get super block */ + ret = dmz_load_sb(zmd); + if (ret) + goto err; + + /* Set metadata zones starting from sb_zone */ + for (i = 0; i < zmd->nr_meta_zones << 1; i++) { + zone = dmz_get(zmd, zmd->sb[0].zone->id + i); + if (!zone) { + dmz_zmd_err(zmd, + "metadata zone %u not present", i); + ret = -ENXIO; + goto err; + } + if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { + dmz_zmd_err(zmd, + "metadata zone %d is not random", i); + ret = -ENXIO; + goto err; + } + set_bit(DMZ_META, &zone->flags); + } + /* Load mapping table */ + ret = dmz_load_mapping(zmd); + if (ret) + goto err; + + /* + * Cache size boundaries: allow at least 2 super blocks, the chunk map + * blocks and enough blocks to be able to cache the bitmap blocks of + * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow + * the cache to add 512 more metadata blocks. + */ + zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; + zmd->max_nr_mblks = zmd->min_nr_mblks + 512; + zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; + zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; + zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; + + /* Metadata cache shrinker */ + ret = register_shrinker(&zmd->mblk_shrinker, "dm-zoned-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); + if (ret) { + dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); + goto err; + } + + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); + for (i = 0; i < zmd->nr_devs; i++) + dmz_print_dev(zmd, i); + + dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", + zmd->nr_zones, (u64)zmd->zone_nr_sectors); + dmz_zmd_debug(zmd, " %u metadata zones", + zmd->nr_meta_zones * 2); + dmz_zmd_debug(zmd, " %u data zones for %u chunks", + zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", + zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); + for (i = 0; i < zmd->nr_devs; i++) { + dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", + dmz_nr_rnd_zones(zmd, i), + dmz_nr_unmap_rnd_zones(zmd, i)); + dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", + dmz_nr_seq_zones(zmd, i), + dmz_nr_unmap_seq_zones(zmd, i)); + } + dmz_zmd_debug(zmd, " %u reserved sequential data zones", + zmd->nr_reserved_seq); + dmz_zmd_debug(zmd, "Format:"); + dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)", + zmd->nr_meta_blocks, zmd->max_nr_mblks); + dmz_zmd_debug(zmd, " %u data zone mapping blocks", + zmd->nr_map_blocks); + dmz_zmd_debug(zmd, " %u bitmap blocks", + zmd->nr_bitmap_blocks); + + *metadata = zmd; + + return 0; +err: + dmz_cleanup_metadata(zmd); + kfree(zmd); + *metadata = NULL; + + return ret; +} + +/* + * Cleanup the zoned metadata resources. + */ +void dmz_dtr_metadata(struct dmz_metadata *zmd) +{ + unregister_shrinker(&zmd->mblk_shrinker); + dmz_cleanup_metadata(zmd); + kfree(zmd); +} + +/* + * Check zone information on resume. + */ +int dmz_resume_metadata(struct dmz_metadata *zmd) +{ + struct dm_zone *zone; + sector_t wp_block; + unsigned int i; + int ret; + + /* Check zones */ + for (i = 0; i < zmd->nr_zones; i++) { + zone = dmz_get(zmd, i); + if (!zone) { + dmz_zmd_err(zmd, "Unable to get zone %u", i); + return -EIO; + } + wp_block = zone->wp_block; + + ret = dmz_update_zone(zmd, zone); + if (ret) { + dmz_zmd_err(zmd, "Broken zone %u", i); + return ret; + } + + if (dmz_is_offline(zone)) { + dmz_zmd_warn(zmd, "Zone %u is offline", i); + continue; + } + + /* Check write pointer */ + if (!dmz_is_seq(zone)) + zone->wp_block = 0; + else if (zone->wp_block != wp_block) { + dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)", + i, (u64)zone->wp_block, (u64)wp_block); + zone->wp_block = wp_block; + dmz_invalidate_blocks(zmd, zone, zone->wp_block, + zmd->zone_nr_blocks - zone->wp_block); + } + } + + return 0; +} diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c new file mode 100644 index 000000000..d58db9a27 --- /dev/null +++ b/drivers/md/dm-zoned-reclaim.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include "dm-zoned.h" + +#include + +#define DM_MSG_PREFIX "zoned reclaim" + +struct dmz_reclaim { + struct dmz_metadata *metadata; + + struct delayed_work work; + struct workqueue_struct *wq; + + struct dm_kcopyd_client *kc; + struct dm_kcopyd_throttle kc_throttle; + int kc_err; + + int dev_idx; + + unsigned long flags; + + /* Last target access time */ + unsigned long atime; +}; + +/* + * Reclaim state flags. + */ +enum { + DMZ_RECLAIM_KCOPY, +}; + +/* + * Number of seconds of target BIO inactivity to consider the target idle. + */ +#define DMZ_IDLE_PERIOD (10UL * HZ) + +/* + * Percentage of unmapped (free) random zones below which reclaim starts + * even if the target is busy. + */ +#define DMZ_RECLAIM_LOW_UNMAP_ZONES 30 + +/* + * Percentage of unmapped (free) random zones above which reclaim will + * stop if the target is busy. + */ +#define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50 + +/* + * Align a sequential zone write pointer to chunk_block. + */ +static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, + sector_t block) +{ + struct dmz_metadata *zmd = zrc->metadata; + struct dmz_dev *dev = zone->dev; + sector_t wp_block = zone->wp_block; + unsigned int nr_blocks; + int ret; + + if (wp_block == block) + return 0; + + if (wp_block > block) + return -EIO; + + /* + * Zeroout the space between the write + * pointer and the requested position. + */ + nr_blocks = block - wp_block; + ret = blkdev_issue_zeroout(dev->bdev, + dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), + dmz_blk2sect(nr_blocks), GFP_NOIO, 0); + if (ret) { + dmz_dev_err(dev, + "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", + zone->id, (unsigned long long)wp_block, + (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(dev); + return ret; + } + + zone->wp_block = block; + + return 0; +} + +/* + * dm_kcopyd_copy end notification. + */ +static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err, + void *context) +{ + struct dmz_reclaim *zrc = context; + + if (read_err || write_err) + zrc->kc_err = -EIO; + else + zrc->kc_err = 0; + + clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags); + smp_mb__after_atomic(); + wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY); +} + +/* + * Copy valid blocks of src_zone into dst_zone. + */ +static int dmz_reclaim_copy(struct dmz_reclaim *zrc, + struct dm_zone *src_zone, struct dm_zone *dst_zone) +{ + struct dmz_metadata *zmd = zrc->metadata; + struct dm_io_region src, dst; + sector_t block = 0, end_block; + sector_t nr_blocks; + sector_t src_zone_block; + sector_t dst_zone_block; + unsigned long flags = 0; + int ret; + + if (dmz_is_seq(src_zone)) + end_block = src_zone->wp_block; + else + end_block = dmz_zone_nr_blocks(zmd); + src_zone_block = dmz_start_block(zmd, src_zone); + dst_zone_block = dmz_start_block(zmd, dst_zone); + + if (dmz_is_seq(dst_zone)) + flags |= BIT(DM_KCOPYD_WRITE_SEQ); + + while (block < end_block) { + if (src_zone->dev->flags & DMZ_BDEV_DYING) + return -EIO; + if (dst_zone->dev->flags & DMZ_BDEV_DYING) + return -EIO; + + if (dmz_reclaim_should_terminate(src_zone)) + return -EINTR; + + /* Get a valid region from the source zone */ + ret = dmz_first_valid_block(zmd, src_zone, &block); + if (ret <= 0) + return ret; + nr_blocks = ret; + + /* + * If we are writing in a sequential zone, we must make sure + * that writes are sequential. So Zeroout any eventual hole + * between writes. + */ + if (dmz_is_seq(dst_zone)) { + ret = dmz_reclaim_align_wp(zrc, dst_zone, block); + if (ret) + return ret; + } + + src.bdev = src_zone->dev->bdev; + src.sector = dmz_blk2sect(src_zone_block + block); + src.count = dmz_blk2sect(nr_blocks); + + dst.bdev = dst_zone->dev->bdev; + dst.sector = dmz_blk2sect(dst_zone_block + block); + dst.count = src.count; + + /* Copy the valid region */ + set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags); + dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, + dmz_reclaim_kcopy_end, zrc); + + /* Wait for copy to complete */ + wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY, + TASK_UNINTERRUPTIBLE); + if (zrc->kc_err) + return zrc->kc_err; + + block += nr_blocks; + if (dmz_is_seq(dst_zone)) + dst_zone->wp_block = block; + } + + return 0; +} + +/* + * Move valid blocks of dzone buffer zone into dzone (after its write pointer) + * and free the buffer zone. + */ +static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) +{ + struct dm_zone *bzone = dzone->bzone; + sector_t chunk_block = dzone->wp_block; + struct dmz_metadata *zmd = zrc->metadata; + int ret; + + DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, + dzone->chunk, bzone->id, dmz_weight(bzone), + dzone->id, dmz_weight(dzone)); + + /* Flush data zone into the buffer zone */ + ret = dmz_reclaim_copy(zrc, bzone, dzone); + if (ret < 0) + return ret; + + dmz_lock_flush(zmd); + + /* Validate copied blocks */ + ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block); + if (ret == 0) { + /* Free the buffer zone */ + dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd)); + dmz_lock_map(zmd); + dmz_unmap_zone(zmd, bzone); + dmz_unlock_zone_reclaim(dzone); + dmz_free_zone(zmd, bzone); + dmz_unlock_map(zmd); + } + + dmz_unlock_flush(zmd); + + return ret; +} + +/* + * Merge valid blocks of dzone into its buffer zone and free dzone. + */ +static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) +{ + unsigned int chunk = dzone->chunk; + struct dm_zone *bzone = dzone->bzone; + struct dmz_metadata *zmd = zrc->metadata; + int ret = 0; + + DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, + chunk, dzone->id, dmz_weight(dzone), + bzone->id, dmz_weight(bzone)); + + /* Flush data zone into the buffer zone */ + ret = dmz_reclaim_copy(zrc, dzone, bzone); + if (ret < 0) + return ret; + + dmz_lock_flush(zmd); + + /* Validate copied blocks */ + ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0); + if (ret == 0) { + /* + * Free the data zone and remap the chunk to + * the buffer zone. + */ + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); + dmz_lock_map(zmd); + dmz_unmap_zone(zmd, bzone); + dmz_unmap_zone(zmd, dzone); + dmz_unlock_zone_reclaim(dzone); + dmz_free_zone(zmd, dzone); + dmz_map_zone(zmd, bzone, chunk); + dmz_unlock_map(zmd); + } + + dmz_unlock_flush(zmd); + + return ret; +} + +/* + * Move valid blocks of the random data zone dzone into a free sequential zone. + * Once blocks are moved, remap the zone chunk to the sequential zone. + */ +static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) +{ + unsigned int chunk = dzone->chunk; + struct dm_zone *szone = NULL; + struct dmz_metadata *zmd = zrc->metadata; + int ret; + int alloc_flags = DMZ_ALLOC_SEQ; + + /* Get a free random or sequential zone */ + dmz_lock_map(zmd); +again: + szone = dmz_alloc_zone(zmd, zrc->dev_idx, + alloc_flags | DMZ_ALLOC_RECLAIM); + if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { + alloc_flags = DMZ_ALLOC_RND; + goto again; + } + dmz_unlock_map(zmd); + if (!szone) + return -ENOSPC; + + DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u", + dmz_metadata_label(zmd), zrc->dev_idx, chunk, + dmz_is_cache(dzone) ? "cache" : "rnd", + dzone->id, dmz_weight(dzone), + dmz_is_rnd(szone) ? "rnd" : "seq", szone->id); + + /* Flush the random data zone into the sequential zone */ + ret = dmz_reclaim_copy(zrc, dzone, szone); + + dmz_lock_flush(zmd); + + if (ret == 0) { + /* Validate copied blocks */ + ret = dmz_copy_valid_blocks(zmd, dzone, szone); + } + if (ret) { + /* Free the sequential zone */ + dmz_lock_map(zmd); + dmz_free_zone(zmd, szone); + dmz_unlock_map(zmd); + } else { + /* Free the data zone and remap the chunk */ + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); + dmz_lock_map(zmd); + dmz_unmap_zone(zmd, dzone); + dmz_unlock_zone_reclaim(dzone); + dmz_free_zone(zmd, dzone); + dmz_map_zone(zmd, szone, chunk); + dmz_unlock_map(zmd); + } + + dmz_unlock_flush(zmd); + + return ret; +} + +/* + * Reclaim an empty zone. + */ +static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) +{ + struct dmz_metadata *zmd = zrc->metadata; + + dmz_lock_flush(zmd); + dmz_lock_map(zmd); + dmz_unmap_zone(zmd, dzone); + dmz_unlock_zone_reclaim(dzone); + dmz_free_zone(zmd, dzone); + dmz_unlock_map(zmd); + dmz_unlock_flush(zmd); +} + +/* + * Test if the target device is idle. + */ +static inline int dmz_target_idle(struct dmz_reclaim *zrc) +{ + return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); +} + +/* + * Find a candidate zone for reclaim and process it. + */ +static int dmz_do_reclaim(struct dmz_reclaim *zrc) +{ + struct dmz_metadata *zmd = zrc->metadata; + struct dm_zone *dzone; + struct dm_zone *rzone; + unsigned long start; + int ret; + + /* Get a data zone */ + dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx, + dmz_target_idle(zrc)); + if (!dzone) { + DMDEBUG("(%s/%u): No zone found to reclaim", + dmz_metadata_label(zmd), zrc->dev_idx); + return -EBUSY; + } + rzone = dzone; + + start = jiffies; + if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { + if (!dmz_weight(dzone)) { + /* Empty zone */ + dmz_reclaim_empty(zrc, dzone); + ret = 0; + } else { + /* + * Reclaim the random data zone by moving its + * valid data blocks to a free sequential zone. + */ + ret = dmz_reclaim_rnd_data(zrc, dzone); + } + } else { + struct dm_zone *bzone = dzone->bzone; + sector_t chunk_block = 0; + + ret = dmz_first_valid_block(zmd, bzone, &chunk_block); + if (ret < 0) + goto out; + + if (ret == 0 || chunk_block >= dzone->wp_block) { + /* + * The buffer zone is empty or its valid blocks are + * after the data zone write pointer. + */ + ret = dmz_reclaim_buf(zrc, dzone); + rzone = bzone; + } else { + /* + * Reclaim the data zone by merging it into the + * buffer zone so that the buffer zone itself can + * be later reclaimed. + */ + ret = dmz_reclaim_seq_data(zrc, dzone); + } + } +out: + if (ret) { + if (ret == -EINTR) + DMDEBUG("(%s/%u): reclaim zone %u interrupted", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id); + else + DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id, ret); + dmz_unlock_zone_reclaim(dzone); + return ret; + } + + ret = dmz_flush_metadata(zrc->metadata); + if (ret) { + DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret); + return ret; + } + + DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id, jiffies_to_msecs(jiffies - start)); + return 0; +} + +static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) +{ + struct dmz_metadata *zmd = zrc->metadata; + unsigned int nr_cache = dmz_nr_cache_zones(zmd); + unsigned int nr_unmap, nr_zones; + + if (nr_cache) { + nr_zones = nr_cache; + nr_unmap = dmz_nr_unmap_cache_zones(zmd); + } else { + nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); + nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); + } + if (nr_unmap <= 1) + return 0; + return nr_unmap * 100 / nr_zones; +} + +/* + * Test if reclaim is necessary. + */ +static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) +{ + unsigned int nr_reclaim; + + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx); + + if (dmz_nr_cache_zones(zrc->metadata)) { + /* + * The first device in a multi-device + * setup only contains cache zones, so + * never start reclaim there. + */ + if (zrc->dev_idx == 0) + return false; + nr_reclaim += dmz_nr_cache_zones(zrc->metadata); + } + + /* Reclaim when idle */ + if (dmz_target_idle(zrc) && nr_reclaim) + return true; + + /* If there are still plenty of cache zones, do not reclaim */ + if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES) + return false; + + /* + * If the percentage of unmapped cache zones is low, + * reclaim even if the target is busy. + */ + return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES; +} + +/* + * Reclaim work function. + */ +static void dmz_reclaim_work(struct work_struct *work) +{ + struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); + struct dmz_metadata *zmd = zrc->metadata; + unsigned int p_unmap; + int ret; + + if (dmz_dev_is_dying(zmd)) + return; + + p_unmap = dmz_reclaim_percentage(zrc); + if (!dmz_should_reclaim(zrc, p_unmap)) { + mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); + return; + } + + /* + * We need to start reclaiming random zones: set up zone copy + * throttling to either go fast if we are very low on random zones + * and slower if there are still some free random zones to avoid + * as much as possible to negatively impact the user workload. + */ + if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) { + /* Idle or very low percentage: go fast */ + zrc->kc_throttle.throttle = 100; + } else { + /* Busy but we still have some random zone: throttle */ + zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); + } + + DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", + dmz_metadata_label(zmd), zrc->dev_idx, + zrc->kc_throttle.throttle, + (dmz_target_idle(zrc) ? "Idle" : "Busy"), + p_unmap, dmz_nr_unmap_cache_zones(zmd), + dmz_nr_cache_zones(zmd), + dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx), + dmz_nr_rnd_zones(zmd, zrc->dev_idx)); + + ret = dmz_do_reclaim(zrc); + if (ret && ret != -EINTR) { + if (!dmz_check_dev(zmd)) + return; + } + + dmz_schedule_reclaim(zrc); +} + +/* + * Initialize reclaim. + */ +int dmz_ctr_reclaim(struct dmz_metadata *zmd, + struct dmz_reclaim **reclaim, int idx) +{ + struct dmz_reclaim *zrc; + int ret; + + zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL); + if (!zrc) + return -ENOMEM; + + zrc->metadata = zmd; + zrc->atime = jiffies; + zrc->dev_idx = idx; + + /* Reclaim kcopyd client */ + zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle); + if (IS_ERR(zrc->kc)) { + ret = PTR_ERR(zrc->kc); + zrc->kc = NULL; + goto err; + } + + /* Reclaim work */ + INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); + zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM, + dmz_metadata_label(zmd), idx); + if (!zrc->wq) { + ret = -ENOMEM; + goto err; + } + + *reclaim = zrc; + queue_delayed_work(zrc->wq, &zrc->work, 0); + + return 0; +err: + if (zrc->kc) + dm_kcopyd_client_destroy(zrc->kc); + kfree(zrc); + + return ret; +} + +/* + * Terminate reclaim. + */ +void dmz_dtr_reclaim(struct dmz_reclaim *zrc) +{ + cancel_delayed_work_sync(&zrc->work); + destroy_workqueue(zrc->wq); + dm_kcopyd_client_destroy(zrc->kc); + kfree(zrc); +} + +/* + * Suspend reclaim. + */ +void dmz_suspend_reclaim(struct dmz_reclaim *zrc) +{ + cancel_delayed_work_sync(&zrc->work); +} + +/* + * Resume reclaim. + */ +void dmz_resume_reclaim(struct dmz_reclaim *zrc) +{ + queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); +} + +/* + * BIO accounting. + */ +void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc) +{ + zrc->atime = jiffies; +} + +/* + * Start reclaim if necessary. + */ +void dmz_schedule_reclaim(struct dmz_reclaim *zrc) +{ + unsigned int p_unmap = dmz_reclaim_percentage(zrc); + + if (dmz_should_reclaim(zrc, p_unmap)) + mod_delayed_work(zrc->wq, &zrc->work, 0); +} diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c new file mode 100644 index 000000000..4abe1e2f8 --- /dev/null +++ b/drivers/md/dm-zoned-target.c @@ -0,0 +1,1173 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include "dm-zoned.h" + +#include + +#define DM_MSG_PREFIX "zoned" + +#define DMZ_MIN_BIOS 8192 + +/* + * Zone BIO context. + */ +struct dmz_bioctx { + struct dmz_dev *dev; + struct dm_zone *zone; + struct bio *bio; + refcount_t ref; +}; + +/* + * Chunk work descriptor. + */ +struct dm_chunk_work { + struct work_struct work; + refcount_t refcount; + struct dmz_target *target; + unsigned int chunk; + struct bio_list bio_list; +}; + +/* + * Target descriptor. + */ +struct dmz_target { + struct dm_dev **ddev; + unsigned int nr_ddevs; + + unsigned int flags; + + /* Zoned block device information */ + struct dmz_dev *dev; + + /* For metadata handling */ + struct dmz_metadata *metadata; + + /* For chunk work */ + struct radix_tree_root chunk_rxtree; + struct workqueue_struct *chunk_wq; + struct mutex chunk_lock; + + /* For cloned BIOs to zones */ + struct bio_set bio_set; + + /* For flush */ + spinlock_t flush_lock; + struct bio_list flush_list; + struct delayed_work flush_work; + struct workqueue_struct *flush_wq; +}; + +/* + * Flush intervals (seconds). + */ +#define DMZ_FLUSH_PERIOD (10 * HZ) + +/* + * Target BIO completion. + */ +static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) +{ + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + + if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) + bio->bi_status = status; + if (bioctx->dev && bio->bi_status != BLK_STS_OK) + bioctx->dev->flags |= DMZ_CHECK_BDEV; + + if (refcount_dec_and_test(&bioctx->ref)) { + struct dm_zone *zone = bioctx->zone; + + if (zone) { + if (bio->bi_status != BLK_STS_OK && + bio_op(bio) == REQ_OP_WRITE && + dmz_is_seq(zone)) + set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); + dmz_deactivate_zone(zone); + } + bio_endio(bio); + } +} + +/* + * Completion callback for an internally cloned target BIO. This terminates the + * target BIO when there are no more references to its context. + */ +static void dmz_clone_endio(struct bio *clone) +{ + struct dmz_bioctx *bioctx = clone->bi_private; + blk_status_t status = clone->bi_status; + + bio_put(clone); + dmz_bio_endio(bioctx->bio, status); +} + +/* + * Issue a clone of a target BIO. The clone may only partially process the + * original target BIO. + */ +static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, + struct bio *bio, sector_t chunk_block, + unsigned int nr_blocks) +{ + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_dev *dev = zone->dev; + struct bio *clone; + + if (dev->flags & DMZ_BDEV_DYING) + return -EIO; + + clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set); + if (!clone) + return -ENOMEM; + + bioctx->dev = dev; + clone->bi_iter.bi_sector = + dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); + clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT; + clone->bi_end_io = dmz_clone_endio; + clone->bi_private = bioctx; + + bio_advance(bio, clone->bi_iter.bi_size); + + refcount_inc(&bioctx->ref); + submit_bio_noacct(clone); + + if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) + zone->wp_block += nr_blocks; + + return 0; +} + +/* + * Zero out pages of discarded blocks accessed by a read BIO. + */ +static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio, + sector_t chunk_block, unsigned int nr_blocks) +{ + unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT; + + /* Clear nr_blocks */ + swap(bio->bi_iter.bi_size, size); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, size); + + bio_advance(bio, size); +} + +/* + * Process a read BIO. + */ +static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, + struct bio *bio) +{ + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); + unsigned int nr_blocks = dmz_bio_blocks(bio); + sector_t end_block = chunk_block + nr_blocks; + struct dm_zone *rzone, *bzone; + int ret; + + /* Read into unmapped chunks need only zeroing the BIO buffer */ + if (!zone) { + zero_fill_bio(bio); + return 0; + } + + DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? "CACHE" : "SEQ")), + zone->id, + (unsigned long long)chunk_block, nr_blocks); + + /* Check block validity to determine the read location */ + bzone = zone->bzone; + while (chunk_block < end_block) { + nr_blocks = 0; + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) { + /* Test block validity in the data zone */ + ret = dmz_block_valid(zmd, zone, chunk_block); + if (ret < 0) + return ret; + if (ret > 0) { + /* Read data zone blocks */ + nr_blocks = ret; + rzone = zone; + } + } + + /* + * No valid blocks found in the data zone. + * Check the buffer zone, if there is one. + */ + if (!nr_blocks && bzone) { + ret = dmz_block_valid(zmd, bzone, chunk_block); + if (ret < 0) + return ret; + if (ret > 0) { + /* Read buffer zone blocks */ + nr_blocks = ret; + rzone = bzone; + } + } + + if (nr_blocks) { + /* Valid blocks found: read them */ + nr_blocks = min_t(unsigned int, nr_blocks, + end_block - chunk_block); + ret = dmz_submit_bio(dmz, rzone, bio, + chunk_block, nr_blocks); + if (ret) + return ret; + chunk_block += nr_blocks; + } else { + /* No valid block: zeroout the current BIO block */ + dmz_handle_read_zero(dmz, bio, chunk_block, 1); + chunk_block++; + } + } + + return 0; +} + +/* + * Write blocks directly in a data zone, at the write pointer. + * If a buffer zone is assigned, invalidate the blocks written + * in place. + */ +static int dmz_handle_direct_write(struct dmz_target *dmz, + struct dm_zone *zone, struct bio *bio, + sector_t chunk_block, + unsigned int nr_blocks) +{ + struct dmz_metadata *zmd = dmz->metadata; + struct dm_zone *bzone = zone->bzone; + int ret; + + if (dmz_is_readonly(zone)) + return -EROFS; + + /* Submit write */ + ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks); + if (ret) + return ret; + + /* + * Validate the blocks in the data zone and invalidate + * in the buffer zone, if there is one. + */ + ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks); + if (ret == 0 && bzone) + ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks); + + return ret; +} + +/* + * Write blocks in the buffer zone of @zone. + * If no buffer zone is assigned yet, get one. + * Called with @zone write locked. + */ +static int dmz_handle_buffered_write(struct dmz_target *dmz, + struct dm_zone *zone, struct bio *bio, + sector_t chunk_block, + unsigned int nr_blocks) +{ + struct dmz_metadata *zmd = dmz->metadata; + struct dm_zone *bzone; + int ret; + + /* Get the buffer zone. One will be allocated if needed */ + bzone = dmz_get_chunk_buffer(zmd, zone); + if (IS_ERR(bzone)) + return PTR_ERR(bzone); + + if (dmz_is_readonly(bzone)) + return -EROFS; + + /* Submit write */ + ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks); + if (ret) + return ret; + + /* + * Validate the blocks in the buffer zone + * and invalidate in the data zone. + */ + ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks); + if (ret == 0 && chunk_block < zone->wp_block) + ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks); + + return ret; +} + +/* + * Process a write BIO. + */ +static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, + struct bio *bio) +{ + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); + unsigned int nr_blocks = dmz_bio_blocks(bio); + + if (!zone) + return -ENOSPC; + + DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? "CACHE" : "SEQ")), + zone->id, + (unsigned long long)chunk_block, nr_blocks); + + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block == zone->wp_block) { + /* + * zone is a random zone or it is a sequential zone + * and the BIO is aligned to the zone write pointer: + * direct write the zone. + */ + return dmz_handle_direct_write(dmz, zone, bio, + chunk_block, nr_blocks); + } + + /* + * This is an unaligned write in a sequential zone: + * use buffered write. + */ + return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks); +} + +/* + * Process a discard BIO. + */ +static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, + struct bio *bio) +{ + struct dmz_metadata *zmd = dmz->metadata; + sector_t block = dmz_bio_block(bio); + unsigned int nr_blocks = dmz_bio_blocks(bio); + sector_t chunk_block = dmz_chunk_block(zmd, block); + int ret = 0; + + /* For unmapped chunks, there is nothing to do */ + if (!zone) + return 0; + + if (dmz_is_readonly(zone)) + return -EROFS; + + DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks", + dmz_metadata_label(dmz->metadata), + (unsigned long long)dmz_bio_chunk(zmd, bio), + zone->id, + (unsigned long long)chunk_block, nr_blocks); + + /* + * Invalidate blocks in the data zone and its + * buffer zone if one is mapped. + */ + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) + ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks); + if (ret == 0 && zone->bzone) + ret = dmz_invalidate_blocks(zmd, zone->bzone, + chunk_block, nr_blocks); + return ret; +} + +/* + * Process a BIO. + */ +static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, + struct bio *bio) +{ + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_metadata *zmd = dmz->metadata; + struct dm_zone *zone; + int ret; + + dmz_lock_metadata(zmd); + + /* + * Get the data zone mapping the chunk. There may be no + * mapping for read and discard. If a mapping is obtained, + + the zone returned will be set to active state. + */ + zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio), + bio_op(bio)); + if (IS_ERR(zone)) { + ret = PTR_ERR(zone); + goto out; + } + + /* Process the BIO */ + if (zone) { + dmz_activate_zone(zone); + bioctx->zone = zone; + dmz_reclaim_bio_acc(zone->dev->reclaim); + } + + switch (bio_op(bio)) { + case REQ_OP_READ: + ret = dmz_handle_read(dmz, zone, bio); + break; + case REQ_OP_WRITE: + ret = dmz_handle_write(dmz, zone, bio); + break; + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + ret = dmz_handle_discard(dmz, zone, bio); + break; + default: + DMERR("(%s): Unsupported BIO operation 0x%x", + dmz_metadata_label(dmz->metadata), bio_op(bio)); + ret = -EIO; + } + + /* + * Release the chunk mapping. This will check that the mapping + * is still valid, that is, that the zone used still has valid blocks. + */ + if (zone) + dmz_put_chunk_mapping(zmd, zone); +out: + dmz_bio_endio(bio, errno_to_blk_status(ret)); + + dmz_unlock_metadata(zmd); +} + +/* + * Increment a chunk reference counter. + */ +static inline void dmz_get_chunk_work(struct dm_chunk_work *cw) +{ + refcount_inc(&cw->refcount); +} + +/* + * Decrement a chunk work reference count and + * free it if it becomes 0. + */ +static void dmz_put_chunk_work(struct dm_chunk_work *cw) +{ + if (refcount_dec_and_test(&cw->refcount)) { + WARN_ON(!bio_list_empty(&cw->bio_list)); + radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk); + kfree(cw); + } +} + +/* + * Chunk BIO work function. + */ +static void dmz_chunk_work(struct work_struct *work) +{ + struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work); + struct dmz_target *dmz = cw->target; + struct bio *bio; + + mutex_lock(&dmz->chunk_lock); + + /* Process the chunk BIOs */ + while ((bio = bio_list_pop(&cw->bio_list))) { + mutex_unlock(&dmz->chunk_lock); + dmz_handle_bio(dmz, cw, bio); + mutex_lock(&dmz->chunk_lock); + dmz_put_chunk_work(cw); + } + + /* Queueing the work incremented the work refcount */ + dmz_put_chunk_work(cw); + + mutex_unlock(&dmz->chunk_lock); +} + +/* + * Flush work. + */ +static void dmz_flush_work(struct work_struct *work) +{ + struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work); + struct bio *bio; + int ret; + + /* Flush dirty metadata blocks */ + ret = dmz_flush_metadata(dmz->metadata); + if (ret) + DMDEBUG("(%s): Metadata flush failed, rc=%d", + dmz_metadata_label(dmz->metadata), ret); + + /* Process queued flush requests */ + while (1) { + spin_lock(&dmz->flush_lock); + bio = bio_list_pop(&dmz->flush_list); + spin_unlock(&dmz->flush_lock); + + if (!bio) + break; + + dmz_bio_endio(bio, errno_to_blk_status(ret)); + } + + queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); +} + +/* + * Get a chunk work and start it to process a new BIO. + * If the BIO chunk has no work yet, create one. + */ +static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) +{ + unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio); + struct dm_chunk_work *cw; + int ret = 0; + + mutex_lock(&dmz->chunk_lock); + + /* Get the BIO chunk work. If one is not active yet, create one */ + cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk); + if (cw) { + dmz_get_chunk_work(cw); + } else { + /* Create a new chunk work */ + cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); + if (unlikely(!cw)) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&cw->work, dmz_chunk_work); + refcount_set(&cw->refcount, 1); + cw->target = dmz; + cw->chunk = chunk; + bio_list_init(&cw->bio_list); + + ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw); + if (unlikely(ret)) { + kfree(cw); + goto out; + } + } + + bio_list_add(&cw->bio_list, bio); + + if (queue_work(dmz->chunk_wq, &cw->work)) + dmz_get_chunk_work(cw); +out: + mutex_unlock(&dmz->chunk_lock); + return ret; +} + +/* + * Check if the backing device is being removed. If it's on the way out, + * start failing I/O. Reclaim and metadata components also call this + * function to cleanly abort operation in the event of such failure. + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) +{ + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; + + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return dmz_dev->flags & DMZ_BDEV_DYING; +} + +/* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. + * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. + */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); +} + +/* + * Process a new BIO. + */ +static int dmz_map(struct dm_target *ti, struct bio *bio) +{ + struct dmz_target *dmz = ti->private; + struct dmz_metadata *zmd = dmz->metadata; + struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + sector_t sector = bio->bi_iter.bi_sector; + unsigned int nr_sectors = bio_sectors(bio); + sector_t chunk_sector; + int ret; + + if (dmz_dev_is_dying(zmd)) + return DM_MAPIO_KILL; + + DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", + dmz_metadata_label(zmd), + bio_op(bio), (unsigned long long)sector, nr_sectors, + (unsigned long long)dmz_bio_chunk(zmd, bio), + (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), + (unsigned int)dmz_bio_blocks(bio)); + + if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) + return DM_MAPIO_REMAPPED; + + /* The BIO should be block aligned */ + if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK)) + return DM_MAPIO_KILL; + + /* Initialize the BIO context */ + bioctx->dev = NULL; + bioctx->zone = NULL; + bioctx->bio = bio; + refcount_set(&bioctx->ref, 1); + + /* Set the BIO pending in the flush list */ + if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) { + spin_lock(&dmz->flush_lock); + bio_list_add(&dmz->flush_list, bio); + spin_unlock(&dmz->flush_lock); + mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0); + return DM_MAPIO_SUBMITTED; + } + + /* Split zone BIOs to fit entirely into a zone */ + chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1); + if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd)) + dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector); + + /* Now ready to handle this BIO */ + ret = dmz_queue_chunk_work(dmz, bio); + if (ret) { + DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i", + dmz_metadata_label(zmd), + bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), + ret); + return DM_MAPIO_REQUEUE; + } + + return DM_MAPIO_SUBMITTED; +} + +/* + * Get zoned device information. + */ +static int dmz_get_zoned_device(struct dm_target *ti, char *path, + int idx, int nr_devs) +{ + struct dmz_target *dmz = ti->private; + struct dm_dev *ddev; + struct dmz_dev *dev; + int ret; + struct block_device *bdev; + + /* Get the target device */ + ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev); + if (ret) { + ti->error = "Get target device failed"; + return ret; + } + + bdev = ddev->bdev; + if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) { + if (nr_devs == 1) { + ti->error = "Invalid regular device"; + goto err; + } + if (idx != 0) { + ti->error = "First device must be a regular device"; + goto err; + } + if (dmz->ddev[0]) { + ti->error = "Too many regular devices"; + goto err; + } + dev = &dmz->dev[idx]; + dev->flags = DMZ_BDEV_REGULAR; + } else { + if (dmz->ddev[idx]) { + ti->error = "Too many zoned devices"; + goto err; + } + if (nr_devs > 1 && idx == 0) { + ti->error = "First device must be a regular device"; + goto err; + } + dev = &dmz->dev[idx]; + } + dev->bdev = bdev; + dev->dev_idx = idx; + + dev->capacity = bdev_nr_sectors(bdev); + if (ti->begin) { + ti->error = "Partial mapping is not supported"; + goto err; + } + + dmz->ddev[idx] = ddev; + + return 0; +err: + dm_put_device(ti, ddev); + return -EINVAL; +} + +/* + * Cleanup zoned device information. + */ +static void dmz_put_zoned_devices(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + int i; + + for (i = 0; i < dmz->nr_ddevs; i++) + if (dmz->ddev[i]) + dm_put_device(ti, dmz->ddev[i]); + + kfree(dmz->ddev); +} + +static int dmz_fixup_devices(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + struct dmz_dev *reg_dev = NULL; + sector_t zone_nr_sectors = 0; + int i; + + /* + * When we have more than on devices, the first one must be a + * regular block device and the others zoned block devices. + */ + if (dmz->nr_ddevs > 1) { + reg_dev = &dmz->dev[0]; + if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { + ti->error = "Primary disk is not a regular device"; + return -EINVAL; + } + for (i = 1; i < dmz->nr_ddevs; i++) { + struct dmz_dev *zoned_dev = &dmz->dev[i]; + struct block_device *bdev = zoned_dev->bdev; + + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Secondary disk is not a zoned device"; + return -EINVAL; + } + if (zone_nr_sectors && + zone_nr_sectors != bdev_zone_sectors(bdev)) { + ti->error = "Zone nr sectors mismatch"; + return -EINVAL; + } + zone_nr_sectors = bdev_zone_sectors(bdev); + zoned_dev->zone_nr_sectors = zone_nr_sectors; + zoned_dev->nr_zones = bdev_nr_zones(bdev); + } + } else { + struct dmz_dev *zoned_dev = &dmz->dev[0]; + struct block_device *bdev = zoned_dev->bdev; + + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Disk is not a zoned device"; + return -EINVAL; + } + zoned_dev->zone_nr_sectors = bdev_zone_sectors(bdev); + zoned_dev->nr_zones = bdev_nr_zones(bdev); + } + + if (reg_dev) { + sector_t zone_offset; + + reg_dev->zone_nr_sectors = zone_nr_sectors; + reg_dev->nr_zones = + DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, + reg_dev->zone_nr_sectors); + reg_dev->zone_offset = 0; + zone_offset = reg_dev->nr_zones; + for (i = 1; i < dmz->nr_ddevs; i++) { + dmz->dev[i].zone_offset = zone_offset; + zone_offset += dmz->dev[i].nr_zones; + } + } + return 0; +} + +/* + * Setup target. + */ +static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dmz_target *dmz; + int ret, i; + + /* Check arguments */ + if (argc < 1) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + /* Allocate and initialize the target descriptor */ + dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL); + if (!dmz) { + ti->error = "Unable to allocate the zoned target descriptor"; + return -ENOMEM; + } + dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL); + if (!dmz->dev) { + ti->error = "Unable to allocate the zoned device descriptors"; + kfree(dmz); + return -ENOMEM; + } + dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL); + if (!dmz->ddev) { + ti->error = "Unable to allocate the dm device descriptors"; + ret = -ENOMEM; + goto err; + } + dmz->nr_ddevs = argc; + + ti->private = dmz; + + /* Get the target zoned block device */ + for (i = 0; i < argc; i++) { + ret = dmz_get_zoned_device(ti, argv[i], i, argc); + if (ret) + goto err_dev; + } + ret = dmz_fixup_devices(ti); + if (ret) + goto err_dev; + + /* Initialize metadata */ + ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, + dm_table_device_name(ti->table)); + if (ret) { + ti->error = "Metadata initialization failed"; + goto err_dev; + } + + /* Set target (no write same support) */ + ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata); + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_zeroes_bios = 1; + ti->per_io_data_size = sizeof(struct dmz_bioctx); + ti->flush_supported = true; + ti->discards_supported = true; + + /* The exposed capacity is the number of chunks that can be mapped */ + ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << + dmz_zone_nr_sectors_shift(dmz->metadata); + + /* Zone BIO */ + ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0); + if (ret) { + ti->error = "Create BIO set failed"; + goto err_meta; + } + + /* Chunk BIO work */ + mutex_init(&dmz->chunk_lock); + INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO); + dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0, + dmz_metadata_label(dmz->metadata)); + if (!dmz->chunk_wq) { + ti->error = "Create chunk workqueue failed"; + ret = -ENOMEM; + goto err_bio; + } + + /* Flush work */ + spin_lock_init(&dmz->flush_lock); + bio_list_init(&dmz->flush_list); + INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work); + dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM, + dmz_metadata_label(dmz->metadata)); + if (!dmz->flush_wq) { + ti->error = "Create flush workqueue failed"; + ret = -ENOMEM; + goto err_cwq; + } + mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); + + /* Initialize reclaim */ + for (i = 0; i < dmz->nr_ddevs; i++) { + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i); + if (ret) { + ti->error = "Zone reclaim initialization failed"; + goto err_fwq; + } + } + + DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", + dmz_metadata_label(dmz->metadata), + (unsigned long long)ti->len, + (unsigned long long)dmz_sect2blk(ti->len)); + + return 0; +err_fwq: + destroy_workqueue(dmz->flush_wq); +err_cwq: + destroy_workqueue(dmz->chunk_wq); +err_bio: + mutex_destroy(&dmz->chunk_lock); + bioset_exit(&dmz->bio_set); +err_meta: + dmz_dtr_metadata(dmz->metadata); +err_dev: + dmz_put_zoned_devices(ti); +err: + kfree(dmz->dev); + kfree(dmz); + + return ret; +} + +/* + * Cleanup target. + */ +static void dmz_dtr(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + int i; + + destroy_workqueue(dmz->chunk_wq); + + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_dtr_reclaim(dmz->dev[i].reclaim); + + cancel_delayed_work_sync(&dmz->flush_work); + destroy_workqueue(dmz->flush_wq); + + (void) dmz_flush_metadata(dmz->metadata); + + dmz_dtr_metadata(dmz->metadata); + + bioset_exit(&dmz->bio_set); + + dmz_put_zoned_devices(ti); + + mutex_destroy(&dmz->chunk_lock); + + kfree(dmz->dev); + kfree(dmz); +} + +/* + * Setup target request queue limits. + */ +static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dmz_target *dmz = ti->private; + unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata); + + limits->logical_block_size = DMZ_BLOCK_SIZE; + limits->physical_block_size = DMZ_BLOCK_SIZE; + + blk_limits_io_min(limits, DMZ_BLOCK_SIZE); + blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); + + limits->discard_alignment = 0; + limits->discard_granularity = DMZ_BLOCK_SIZE; + limits->max_discard_sectors = chunk_sectors; + limits->max_hw_discard_sectors = chunk_sectors; + limits->max_write_zeroes_sectors = chunk_sectors; + + /* FS hint to try to align to the device zone size */ + limits->chunk_sectors = chunk_sectors; + limits->max_sectors = chunk_sectors; + + /* We are exposing a drive-managed zoned block device */ + limits->zoned = BLK_ZONED_NONE; +} + +/* + * Pass on ioctl to the backend device. + */ +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dmz_target *dmz = ti->private; + struct dmz_dev *dev = &dmz->dev[0]; + + if (!dmz_check_bdev(dev)) + return -EIO; + + *bdev = dev->bdev; + + return 0; +} + +/* + * Stop works on suspend. + */ +static void dmz_suspend(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + int i; + + flush_workqueue(dmz->chunk_wq); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_suspend_reclaim(dmz->dev[i].reclaim); + cancel_delayed_work_sync(&dmz->flush_work); +} + +/* + * Restart works on resume or if suspend failed. + */ +static void dmz_resume(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + int i; + + queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_resume_reclaim(dmz->dev[i].reclaim); +} + +static int dmz_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dmz_target *dmz = ti->private; + unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); + sector_t capacity; + int i, r; + + for (i = 0; i < dmz->nr_ddevs; i++) { + capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[i], 0, capacity, data); + if (r) + break; + } + return r; +} + +static void dmz_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + ssize_t sz = 0; + char buf[BDEVNAME_SIZE]; + struct dmz_dev *dev; + int i; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u zones %u/%u cache", + dmz_nr_zones(dmz->metadata), + dmz_nr_unmap_cache_zones(dmz->metadata), + dmz_nr_cache_zones(dmz->metadata)); + for (i = 0; i < dmz->nr_ddevs; i++) { + /* + * For a multi-device setup the first device + * contains only cache zones. + */ + if ((i == 0) && + (dmz_nr_cache_zones(dmz->metadata) > 0)) + continue; + DMEMIT(" %u/%u random %u/%u sequential", + dmz_nr_unmap_rnd_zones(dmz->metadata, i), + dmz_nr_rnd_zones(dmz->metadata, i), + dmz_nr_unmap_seq_zones(dmz->metadata, i), + dmz_nr_seq_zones(dmz->metadata, i)); + } + break; + case STATUSTYPE_TABLE: + dev = &dmz->dev[0]; + format_dev_t(buf, dev->bdev->bd_dev); + DMEMIT("%s", buf); + for (i = 1; i < dmz->nr_ddevs; i++) { + dev = &dmz->dev[i]; + format_dev_t(buf, dev->bdev->bd_dev); + DMEMIT(" %s", buf); + } + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } + return; +} + +static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + int r = -EINVAL; + + if (!strcasecmp(argv[0], "reclaim")) { + int i; + + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_schedule_reclaim(dmz->dev[i].reclaim); + r = 0; + } else + DMERR("unrecognized message %s", argv[0]); + return r; +} + +static struct target_type dmz_type = { + .name = "zoned", + .version = {2, 0, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL, + .module = THIS_MODULE, + .ctr = dmz_ctr, + .dtr = dmz_dtr, + .map = dmz_map, + .io_hints = dmz_io_hints, + .prepare_ioctl = dmz_prepare_ioctl, + .postsuspend = dmz_suspend, + .resume = dmz_resume, + .iterate_devices = dmz_iterate_devices, + .status = dmz_status, + .message = dmz_message, +}; + +static int __init dmz_init(void) +{ + return dm_register_target(&dmz_type); +} + +static void __exit dmz_exit(void) +{ + dm_unregister_target(&dmz_type); +} + +module_init(dmz_init); +module_exit(dmz_exit); + +MODULE_DESCRIPTION(DM_NAME " target for zoned block devices"); +MODULE_AUTHOR("Damien Le Moal "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h new file mode 100644 index 000000000..265494d3f --- /dev/null +++ b/drivers/md/dm-zoned.h @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#ifndef DM_ZONED_H +#define DM_ZONED_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * dm-zoned creates block devices with 4KB blocks, always. + */ +#define DMZ_BLOCK_SHIFT 12 +#define DMZ_BLOCK_SIZE (1 << DMZ_BLOCK_SHIFT) +#define DMZ_BLOCK_MASK (DMZ_BLOCK_SIZE - 1) + +#define DMZ_BLOCK_SHIFT_BITS (DMZ_BLOCK_SHIFT + 3) +#define DMZ_BLOCK_SIZE_BITS (1 << DMZ_BLOCK_SHIFT_BITS) +#define DMZ_BLOCK_MASK_BITS (DMZ_BLOCK_SIZE_BITS - 1) + +#define DMZ_BLOCK_SECTORS_SHIFT (DMZ_BLOCK_SHIFT - SECTOR_SHIFT) +#define DMZ_BLOCK_SECTORS (DMZ_BLOCK_SIZE >> SECTOR_SHIFT) +#define DMZ_BLOCK_SECTORS_MASK (DMZ_BLOCK_SECTORS - 1) + +/* + * 4KB block <-> 512B sector conversion. + */ +#define dmz_blk2sect(b) ((sector_t)(b) << DMZ_BLOCK_SECTORS_SHIFT) +#define dmz_sect2blk(s) ((sector_t)(s) >> DMZ_BLOCK_SECTORS_SHIFT) + +#define dmz_bio_block(bio) dmz_sect2blk((bio)->bi_iter.bi_sector) +#define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio)) + +struct dmz_metadata; +struct dmz_reclaim; + +/* + * Zoned block device information. + */ +struct dmz_dev { + struct block_device *bdev; + struct dmz_metadata *metadata; + struct dmz_reclaim *reclaim; + + uuid_t uuid; + + sector_t capacity; + + unsigned int dev_idx; + + unsigned int nr_zones; + unsigned int zone_offset; + + unsigned int flags; + + sector_t zone_nr_sectors; + + unsigned int nr_rnd; + atomic_t unmap_nr_rnd; + struct list_head unmap_rnd_list; + struct list_head map_rnd_list; + + unsigned int nr_seq; + atomic_t unmap_nr_seq; + struct list_head unmap_seq_list; + struct list_head map_seq_list; +}; + +#define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \ + dmz_zone_nr_sectors_shift(zmd)) +#define dmz_chunk_block(zmd, b) ((b) & (dmz_zone_nr_blocks(zmd) - 1)) + +/* Device flags. */ +#define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) +#define DMZ_BDEV_REGULAR (4 << 0) + +/* + * Zone descriptor. + */ +struct dm_zone { + /* For listing the zone depending on its state */ + struct list_head link; + + /* Device containing this zone */ + struct dmz_dev *dev; + + /* Zone type and state */ + unsigned long flags; + + /* Zone activation reference count */ + atomic_t refcount; + + /* Zone id */ + unsigned int id; + + /* Zone write pointer block (relative to the zone start block) */ + unsigned int wp_block; + + /* Zone weight (number of valid blocks in the zone) */ + unsigned int weight; + + /* The chunk that the zone maps */ + unsigned int chunk; + + /* + * For a sequential data zone, pointer to the random zone + * used as a buffer for processing unaligned writes. + * For a buffer zone, this points back to the data zone. + */ + struct dm_zone *bzone; +}; + +/* + * Zone flags. + */ +enum { + /* Zone write type */ + DMZ_CACHE, + DMZ_RND, + DMZ_SEQ, + + /* Zone critical condition */ + DMZ_OFFLINE, + DMZ_READ_ONLY, + + /* How the zone is being used */ + DMZ_META, + DMZ_DATA, + DMZ_BUF, + DMZ_RESERVED, + + /* Zone internal state */ + DMZ_RECLAIM, + DMZ_SEQ_WRITE_ERR, + DMZ_RECLAIM_TERMINATE, +}; + +/* + * Zone data accessors. + */ +#define dmz_is_cache(z) test_bit(DMZ_CACHE, &(z)->flags) +#define dmz_is_rnd(z) test_bit(DMZ_RND, &(z)->flags) +#define dmz_is_seq(z) test_bit(DMZ_SEQ, &(z)->flags) +#define dmz_is_empty(z) ((z)->wp_block == 0) +#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags) +#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) +#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) +#define dmz_is_reserved(z) test_bit(DMZ_RESERVED, &(z)->flags) +#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) +#define dmz_reclaim_should_terminate(z) \ + test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags) + +#define dmz_is_meta(z) test_bit(DMZ_META, &(z)->flags) +#define dmz_is_buf(z) test_bit(DMZ_BUF, &(z)->flags) +#define dmz_is_data(z) test_bit(DMZ_DATA, &(z)->flags) + +#define dmz_weight(z) ((z)->weight) + +/* + * Message functions. + */ +#define dmz_dev_info(dev, format, args...) \ + DMINFO("(%pg): " format, (dev)->bdev, ## args) + +#define dmz_dev_err(dev, format, args...) \ + DMERR("(%pg): " format, (dev)->bdev, ## args) + +#define dmz_dev_warn(dev, format, args...) \ + DMWARN("(%pg): " format, (dev)->bdev, ## args) + +#define dmz_dev_debug(dev, format, args...) \ + DMDEBUG("(%pg): " format, (dev)->bdev, ## args) + +/* + * Functions defined in dm-zoned-metadata.c + */ +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **zmd, const char *devname); +void dmz_dtr_metadata(struct dmz_metadata *zmd); +int dmz_resume_metadata(struct dmz_metadata *zmd); + +void dmz_lock_map(struct dmz_metadata *zmd); +void dmz_unlock_map(struct dmz_metadata *zmd); +void dmz_lock_metadata(struct dmz_metadata *zmd); +void dmz_unlock_metadata(struct dmz_metadata *zmd); +void dmz_lock_flush(struct dmz_metadata *zmd); +void dmz_unlock_flush(struct dmz_metadata *zmd); +int dmz_flush_metadata(struct dmz_metadata *zmd); +const char *dmz_metadata_label(struct dmz_metadata *zmd); + +sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); +sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); +unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); + +bool dmz_check_dev(struct dmz_metadata *zmd); +bool dmz_dev_is_dying(struct dmz_metadata *zmd); + +#define DMZ_ALLOC_RND 0x01 +#define DMZ_ALLOC_CACHE 0x02 +#define DMZ_ALLOC_SEQ 0x04 +#define DMZ_ALLOC_RECLAIM 0x10 + +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, + unsigned int dev_idx, unsigned long flags); +void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); + +void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, + unsigned int chunk); +void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); +unsigned int dmz_nr_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd); + +/* + * Activate a zone (increment its reference count). + */ +static inline void dmz_activate_zone(struct dm_zone *zone) +{ + atomic_inc(&zone->refcount); +} + +int dmz_lock_zone_reclaim(struct dm_zone *zone); +void dmz_unlock_zone_reclaim(struct dm_zone *zone); +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle); + +struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, + unsigned int chunk, enum req_op op); +void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *zone); +struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, + struct dm_zone *dzone); + +int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block, unsigned int nr_blocks); +int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block, unsigned int nr_blocks); +int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t chunk_block); +int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, + sector_t *chunk_block); +int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, + struct dm_zone *to_zone); +int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, + struct dm_zone *to_zone, sector_t chunk_block); + +/* + * Functions defined in dm-zoned-reclaim.c + */ +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx); +void dmz_dtr_reclaim(struct dmz_reclaim *zrc); +void dmz_suspend_reclaim(struct dmz_reclaim *zrc); +void dmz_resume_reclaim(struct dmz_reclaim *zrc); +void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc); +void dmz_schedule_reclaim(struct dmz_reclaim *zrc); + +/* + * Functions defined in dm-zoned-target.c + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); + +/* + * Deactivate a zone. This decrement the zone reference counter + * indicating that all BIOs to the zone have completed when the count is 0. + */ +static inline void dmz_deactivate_zone(struct dm_zone *zone) +{ + dmz_reclaim_bio_acc(zone->dev->reclaim); + atomic_dec(&zone->refcount); +} + +/* + * Test if a zone is active, that is, has a refcount > 0. + */ +static inline bool dmz_is_active(struct dm_zone *zone) +{ + return atomic_read(&zone->refcount); +} + +#endif /* DM_ZONED_H */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c new file mode 100644 index 000000000..0ec85d159 --- /dev/null +++ b/drivers/md/dm.c @@ -0,0 +1,3397 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-core.h" +#include "dm-rq.h" +#include "dm-uevent.h" +#include "dm-ima.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "core" + +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + +/* + * For REQ_POLLED fs bio, this flag is set if we link mapped underlying + * dm_io into one list, and reuse bio->bi_private as the list head. Before + * ending this fs bio, we will recover its ->bi_private. + */ +#define REQ_DM_POLL_LIST REQ_DRV + +static const char *_name = DM_NAME; + +static unsigned int major = 0; +static unsigned int _major = 0; + +static DEFINE_IDR(_minor_idr); + +static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static DECLARE_WORK(deferred_remove_work, do_deferred_remove); + +static struct workqueue_struct *deferred_remove_workqueue; + +atomic_t dm_global_event_nr = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); + +void dm_issue_global_event(void) +{ + atomic_inc(&dm_global_event_nr); + wake_up(&dm_global_eventq); +} + +DEFINE_STATIC_KEY_FALSE(stats_enabled); +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); +DEFINE_STATIC_KEY_FALSE(zoned_enabled); + +/* + * One of these is allocated (on-stack) per original bio. + */ +struct clone_info { + struct dm_table *map; + struct bio *bio; + struct dm_io *io; + sector_t sector; + unsigned int sector_count; + bool is_abnormal_io:1; + bool submit_as_polled:1; +}; + +static inline struct dm_target_io *clone_to_tio(struct bio *clone) +{ + return container_of(clone, struct dm_target_io, clone); +} + +void *dm_per_bio_data(struct bio *bio, size_t data_size) +{ + if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO)) + return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; + return (char *)bio - DM_IO_BIO_OFFSET - data_size; +} +EXPORT_SYMBOL_GPL(dm_per_bio_data); + +struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) +{ + struct dm_io *io = (struct dm_io *)((char *)data + data_size); + if (io->magic == DM_IO_MAGIC) + return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); + BUG_ON(io->magic != DM_TIO_MAGIC); + return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); +} +EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); + +unsigned int dm_bio_get_target_bio_nr(const struct bio *bio) +{ + return container_of(bio, struct dm_target_io, clone)->target_bio_nr; +} +EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); + +#define MINOR_ALLOCED ((void *)-1) + +#define DM_NUMA_NODE NUMA_NO_NODE +static int dm_numa_node = DM_NUMA_NODE; + +#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) +static int swap_bios = DEFAULT_SWAP_BIOS; +static int get_swap_bios(void) +{ + int latch = READ_ONCE(swap_bios); + if (unlikely(latch <= 0)) + latch = DEFAULT_SWAP_BIOS; + return latch; +} + +struct table_device { + struct list_head list; + refcount_t count; + struct dm_dev dm_dev; +}; + +/* + * Bio-based DM's mempools' reserved IOs set by the user. + */ +#define RESERVED_BIO_BASED_IOS 16 +static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; + +static int __dm_get_module_param_int(int *module_param, int min, int max) +{ + int param = READ_ONCE(*module_param); + int modified_param = 0; + bool modified = true; + + if (param < min) + modified_param = min; + else if (param > max) + modified_param = max; + else + modified = false; + + if (modified) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + +unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max) +{ + unsigned int param = READ_ONCE(*module_param); + unsigned int modified_param = 0; + + if (!param) + modified_param = def; + else if (param > max) + modified_param = max; + + if (modified_param) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + +unsigned int dm_get_reserved_bio_based_ios(void) +{ + return __dm_get_module_param(&reserved_bio_based_ios, + RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); + +static unsigned int dm_get_numa_node(void) +{ + return __dm_get_module_param_int(&dm_numa_node, + DM_NUMA_NODE, num_online_nodes() - 1); +} + +static int __init local_init(void) +{ + int r; + + r = dm_uevent_init(); + if (r) + return r; + + deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); + if (!deferred_remove_workqueue) { + r = -ENOMEM; + goto out_uevent_exit; + } + + _major = major; + r = register_blkdev(_major, _name); + if (r < 0) + goto out_free_workqueue; + + if (!_major) + _major = r; + + return 0; + +out_free_workqueue: + destroy_workqueue(deferred_remove_workqueue); +out_uevent_exit: + dm_uevent_exit(); + + return r; +} + +static void local_exit(void) +{ + destroy_workqueue(deferred_remove_workqueue); + + unregister_blkdev(_major, _name); + dm_uevent_exit(); + + _major = 0; + + DMINFO("cleaned up"); +} + +static int (*_inits[])(void) __initdata = { + local_init, + dm_target_init, + dm_linear_init, + dm_stripe_init, + dm_io_init, + dm_kcopyd_init, + dm_interface_init, + dm_statistics_init, +}; + +static void (*_exits[])(void) = { + local_exit, + dm_target_exit, + dm_linear_exit, + dm_stripe_exit, + dm_io_exit, + dm_kcopyd_exit, + dm_interface_exit, + dm_statistics_exit, +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + int r, i; + +#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) + DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." + " Duplicate IMA measurements will not be recorded in the IMA log."); +#endif + + for (i = 0; i < count; i++) { + r = _inits[i](); + if (r) + goto bad; + } + + return 0; +bad: + while (i--) + _exits[i](); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_exits); + + while (i--) + _exits[i](); + + /* + * Should be empty by this point. + */ + idr_destroy(&_minor_idr); +} + +/* + * Block device functions + */ +int dm_deleting_md(struct mapped_device *md) +{ + return test_bit(DMF_DELETING, &md->flags); +} + +static int dm_blk_open(struct block_device *bdev, fmode_t mode) +{ + struct mapped_device *md; + + spin_lock(&_minor_lock); + + md = bdev->bd_disk->private_data; + if (!md) + goto out; + + if (test_bit(DMF_FREEING, &md->flags) || + dm_deleting_md(md)) { + md = NULL; + goto out; + } + + dm_get(md); + atomic_inc(&md->open_count); +out: + spin_unlock(&_minor_lock); + + return md ? 0 : -ENXIO; +} + +static void dm_blk_close(struct gendisk *disk, fmode_t mode) +{ + struct mapped_device *md; + + spin_lock(&_minor_lock); + + md = disk->private_data; + if (WARN_ON(!md)) + goto out; + + if (atomic_dec_and_test(&md->open_count) && + (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) + queue_work(deferred_remove_workqueue, &deferred_remove_work); + + dm_put(md); +out: + spin_unlock(&_minor_lock); +} + +int dm_open_count(struct mapped_device *md) +{ + return atomic_read(&md->open_count); +} + +/* + * Guarantees nothing is using the device before it's deleted. + */ +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (dm_open_count(md)) { + r = -EBUSY; + if (mark_deferred) + set_bit(DMF_DEFERRED_REMOVE, &md->flags); + } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) + r = -EEXIST; + else + set_bit(DMF_DELETING, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +int dm_cancel_deferred_remove(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (test_bit(DMF_DELETING, &md->flags)) + r = -EBUSY; + else + clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ + dm_deferred_remove(); +} + +static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + + return dm_get_geometry(md, geo); +} + +static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, + struct block_device **bdev) +{ + struct dm_target *ti; + struct dm_table *map; + int r; + +retry: + r = -ENOTTY; + map = dm_get_live_table(md, srcu_idx); + if (!map || !dm_table_get_size(map)) + return r; + + /* We only support devices that have a single target */ + if (map->num_targets != 1) + return r; + + ti = dm_table_get_target(map, 0); + if (!ti->type->prepare_ioctl) + return r; + + if (dm_suspended_md(md)) + return -EAGAIN; + + r = ti->type->prepare_ioctl(ti, bdev); + if (r == -ENOTCONN && !fatal_signal_pending(current)) { + dm_put_live_table(md, *srcu_idx); + msleep(10); + goto retry; + } + + return r; +} + +static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) +{ + dm_put_live_table(md, srcu_idx); +} + +static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + int r, srcu_idx; + + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + if (r < 0) + goto out; + + if (r > 0) { + /* + * Target determined this ioctl is being issued against a + * subset of the parent bdev; require extra privileges. + */ + if (!capable(CAP_SYS_RAWIO)) { + DMDEBUG_LIMIT( + "%s: sending ioctl %x to DM device without required privilege.", + current->comm, cmd); + r = -ENOIOCTLCMD; + goto out; + } + } + + if (!bdev->bd_disk->fops->ioctl) + r = -ENOTTY; + else + r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); +out: + dm_unprepare_ioctl(md, srcu_idx); + return r; +} + +u64 dm_start_time_ns_from_clone(struct bio *bio) +{ + return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); +} +EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); + +static bool bio_is_flush_with_data(struct bio *bio) +{ + return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); +} + +static void dm_io_acct(struct dm_io *io, bool end) +{ + struct dm_stats_aux *stats_aux = &io->stats_aux; + unsigned long start_time = io->start_time; + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; + unsigned int sectors; + + /* + * If REQ_PREFLUSH set, don't account payload, it will be + * submitted (and accounted) after this flush completes. + */ + if (bio_is_flush_with_data(bio)) + sectors = 0; + else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) + sectors = bio_sectors(bio); + else + sectors = io->sectors; + + if (!end) + bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), + start_time); + else + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) { + sector_t sector; + + if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) + sector = bio->bi_iter.bi_sector; + else + sector = bio_end_sector(bio) - io->sector_offset; + + dm_stats_account_io(&md->stats, bio_data_dir(bio), + sector, sectors, + end, start_time, stats_aux); + } +} + +static void __dm_start_io_acct(struct dm_io *io) +{ + dm_io_acct(io, false); +} + +static void dm_start_io_acct(struct dm_io *io, struct bio *clone) +{ + /* + * Ensure IO accounting is only ever started once. + */ + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + return; + + /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ + if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { + dm_io_set_flag(io, DM_IO_ACCOUNTED); + } else { + unsigned long flags; + /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ + spin_lock_irqsave(&io->lock, flags); + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) { + spin_unlock_irqrestore(&io->lock, flags); + return; + } + dm_io_set_flag(io, DM_IO_ACCOUNTED); + spin_unlock_irqrestore(&io->lock, flags); + } + + __dm_start_io_acct(io); +} + +static void dm_end_io_acct(struct dm_io *io) +{ + dm_io_acct(io, true); +} + +static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) +{ + struct dm_io *io; + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); + tio = clone_to_tio(clone); + tio->flags = 0; + dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); + tio->io = NULL; + + io = container_of(tio, struct dm_io, tio); + io->magic = DM_IO_MAGIC; + io->status = BLK_STS_OK; + + /* one ref is for submission, the other is for completion */ + atomic_set(&io->io_count, 2); + this_cpu_inc(*md->pending_io); + io->orig_bio = bio; + io->md = md; + spin_lock_init(&io->lock); + io->start_time = jiffies; + io->flags = 0; + + if (static_branch_unlikely(&stats_enabled)) + dm_stats_record_start(&md->stats, &io->stats_aux); + + return io; +} + +static void free_io(struct dm_io *io) +{ + bio_put(&io->tio.clone); +} + +static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, + unsigned int target_bio_nr, unsigned int *len, gfp_t gfp_mask) +{ + struct mapped_device *md = ci->io->md; + struct dm_target_io *tio; + struct bio *clone; + + if (!ci->io->tio.io) { + /* the dm_target_io embedded in ci->io is available */ + tio = &ci->io->tio; + /* alloc_io() already initialized embedded clone */ + clone = &tio->clone; + } else { + clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, + &md->mempools->bs); + if (!clone) + return NULL; + + /* REQ_DM_POLL_LIST shouldn't be inherited */ + clone->bi_opf &= ~REQ_DM_POLL_LIST; + + tio = clone_to_tio(clone); + tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ + } + + tio->magic = DM_TIO_MAGIC; + tio->io = ci->io; + tio->ti = ti; + tio->target_bio_nr = target_bio_nr; + tio->len_ptr = len; + tio->old_sector = 0; + + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; + if (unlikely(ti->needs_bio_set_dev)) + bio_set_dev(clone, md->disk->part0); + + if (len) { + clone->bi_iter.bi_size = to_bytes(*len); + if (bio_integrity(clone)) + bio_integrity_trim(clone); + } + + return clone; +} + +static void free_tio(struct bio *clone) +{ + if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO)) + return; + bio_put(clone); +} + +/* + * Add the bio to the list of deferred io. + */ +static void queue_io(struct mapped_device *md, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&md->deferred_lock, flags); + bio_list_add(&md->deferred, bio); + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); +} + +/* + * Everyone (including functions in this file), should use this + * function to access the md->map field, and make sure they call + * dm_put_live_table() when finished. + */ +struct dm_table *dm_get_live_table(struct mapped_device *md, + int *srcu_idx) __acquires(md->io_barrier) +{ + *srcu_idx = srcu_read_lock(&md->io_barrier); + + return srcu_dereference(md->map, &md->io_barrier); +} + +void dm_put_live_table(struct mapped_device *md, + int srcu_idx) __releases(md->io_barrier) +{ + srcu_read_unlock(&md->io_barrier, srcu_idx); +} + +void dm_sync_table(struct mapped_device *md) +{ + synchronize_srcu(&md->io_barrier); + synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. + */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(md->map); +} + +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ + rcu_read_unlock(); +} + +static char *_dm_claim_ptr = "I belong to device-mapper"; + +/* + * Open a table device so we can use it as a map destination. + */ +static struct table_device *open_table_device(struct mapped_device *md, + dev_t dev, fmode_t mode) +{ + struct table_device *td; + struct block_device *bdev; + u64 part_off; + int r; + + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); + if (!td) + return ERR_PTR(-ENOMEM); + refcount_set(&td->count, 1); + + bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr); + if (IS_ERR(bdev)) { + r = PTR_ERR(bdev); + goto out_free_td; + } + + /* + * We can be called before the dm disk is added. In that case we can't + * register the holder relation here. It will be done once add_disk was + * called. + */ + if (md->disk->slave_dir) { + r = bd_link_disk_holder(bdev, md->disk); + if (r) + goto out_blkdev_put; + } + + td->dm_dev.mode = mode; + td->dm_dev.bdev = bdev; + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); + format_dev_t(td->dm_dev.name, dev); + list_add(&td->list, &md->table_devices); + return td; + +out_blkdev_put: + blkdev_put(bdev, mode | FMODE_EXCL); +out_free_td: + kfree(td); + return ERR_PTR(r); +} + +/* + * Close a table device that we've been using. + */ +static void close_table_device(struct table_device *td, struct mapped_device *md) +{ + if (md->disk->slave_dir) + bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + put_dax(td->dm_dev.dax_dev); + list_del(&td->list); + kfree(td); +} + +static struct table_device *find_table_device(struct list_head *l, dev_t dev, + fmode_t mode) +{ + struct table_device *td; + + list_for_each_entry(td, l, list) + if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + return td; + + return NULL; +} + +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result) +{ + struct table_device *td; + + mutex_lock(&md->table_devices_lock); + td = find_table_device(&md->table_devices, dev, mode); + if (!td) { + td = open_table_device(md, dev, mode); + if (IS_ERR(td)) { + mutex_unlock(&md->table_devices_lock); + return PTR_ERR(td); + } + } else { + refcount_inc(&td->count); + } + mutex_unlock(&md->table_devices_lock); + + *result = &td->dm_dev; + return 0; +} + +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) +{ + struct table_device *td = container_of(d, struct table_device, dm_dev); + + mutex_lock(&md->table_devices_lock); + if (refcount_dec_and_test(&td->count)) + close_table_device(td, md); + mutex_unlock(&md->table_devices_lock); +} + +static void free_table_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct table_device *td = list_entry(tmp, struct table_device, list); + + DMWARN("dm_destroy: %s still exists with %d references", + td->dm_dev.name, refcount_read(&td->count)); + kfree(td); + } +} + +/* + * Get the geometry associated with a dm device + */ +int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + *geo = md->geometry; + + return 0; +} + +/* + * Set the geometry of a device. + */ +int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; + + if (geo->start > sz) { + DMERR("Start sector is beyond the geometry limits."); + return -EINVAL; + } + + md->geometry = *geo; + + return 0; +} + +static int __noflush_suspending(struct mapped_device *md) +{ + return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); +} + +static void dm_requeue_add_io(struct dm_io *io, bool first_stage) +{ + struct mapped_device *md = io->md; + + if (first_stage) { + struct dm_io *next = md->requeue_list; + + md->requeue_list = io; + io->next = next; + } else { + bio_list_add_head(&md->deferred, io->orig_bio); + } +} + +static void dm_kick_requeue(struct mapped_device *md, bool first_stage) +{ + if (first_stage) + queue_work(md->wq, &md->requeue_work); + else + queue_work(md->wq, &md->work); +} + +/* + * Return true if the dm_io's original bio is requeued. + * io->status is updated with error if requeue disallowed. + */ +static bool dm_handle_requeue(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); + bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) && + (bio->bi_opf & REQ_POLLED)); + struct mapped_device *md = io->md; + bool requeued = false; + + if (handle_requeue || handle_polled_eagain) { + unsigned long flags; + + if (bio->bi_opf & REQ_POLLED) { + /* + * Upper layer won't help us poll split bio + * (io->orig_bio may only reflect a subset of the + * pre-split original) so clear REQ_POLLED. + */ + bio_clear_polled(bio); + } + + /* + * Target requested pushing back the I/O or + * polled IO hit BLK_STS_AGAIN. + */ + spin_lock_irqsave(&md->deferred_lock, flags); + if ((__noflush_suspending(md) && + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) || + handle_polled_eagain || first_stage) { + dm_requeue_add_io(io, first_stage); + requeued = true; + } else { + /* + * noflush suspend was interrupted or this is + * a write to a zoned target. + */ + io->status = BLK_STS_IOERR; + } + spin_unlock_irqrestore(&md->deferred_lock, flags); + } + + if (requeued) + dm_kick_requeue(md, first_stage); + + return requeued; +} + +static void __dm_io_complete(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + struct mapped_device *md = io->md; + blk_status_t io_error; + bool requeued; + + requeued = dm_handle_requeue(io, first_stage); + if (requeued && first_stage) + return; + + io_error = io->status; + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + dm_end_io_acct(io); + else if (!io_error) { + /* + * Must handle target that DM_MAPIO_SUBMITTED only to + * then bio_endio() rather than dm_submit_bio_remap() + */ + __dm_start_io_acct(io); + dm_end_io_acct(io); + } + free_io(io); + smp_wmb(); + this_cpu_dec(*md->pending_io); + + /* nudge anyone waiting on suspend queue */ + if (unlikely(wq_has_sleeper(&md->wait))) + wake_up(&md->wait); + + /* Return early if the original bio was requeued */ + if (requeued) + return; + + if (bio_is_flush_with_data(bio)) { + /* + * Preflush done for flush with data, reissue + * without REQ_PREFLUSH. + */ + bio->bi_opf &= ~REQ_PREFLUSH; + queue_io(md, bio); + } else { + /* done with normal IO or empty flush */ + if (io_error) + bio->bi_status = io_error; + bio_endio(bio); + } +} + +static void dm_wq_requeue_work(struct work_struct *work) +{ + struct mapped_device *md = container_of(work, struct mapped_device, + requeue_work); + unsigned long flags; + struct dm_io *io; + + /* reuse deferred lock to simplify dm_handle_requeue */ + spin_lock_irqsave(&md->deferred_lock, flags); + io = md->requeue_list; + md->requeue_list = NULL; + spin_unlock_irqrestore(&md->deferred_lock, flags); + + while (io) { + struct dm_io *next = io->next; + + dm_io_rewind(io, &md->disk->bio_split); + + io->next = NULL; + __dm_io_complete(io, false); + io = next; + cond_resched(); + } +} + +/* + * Two staged requeue: + * + * 1) io->orig_bio points to the real original bio, and the part mapped to + * this io must be requeued, instead of other parts of the original bio. + * + * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. + */ +static void dm_io_complete(struct dm_io *io) +{ + bool first_requeue; + + /* + * Only dm_io that has been split needs two stage requeue, otherwise + * we may run into long bio clone chain during suspend and OOM could + * be triggered. + * + * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they + * also aren't handled via the first stage requeue. + */ + if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) + first_requeue = true; + else + first_requeue = false; + + __dm_io_complete(io, first_requeue); +} + +/* + * Decrements the number of outstanding ios that a bio has been + * cloned into, completing the original io if necc. + */ +static inline void __dm_io_dec_pending(struct dm_io *io) +{ + if (atomic_dec_and_test(&io->io_count)) + dm_io_complete(io); +} + +static void dm_io_set_error(struct dm_io *io, blk_status_t error) +{ + unsigned long flags; + + /* Push-back supersedes any I/O errors */ + spin_lock_irqsave(&io->lock, flags); + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(io->md))) { + io->status = error; + } + spin_unlock_irqrestore(&io->lock, flags); +} + +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +{ + if (unlikely(error)) + dm_io_set_error(io, error); + + __dm_io_dec_pending(io); +} + +void disable_discard(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support DISCARD, disable it */ + limits->max_discard_sectors = 0; +} + +void disable_write_zeroes(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE ZEROES, disable it */ + limits->max_write_zeroes_sectors = 0; +} + +static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) +{ + return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); +} + +static void clone_endio(struct bio *bio) +{ + blk_status_t error = bio->bi_status; + struct dm_target_io *tio = clone_to_tio(bio); + struct dm_target *ti = tio->ti; + dm_endio_fn endio = ti->type->end_io; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + + if (unlikely(error == BLK_STS_TARGET)) { + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev)) + disable_discard(md); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !bdev_write_zeroes_sectors(bio->bi_bdev)) + disable_write_zeroes(md); + } + + if (static_branch_unlikely(&zoned_enabled) && + unlikely(bdev_is_zoned(bio->bi_bdev))) + dm_zone_endio(io, bio); + + if (endio) { + int r = endio(ti, bio, &error); + switch (r) { + case DM_ENDIO_REQUEUE: + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; + } else + error = BLK_STS_DM_REQUEUE; + fallthrough; + case DM_ENDIO_DONE: + break; + case DM_ENDIO_INCOMPLETE: + /* The target will handle the io */ + return; + default: + DMCRIT("unimplemented target endio return value: %d", r); + BUG(); + } + } + + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, bio))) + up(&md->swap_bios_semaphore); + + free_tio(bio); + dm_io_dec_pending(io, error); +} + +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. + */ +static inline sector_t max_io_len_target_boundary(struct dm_target *ti, + sector_t target_offset) +{ + return ti->len - target_offset; +} + +static sector_t max_io_len(struct dm_target *ti, sector_t sector) +{ + sector_t target_offset = dm_target_offset(ti, sector); + sector_t len = max_io_len_target_boundary(ti, target_offset); + + /* + * Does the target need to split IO even further? + * - varied (per target) IO splitting is a tenet of DM; this + * explains why stacked chunk_sectors based splitting via + * bio_split_to_limits() isn't possible here. + */ + if (!ti->max_io_len) + return len; + return min_t(sector_t, len, + min(queue_max_sectors(ti->table->md->queue), + blk_chunk_sectors_left(target_offset, ti->max_io_len))); +} + +int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) +{ + if (len > UINT_MAX) { + DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", + (unsigned long long)len, UINT_MAX); + ti->error = "Maximum size of target IO is too large"; + return -EINVAL; + } + + ti->max_io_len = (uint32_t) len; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); + +static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, + sector_t sector, int *srcu_idx) + __acquires(md->io_barrier) +{ + struct dm_table *map; + struct dm_target *ti; + + map = dm_get_live_table(md, srcu_idx); + if (!map) + return NULL; + + ti = dm_table_find_target(map, sector); + if (!ti) + return NULL; + + return ti; +} + +static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long len, ret = -EIO; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (!ti->type->direct_access) + goto out; + len = max_io_len(ti, sector) / PAGE_SECTORS; + if (len < 1) + goto out; + nr_pages = min(len, nr_pages); + ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); + + out: + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int ret = -EIO; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (WARN_ON(!ti->type->dax_zero_page_range)) { + /* + * ->zero_page_range() is mandatory dax operation. If we are + * here, something is wrong. + */ + goto out; + } + ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); + out: + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + long ret = 0; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti || !ti->type->dax_recovery_write) + goto out; + + ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); +out: + dm_put_live_table(md, srcu_idx); + return ret; +} + +/* + * A target may call dm_accept_partial_bio only from the map routine. It is + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management + * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by + * __send_duplicate_bios(). + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <----- bio_sectors -----> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) +{ + struct dm_target_io *tio = clone_to_tio(bio); + struct dm_io *io = tio->io; + unsigned int bio_sectors = bio_sectors(bio); + + BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); + BUG_ON(op_is_zone_mgmt(bio_op(bio))); + BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); + BUG_ON(bio_sectors > *tio->len_ptr); + BUG_ON(n_sectors > bio_sectors); + + *tio->len_ptr -= bio_sectors - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; + + /* + * __split_and_process_bio() may have already saved mapped part + * for accounting but it is being reduced so update accordingly. + */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = n_sectors; + io->sector_offset = bio_sectors(io->orig_bio); +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + +/* + * @clone: clone bio that DM core passed to target's .map function + * @tgt_clone: clone of @clone bio that target needs submitted + * + * Targets should use this interface to submit bios they take + * ownership of when returning DM_MAPIO_SUBMITTED. + * + * Target should also enable ti->accounts_remapped_io + */ +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) +{ + struct dm_target_io *tio = clone_to_tio(clone); + struct dm_io *io = tio->io; + + /* establish bio that will get submitted */ + if (!tgt_clone) + tgt_clone = clone; + + /* + * Account io->origin_bio to DM dev on behalf of target + * that took ownership of IO with DM_MAPIO_SUBMITTED. + */ + dm_start_io_acct(io, clone); + + trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), + tio->old_sector); + submit_bio_noacct(tgt_clone); +} +EXPORT_SYMBOL_GPL(dm_submit_bio_remap); + +static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) +{ + mutex_lock(&md->swap_bios_lock); + while (latch < md->swap_bios) { + cond_resched(); + down(&md->swap_bios_semaphore); + md->swap_bios--; + } + while (latch > md->swap_bios) { + cond_resched(); + up(&md->swap_bios_semaphore); + md->swap_bios++; + } + mutex_unlock(&md->swap_bios_lock); +} + +static void __map_bio(struct bio *clone) +{ + struct dm_target_io *tio = clone_to_tio(clone); + struct dm_target *ti = tio->ti; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + int r; + + clone->bi_end_io = clone_endio; + + /* + * Map the clone. + */ + tio->old_sector = clone->bi_iter.bi_sector; + + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) { + int latch = get_swap_bios(); + if (unlikely(latch != md->swap_bios)) + __set_swap_bios_limit(md, latch); + down(&md->swap_bios_semaphore); + } + + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Check if the IO needs a special mapping due to zone append + * emulation on zoned target. In this case, dm_zone_map_bio() + * calls the target map operation. + */ + if (unlikely(dm_emulate_zone_append(md))) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + } else + r = ti->type->map(ti, clone); + + switch (r) { + case DM_MAPIO_SUBMITTED: + /* target has assumed ownership of this io */ + if (!ti->accounts_remapped_io) + dm_start_io_acct(io, clone); + break; + case DM_MAPIO_REMAPPED: + dm_submit_bio_remap(clone, NULL); + break; + case DM_MAPIO_KILL: + case DM_MAPIO_REQUEUE: + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) + up(&md->swap_bios_semaphore); + free_tio(clone); + if (r == DM_MAPIO_KILL) + dm_io_dec_pending(io, BLK_STS_IOERR); + else + dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); + break; + default: + DMCRIT("unimplemented target map return value: %d", r); + BUG(); + } +} + +static void setup_split_accounting(struct clone_info *ci, unsigned int len) +{ + struct dm_io *io = ci->io; + + if (ci->sector_count > len) { + /* + * Split needed, save the mapped part for accounting. + * NOTE: dm_accept_partial_bio() will update accordingly. + */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = len; + io->sector_offset = bio_sectors(ci->bio); + } +} + +static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, + struct dm_target *ti, unsigned int num_bios, + unsigned *len) +{ + struct bio *bio; + int try; + + for (try = 0; try < 2; try++) { + int bio_nr; + + if (try) + mutex_lock(&ci->io->md->table_devices_lock); + for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { + bio = alloc_tio(ci, ti, bio_nr, len, + try ? GFP_NOIO : GFP_NOWAIT); + if (!bio) + break; + + bio_list_add(blist, bio); + } + if (try) + mutex_unlock(&ci->io->md->table_devices_lock); + if (bio_nr == num_bios) + return; + + while ((bio = bio_list_pop(blist))) + free_tio(bio); + } +} + +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, + unsigned int num_bios, unsigned int *len) +{ + struct bio_list blist = BIO_EMPTY_LIST; + struct bio *clone; + unsigned int ret = 0; + + switch (num_bios) { + case 0: + break; + case 1: + if (len) + setup_split_accounting(ci, *len); + clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); + __map_bio(clone); + ret = 1; + break; + default: + if (len) + setup_split_accounting(ci, *len); + /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ + alloc_multiple_bios(&blist, ci, ti, num_bios, len); + while ((clone = bio_list_pop(&blist))) { + dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); + __map_bio(clone); + ret += 1; + } + break; + } + + return ret; +} + +static void __send_empty_flush(struct clone_info *ci) +{ + struct dm_table *t = ci->map; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + + ci->bio = &flush_bio; + ci->sector_count = 0; + ci->io->tio.clone.bi_iter.bi_size = 0; + + for (unsigned int i = 0; i < t->num_targets; i++) { + unsigned int bios; + struct dm_target *ti = dm_table_get_target(t, i); + + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following subtraction + */ + atomic_sub(1, &ci->io->io_count); + + bio_uninit(ci->bio); +} + +static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, + unsigned int num_bios) +{ + unsigned int len, bios; + + len = min_t(sector_t, ci->sector_count, + max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); + + atomic_add(num_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following (+1) subtraction + */ + atomic_sub(num_bios - bios + 1, &ci->io->io_count); + + ci->sector += len; + ci->sector_count -= len; +} + +static bool is_abnormal_io(struct bio *bio) +{ + enum req_op op = bio_op(bio); + + if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + break; + } + } + + return false; +} + +static blk_status_t __process_abnormal_io(struct clone_info *ci, + struct dm_target *ti) +{ + unsigned int num_bios = 0; + + switch (bio_op(ci->bio)) { + case REQ_OP_DISCARD: + num_bios = ti->num_discard_bios; + break; + case REQ_OP_SECURE_ERASE: + num_bios = ti->num_secure_erase_bios; + break; + case REQ_OP_WRITE_ZEROES: + num_bios = ti->num_write_zeroes_bios; + break; + default: + break; + } + + /* + * Even though the device advertised support for this type of + * request, that does not mean every target supports it, and + * reconfiguration might also have changed that since the + * check was performed. + */ + if (unlikely(!num_bios)) + return BLK_STS_NOTSUPP; + + __send_changing_extent_only(ci, ti, num_bios); + return BLK_STS_OK; +} + +/* + * Reuse ->bi_private as dm_io list head for storing all dm_io instances + * associated with this bio, and this bio's bi_private needs to be + * stored in dm_io->data before the reuse. + * + * bio->bi_private is owned by fs or upper layer, so block layer won't + * touch it after splitting. Meantime it won't be changed by anyone after + * bio is submitted. So this reuse is safe. + */ +static inline struct dm_io **dm_poll_list_head(struct bio *bio) +{ + return (struct dm_io **)&bio->bi_private; +} + +static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) +{ + struct dm_io **head = dm_poll_list_head(bio); + + if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { + bio->bi_opf |= REQ_DM_POLL_LIST; + /* + * Save .bi_private into dm_io, so that we can reuse + * .bi_private as dm_io list head for storing dm_io list + */ + io->data = bio->bi_private; + + /* tell block layer to poll for completion */ + bio->bi_cookie = ~BLK_QC_T_NONE; + + io->next = NULL; + } else { + /* + * bio recursed due to split, reuse original poll list, + * and save bio->bi_private too. + */ + io->data = (*head)->data; + io->next = *head; + } + + *head = io; +} + +/* + * Select the correct strategy for processing a non-flush bio. + */ +static blk_status_t __split_and_process_bio(struct clone_info *ci) +{ + struct bio *clone; + struct dm_target *ti; + unsigned int len; + + ti = dm_table_find_target(ci->map, ci->sector); + if (unlikely(!ti)) + return BLK_STS_IOERR; + + if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) && + unlikely(!dm_target_supports_nowait(ti->type))) + return BLK_STS_NOTSUPP; + + if (unlikely(ci->is_abnormal_io)) + return __process_abnormal_io(ci, ti); + + /* + * Only support bio polling for normal IO, and the target io is + * exactly inside the dm_io instance (verified in dm_poll_dm_io) + */ + ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); + + len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + setup_split_accounting(ci, len); + clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); + __map_bio(clone); + + ci->sector += len; + ci->sector_count -= len; + + return BLK_STS_OK; +} + +static void init_clone_info(struct clone_info *ci, struct mapped_device *md, + struct dm_table *map, struct bio *bio, bool is_abnormal) +{ + ci->map = map; + ci->io = alloc_io(md, bio); + ci->bio = bio; + ci->is_abnormal_io = is_abnormal; + ci->submit_as_polled = false; + ci->sector = bio->bi_iter.bi_sector; + ci->sector_count = bio_sectors(bio); + + /* Shouldn't happen but sector_count was being set to 0 so... */ + if (static_branch_unlikely(&zoned_enabled) && + WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) + ci->sector_count = 0; +} + +/* + * Entry point to split a bio into clones and submit them to the targets. + */ +static void dm_split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + struct clone_info ci; + struct dm_io *io; + blk_status_t error = BLK_STS_OK; + bool is_abnormal; + + is_abnormal = is_abnormal_io(bio); + if (unlikely(is_abnormal)) { + /* + * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) + * otherwise associated queue_limits won't be imposed. + */ + bio = bio_split_to_limits(bio); + if (!bio) + return; + } + + init_clone_info(&ci, md, map, bio, is_abnormal); + io = ci.io; + + if (bio->bi_opf & REQ_PREFLUSH) { + __send_empty_flush(&ci); + /* dm_io_complete submits any data associated with flush */ + goto out; + } + + error = __split_and_process_bio(&ci); + if (error || !ci.sector_count) + goto out; + /* + * Remainder must be passed to submit_bio_noacct() so it gets handled + * *after* bios already submitted have been completely processed. + */ + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); + submit_bio_noacct(bio); +out: + /* + * Drop the extra reference count for non-POLLED bio, and hold one + * reference for POLLED bio, which will be released in dm_poll_bio + * + * Add every dm_io instance into the dm_io list head which is stored + * in bio->bi_private, so that dm_poll_bio can poll them all. + */ + if (error || !ci.submit_as_polled) { + /* + * In case of submission failure, the extra reference for + * submitting io isn't consumed yet + */ + if (error) + atomic_dec(&io->io_count); + dm_io_dec_pending(io, error); + } else + dm_queue_poll_io(bio, io); +} + +static void dm_submit_bio(struct bio *bio) +{ + struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; + int srcu_idx; + struct dm_table *map; + + map = dm_get_live_table(md, &srcu_idx); + + /* If suspended, or map not yet available, queue this IO for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || + unlikely(!map)) { + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + else if (bio->bi_opf & REQ_RAHEAD) + bio_io_error(bio); + else + queue_io(md, bio); + goto out; + } + + dm_split_and_process_bio(md, map, bio); +out: + dm_put_live_table(md, srcu_idx); +} + +static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, + unsigned int flags) +{ + WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); + + /* don't poll if the mapped io is done */ + if (atomic_read(&io->io_count) > 1) + bio_poll(&io->tio.clone, iob, flags); + + /* bio_poll holds the last reference */ + return atomic_read(&io->io_count) == 1; +} + +static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, + unsigned int flags) +{ + struct dm_io **head = dm_poll_list_head(bio); + struct dm_io *list = *head; + struct dm_io *tmp = NULL; + struct dm_io *curr, *next; + + /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ + if (!(bio->bi_opf & REQ_DM_POLL_LIST)) + return 0; + + WARN_ON_ONCE(!list); + + /* + * Restore .bi_private before possibly completing dm_io. + * + * bio_poll() is only possible once @bio has been completely + * submitted via submit_bio_noacct()'s depth-first submission. + * So there is no dm_queue_poll_io() race associated with + * clearing REQ_DM_POLL_LIST here. + */ + bio->bi_opf &= ~REQ_DM_POLL_LIST; + bio->bi_private = list->data; + + for (curr = list, next = curr->next; curr; curr = next, next = + curr ? curr->next : NULL) { + if (dm_poll_dm_io(curr, iob, flags)) { + /* + * clone_endio() has already occurred, so no + * error handling is needed here. + */ + __dm_io_dec_pending(curr); + } else { + curr->next = tmp; + tmp = curr; + } + } + + /* Not done? */ + if (tmp) { + bio->bi_opf |= REQ_DM_POLL_LIST; + /* Reset bio->bi_private to dm_io list head */ + *head = tmp; + return 0; + } + return 1; +} + +/*----------------------------------------------------------------- + * An IDR is used to keep track of allocated minor numbers. + *---------------------------------------------------------------*/ +static void free_minor(int minor) +{ + spin_lock(&_minor_lock); + idr_remove(&_minor_idr, minor); + spin_unlock(&_minor_lock); +} + +/* + * See if the device with a specific minor # is free. + */ +static int specific_minor(int minor) +{ + int r; + + if (minor >= (1 << MINORBITS)) + return -EINVAL; + + idr_preload(GFP_KERNEL); + spin_lock(&_minor_lock); + + r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); + + spin_unlock(&_minor_lock); + idr_preload_end(); + if (r < 0) + return r == -ENOSPC ? -EBUSY : r; + return 0; +} + +static int next_free_minor(int *minor) +{ + int r; + + idr_preload(GFP_KERNEL); + spin_lock(&_minor_lock); + + r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); + + spin_unlock(&_minor_lock); + idr_preload_end(); + if (r < 0) + return r; + *minor = r; + return 0; +} + +static const struct block_device_operations dm_blk_dops; +static const struct block_device_operations dm_rq_blk_dops; +static const struct dax_operations dm_dax_ops; + +static void dm_wq_work(struct work_struct *work); + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION +static void dm_queue_destroy_crypto_profile(struct request_queue *q) +{ + dm_destroy_crypto_profile(q->crypto_profile); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) +{ +} +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + +static void cleanup_mapped_device(struct mapped_device *md) +{ + if (md->wq) + destroy_workqueue(md->wq); + dm_free_md_mempools(md->mempools); + + if (md->dax_dev) { + dax_remove_host(md->disk); + kill_dax(md->dax_dev); + put_dax(md->dax_dev); + md->dax_dev = NULL; + } + + dm_cleanup_zoned_dev(md); + if (md->disk) { + spin_lock(&_minor_lock); + md->disk->private_data = NULL; + spin_unlock(&_minor_lock); + if (dm_get_md_type(md) != DM_TYPE_NONE) { + struct table_device *td; + + dm_sysfs_exit(md); + list_for_each_entry(td, &md->table_devices, list) { + bd_unlink_disk_holder(td->dm_dev.bdev, + md->disk); + } + + /* + * Hold lock to make sure del_gendisk() won't concurrent + * with open/close_table_device(). + */ + mutex_lock(&md->table_devices_lock); + del_gendisk(md->disk); + mutex_unlock(&md->table_devices_lock); + } + dm_queue_destroy_crypto_profile(md->queue); + put_disk(md->disk); + } + + if (md->pending_io) { + free_percpu(md->pending_io); + md->pending_io = NULL; + } + + cleanup_srcu_struct(&md->io_barrier); + + mutex_destroy(&md->suspend_lock); + mutex_destroy(&md->type_lock); + mutex_destroy(&md->table_devices_lock); + mutex_destroy(&md->swap_bios_lock); + + dm_mq_cleanup_mapped_device(md); +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_dev(int minor) +{ + int r, numa_node_id = dm_get_numa_node(); + struct mapped_device *md; + void *old_md; + + md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); + if (!md) { + DMERR("unable to allocate device, out of memory."); + return NULL; + } + + if (!try_module_get(THIS_MODULE)) + goto bad_module_get; + + /* get a minor number for the dev */ + if (minor == DM_ANY_MINOR) + r = next_free_minor(&minor); + else + r = specific_minor(minor); + if (r < 0) + goto bad_minor; + + r = init_srcu_struct(&md->io_barrier); + if (r < 0) + goto bad_io_barrier; + + md->numa_node_id = numa_node_id; + md->init_tio_pdu = false; + md->type = DM_TYPE_NONE; + mutex_init(&md->suspend_lock); + mutex_init(&md->type_lock); + mutex_init(&md->table_devices_lock); + spin_lock_init(&md->deferred_lock); + atomic_set(&md->holders, 1); + atomic_set(&md->open_count, 0); + atomic_set(&md->event_nr, 0); + atomic_set(&md->uevent_seq, 0); + INIT_LIST_HEAD(&md->uevent_list); + INIT_LIST_HEAD(&md->table_devices); + spin_lock_init(&md->uevent_lock); + + /* + * default to bio-based until DM table is loaded and md->type + * established. If request-based table is loaded: blk-mq will + * override accordingly. + */ + md->disk = blk_alloc_disk(md->numa_node_id); + if (!md->disk) + goto bad; + md->queue = md->disk->queue; + + init_waitqueue_head(&md->wait); + INIT_WORK(&md->work, dm_wq_work); + INIT_WORK(&md->requeue_work, dm_wq_requeue_work); + init_waitqueue_head(&md->eventq); + init_completion(&md->kobj_holder.completion); + + md->requeue_list = NULL; + md->swap_bios = get_swap_bios(); + sema_init(&md->swap_bios_semaphore, md->swap_bios); + mutex_init(&md->swap_bios_lock); + + md->disk->major = _major; + md->disk->first_minor = minor; + md->disk->minors = 1; + md->disk->flags |= GENHD_FL_NO_PART; + md->disk->fops = &dm_blk_dops; + md->disk->private_data = md; + sprintf(md->disk->disk_name, "dm-%d", minor); + + if (IS_ENABLED(CONFIG_FS_DAX)) { + md->dax_dev = alloc_dax(md, &dm_dax_ops); + if (IS_ERR(md->dax_dev)) { + md->dax_dev = NULL; + goto bad; + } + set_dax_nocache(md->dax_dev); + set_dax_nomc(md->dax_dev); + if (dax_add_host(md->dax_dev, md->disk)) + goto bad; + } + + format_dev_t(md->name, MKDEV(_major, minor)); + + md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); + if (!md->wq) + goto bad; + + md->pending_io = alloc_percpu(unsigned long); + if (!md->pending_io) + goto bad; + + r = dm_stats_init(&md->stats); + if (r < 0) + goto bad; + + /* Populate the mapping, nobody knows we exist yet */ + spin_lock(&_minor_lock); + old_md = idr_replace(&_minor_idr, md, minor); + spin_unlock(&_minor_lock); + + BUG_ON(old_md != MINOR_ALLOCED); + + return md; + +bad: + cleanup_mapped_device(md); +bad_io_barrier: + free_minor(minor); +bad_minor: + module_put(THIS_MODULE); +bad_module_get: + kvfree(md); + return NULL; +} + +static void unlock_fs(struct mapped_device *md); + +static void free_dev(struct mapped_device *md) +{ + int minor = MINOR(disk_devt(md->disk)); + + unlock_fs(md); + + cleanup_mapped_device(md); + + free_table_devices(&md->table_devices); + dm_stats_cleanup(&md->stats); + free_minor(minor); + + module_put(THIS_MODULE); + kvfree(md); +} + +/* + * Bind a table to the device. + */ +static void event_callback(void *context) +{ + unsigned long flags; + LIST_HEAD(uevents); + struct mapped_device *md = (struct mapped_device *) context; + + spin_lock_irqsave(&md->uevent_lock, flags); + list_splice_init(&md->uevent_list, &uevents); + spin_unlock_irqrestore(&md->uevent_lock, flags); + + dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); + + atomic_inc(&md->event_nr); + wake_up(&md->eventq); + dm_issue_global_event(); +} + +/* + * Returns old map, which caller must destroy. + */ +static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) +{ + struct dm_table *old_map; + sector_t size; + int ret; + + lockdep_assert_held(&md->suspend_lock); + + size = dm_table_get_size(t); + + /* + * Wipe any geometry if the size of the table changed. + */ + if (size != dm_get_size(md)) + memset(&md->geometry, 0, sizeof(md->geometry)); + + set_capacity(md->disk, size); + + dm_table_event_callback(t, event_callback, md); + + if (dm_table_request_based(t)) { + /* + * Leverage the fact that request-based DM targets are + * immutable singletons - used to optimize dm_mq_queue_rq. + */ + md->immutable_target = dm_table_get_immutable_target(t); + + /* + * There is no need to reload with request-based dm because the + * size of front_pad doesn't change. + * + * Note for future: If you are to reload bioset, prep-ed + * requests in the queue may refer to bio from the old bioset, + * so you must walk through the queue to unprep. + */ + if (!md->mempools) { + md->mempools = t->mempools; + t->mempools = NULL; + } + } else { + /* + * The md may already have mempools that need changing. + * If so, reload bioset because front_pad may have changed + * because a different table was loaded. + */ + dm_free_md_mempools(md->mempools); + md->mempools = t->mempools; + t->mempools = NULL; + } + + ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + old_map = ERR_PTR(ret); + goto out; + } + + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + rcu_assign_pointer(md->map, (void *)t); + md->immutable_target_type = dm_table_get_immutable_target_type(t); + + if (old_map) + dm_sync_table(md); +out: + return old_map; +} + +/* + * Returns unbound table for the caller to free. + */ +static struct dm_table *__unbind(struct mapped_device *md) +{ + struct dm_table *map = rcu_dereference_protected(md->map, 1); + + if (!map) + return NULL; + + dm_table_event_callback(map, NULL, NULL); + RCU_INIT_POINTER(md->map, NULL); + dm_sync_table(md); + + return map; +} + +/* + * Constructor for a new device. + */ +int dm_create(int minor, struct mapped_device **result) +{ + struct mapped_device *md; + + md = alloc_dev(minor); + if (!md) + return -ENXIO; + + dm_ima_reset_data(md); + + *result = md; + return 0; +} + +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) +{ + BUG_ON(!mutex_is_locked(&md->type_lock)); + md->type = type; +} + +enum dm_queue_mode dm_get_md_type(struct mapped_device *md) +{ + return md->type; +} + +struct target_type *dm_get_immutable_target_type(struct mapped_device *md) +{ + return md->immutable_target_type; +} + +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) +{ + enum dm_queue_mode type = dm_table_get_type(t); + struct queue_limits limits; + struct table_device *td; + int r; + + switch (type) { + case DM_TYPE_REQUEST_BASED: + md->disk->fops = &dm_rq_blk_dops; + r = dm_mq_init_request_queue(md, t); + if (r) { + DMERR("Cannot initialize queue for request-based dm mapped device"); + return r; + } + break; + case DM_TYPE_BIO_BASED: + case DM_TYPE_DAX_BIO_BASED: + break; + case DM_TYPE_NONE: + WARN_ON_ONCE(true); + break; + } + + r = dm_calculate_queue_limits(t, &limits); + if (r) { + DMERR("Cannot calculate initial queue limits"); + return r; + } + r = dm_table_set_restrictions(t, md->queue, &limits); + if (r) + return r; + + /* + * Hold lock to make sure add_disk() and del_gendisk() won't concurrent + * with open_table_device() and close_table_device(). + */ + mutex_lock(&md->table_devices_lock); + r = add_disk(md->disk); + mutex_unlock(&md->table_devices_lock); + if (r) + return r; + + /* + * Register the holder relationship for devices added before the disk + * was live. + */ + list_for_each_entry(td, &md->table_devices, list) { + r = bd_link_disk_holder(td->dm_dev.bdev, md->disk); + if (r) + goto out_undo_holders; + } + + r = dm_sysfs_init(md); + if (r) + goto out_undo_holders; + + md->type = type; + return 0; + +out_undo_holders: + list_for_each_entry_continue_reverse(td, &md->table_devices, list) + bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); + mutex_lock(&md->table_devices_lock); + del_gendisk(md->disk); + mutex_unlock(&md->table_devices_lock); + return r; +} + +struct mapped_device *dm_get_md(dev_t dev) +{ + struct mapped_device *md; + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) + return NULL; + + spin_lock(&_minor_lock); + + md = idr_find(&_minor_idr, minor); + if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } + dm_get(md); +out: + spin_unlock(&_minor_lock); + + return md; +} +EXPORT_SYMBOL_GPL(dm_get_md); + +void *dm_get_mdptr(struct mapped_device *md) +{ + return md->interface_ptr; +} + +void dm_set_mdptr(struct mapped_device *md, void *ptr) +{ + md->interface_ptr = ptr; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); +} + +int dm_hold(struct mapped_device *md) +{ + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags)) { + spin_unlock(&_minor_lock); + return -EBUSY; + } + dm_get(md); + spin_unlock(&_minor_lock); + return 0; +} +EXPORT_SYMBOL_GPL(dm_hold); + +const char *dm_device_name(struct mapped_device *md) +{ + return md->name; +} +EXPORT_SYMBOL_GPL(dm_device_name); + +static void __dm_destroy(struct mapped_device *md, bool wait) +{ + struct dm_table *map; + int srcu_idx; + + might_sleep(); + + spin_lock(&_minor_lock); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + blk_mark_disk_dead(md->disk); + + /* + * Take suspend_lock so that presuspend and postsuspend methods + * do not race with internal suspend. + */ + mutex_lock(&md->suspend_lock); + map = dm_get_live_table(md, &srcu_idx); + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + set_bit(DMF_SUSPENDED, &md->flags); + set_bit(DMF_POST_SUSPENDING, &md->flags); + dm_table_postsuspend_targets(map); + } + /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ + dm_put_live_table(md, srcu_idx); + mutex_unlock(&md->suspend_lock); + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); +} +EXPORT_SYMBOL_GPL(dm_put); + +static bool dm_in_flight_bios(struct mapped_device *md) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) + sum += *per_cpu_ptr(md->pending_io, cpu); + + return sum != 0; +} + +static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) +{ + int r = 0; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(&md->wait, &wait, task_state); + + if (!dm_in_flight_bios(md)) + break; + + if (signal_pending_state(task_state, current)) { + r = -EINTR; + break; + } + + io_schedule(); + } + finish_wait(&md->wait, &wait); + + smp_rmb(); + + return r; +} + +static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) +{ + int r = 0; + + if (!queue_is_mq(md->queue)) + return dm_wait_for_bios_completion(md, task_state); + + while (true) { + if (!blk_mq_queue_inflight(md->queue)) + break; + + if (signal_pending_state(task_state, current)) { + r = -EINTR; + break; + } + + msleep(5); + } + + return r; +} + +/* + * Process the deferred bios + */ +static void dm_wq_work(struct work_struct *work) +{ + struct mapped_device *md = container_of(work, struct mapped_device, work); + struct bio *bio; + + while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + spin_lock_irq(&md->deferred_lock); + bio = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); + + if (!bio) + break; + + submit_bio_noacct(bio); + cond_resched(); + } +} + +static void dm_queue_flush(struct mapped_device *md) +{ + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + smp_mb__after_atomic(); + queue_work(md->wq, &md->work); +} + +/* + * Swap in a new table, returning the old one for the caller to destroy. + */ +struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); + struct queue_limits limits; + int r; + + mutex_lock(&md->suspend_lock); + + /* device must be suspended */ + if (!dm_suspended_md(md)) + goto out; + + /* + * If the new table has no data devices, retain the existing limits. + * This helps multipath with queue_if_no_path if all paths disappear, + * then new I/O is queued based on these limits, and then some paths + * reappear. + */ + if (dm_table_has_no_data_devices(table)) { + live_map = dm_get_live_table_fast(md); + if (live_map) + limits = md->queue->limits; + dm_put_live_table_fast(md); + } + + if (!live_map) { + r = dm_calculate_queue_limits(table, &limits); + if (r) { + map = ERR_PTR(r); + goto out; + } + } + + map = __bind(md, table, &limits); + dm_issue_global_event(); + +out: + mutex_unlock(&md->suspend_lock); + return map; +} + +/* + * Functions to lock and unlock any filesystem running on the + * device. + */ +static int lock_fs(struct mapped_device *md) +{ + int r; + + WARN_ON(test_bit(DMF_FROZEN, &md->flags)); + + r = freeze_bdev(md->disk->part0); + if (!r) + set_bit(DMF_FROZEN, &md->flags); + return r; +} + +static void unlock_fs(struct mapped_device *md) +{ + if (!test_bit(DMF_FROZEN, &md->flags)) + return; + thaw_bdev(md->disk->part0); + clear_bit(DMF_FROZEN, &md->flags); +} + +/* + * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG + * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE + * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY + * + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. + */ +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned int suspend_flags, unsigned int task_state, + int dmf_suspended_flag) +{ + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; + + lockdep_assert_held(&md->suspend_lock); + + /* + * DMF_NOFLUSH_SUSPENDING must be set before presuspend. + * This flag is cleared before dm_suspend returns. + */ + if (noflush) + set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + else + DMDEBUG("%s: suspending with flush", dm_device_name(md)); + + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ + dm_table_presuspend_targets(map); + + /* + * Flush I/O to the device. + * Any I/O submitted after lock_fs() may not be flushed. + * noflush takes precedence over do_lockfs. + * (lock_fs() flushes I/Os and waits for them to complete.) + */ + if (!noflush && do_lockfs) { + r = lock_fs(md); + if (r) { + dm_table_presuspend_undo_targets(map); + return r; + } + } + + /* + * Here we must make sure that no processes are submitting requests + * to target drivers i.e. no one may be executing + * dm_split_and_process_bio from dm_submit_bio. + * + * To get all processes out of dm_split_and_process_bio in dm_submit_bio, + * we take the write lock. To prevent any process from reentering + * dm_split_and_process_bio from dm_submit_bio and quiesce the thread + * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). + */ + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + if (map) + synchronize_srcu(&md->io_barrier); + + /* + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. + */ + if (dm_request_based(md)) + dm_stop_queue(md->queue); + + flush_workqueue(md->wq); + + /* + * At this point no more requests are entering target request routines. + * We call dm_wait_for_completion to wait for all existing requests + * to finish. + */ + r = dm_wait_for_completion(md, task_state); + if (!r) + set_bit(dmf_suspended_flag, &md->flags); + + if (noflush) + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + if (map) + synchronize_srcu(&md->io_barrier); + + /* were we interrupted ? */ + if (r < 0) { + dm_queue_flush(md); + + if (dm_request_based(md)) + dm_start_queue(md->queue); + + unlock_fs(md); + dm_table_presuspend_undo_targets(map); + /* pushback list is already flushed, so skip flush */ + } + + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ +int dm_suspend(struct mapped_device *md, unsigned int suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map) { + /* avoid deadlock with fs/namespace.c:do_mount() */ + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + } + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); + if (r) + goto out_unlock; + + set_bit(DMF_POST_SUSPENDING, &md->flags); + dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); + +out_unlock: + mutex_unlock(&md->suspend_lock); + return r; +} + +static int __dm_resume(struct mapped_device *md, struct dm_table *map) +{ + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } + + dm_queue_flush(md); + + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. + */ + if (dm_request_based(md)) + dm_start_queue(md->queue); + + unlock_fs(md); + + return 0; +} + +int dm_resume(struct mapped_device *md) +{ + int r; + struct dm_table *map = NULL; + +retry: + r = -EINVAL; + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (!dm_suspended_md(md)) + goto out; + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map || !dm_table_get_size(map)) + goto out; + + r = __dm_resume(md, map); + if (r) + goto out; + + clear_bit(DMF_SUSPENDED, &md->flags); +out: + mutex_unlock(&md->suspend_lock); + + return r; +} + +/* + * Internal suspend/resume works like userspace-driven suspend. It waits + * until all bios finish and prevents issuing new bios to the target drivers. + * It may be used only from the kernel. + */ + +static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend_flags) +{ + struct dm_table *map = NULL; + + lockdep_assert_held(&md->suspend_lock); + + if (md->internal_suspend_count++) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. + */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, + DMF_SUSPENDED_INTERNALLY); + + set_bit(DMF_POST_SUSPENDING, &md->flags); + dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + BUG_ON(!md->internal_suspend_count); + + if (--md->internal_suspend_count) + return; /* resume from nested internal suspend */ + + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. + */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) + return; + + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + synchronize_srcu(&md->io_barrier); + flush_workqueue(md->wq); + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); + +void dm_internal_resume_fast(struct mapped_device *md) +{ + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) + goto done; + + dm_queue_flush(md); + +done: + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume_fast); + +/*----------------------------------------------------------------- + * Event notification. + *---------------------------------------------------------------*/ +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned int cookie, bool need_resize_uevent) +{ + int r; + unsigned int noio_flag; + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[3] = { NULL, NULL, NULL }; + char **envpp = envp; + if (cookie) { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + *envpp++ = udev_cookie; + } + if (need_resize_uevent) { + *envpp++ = "RESIZE=1"; + } + + noio_flag = memalloc_noio_save(); + + r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); + + memalloc_noio_restore(noio_flag); + + return r; +} + +uint32_t dm_next_uevent_seq(struct mapped_device *md) +{ + return atomic_add_return(1, &md->uevent_seq); +} + +uint32_t dm_get_event_nr(struct mapped_device *md) +{ + return atomic_read(&md->event_nr); +} + +int dm_wait_event(struct mapped_device *md, int event_nr) +{ + return wait_event_interruptible(md->eventq, + (event_nr != atomic_read(&md->event_nr))); +} + +void dm_uevent_add(struct mapped_device *md, struct list_head *elist) +{ + unsigned long flags; + + spin_lock_irqsave(&md->uevent_lock, flags); + list_add(elist, &md->uevent_list); + spin_unlock_irqrestore(&md->uevent_lock, flags); +} + +/* + * The gendisk is only valid as long as you have a reference + * count on 'md'. + */ +struct gendisk *dm_disk(struct mapped_device *md) +{ + return md->disk; +} +EXPORT_SYMBOL_GPL(dm_disk); + +struct kobject *dm_kobject(struct mapped_device *md) +{ + return &md->kobj_holder.kobj; +} + +struct mapped_device *dm_get_from_kobject(struct kobject *kobj) +{ + struct mapped_device *md; + + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); + + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } + dm_get(md); +out: + spin_unlock(&_minor_lock); + + return md; +} + +int dm_suspended_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +static int dm_post_suspending_md(struct mapped_device *md) +{ + return test_bit(DMF_POST_SUSPENDING, &md->flags); +} + +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + +int dm_test_deferred_remove_flag(struct mapped_device *md) +{ + return test_bit(DMF_DEFERRED_REMOVE, &md->flags); +} + +int dm_suspended(struct dm_target *ti) +{ + return dm_suspended_md(ti->table->md); +} +EXPORT_SYMBOL_GPL(dm_suspended); + +int dm_post_suspending(struct dm_target *ti) +{ + return dm_post_suspending_md(ti->table->md); +} +EXPORT_SYMBOL_GPL(dm_post_suspending); + +int dm_noflush_suspending(struct dm_target *ti) +{ + return __noflush_suspending(ti->table->md); +} +EXPORT_SYMBOL_GPL(dm_noflush_suspending); + +void dm_free_md_mempools(struct dm_md_mempools *pools) +{ + if (!pools) + return; + + bioset_exit(&pools->bs); + bioset_exit(&pools->io_bs); + + kfree(pools); +} + +struct dm_pr { + u64 old_key; + u64 new_key; + u32 flags; + bool abort; + bool fail_early; + int ret; + enum pr_type type; +}; + +static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, + struct dm_pr *pr) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + struct dm_table *table; + struct dm_target *ti; + int ret = -ENOTTY, srcu_idx; + + table = dm_get_live_table(md, &srcu_idx); + if (!table || !dm_table_get_size(table)) + goto out; + + /* We only support devices that have a single target */ + if (table->num_targets != 1) + goto out; + ti = dm_table_get_target(table, 0); + + if (dm_suspended_md(md)) { + ret = -EAGAIN; + goto out; + } + + ret = -EINVAL; + if (!ti->type->iterate_devices) + goto out; + + ti->type->iterate_devices(ti, fn, pr); + ret = 0; +out: + dm_put_live_table(md, srcu_idx); + return ret; +} + +/* + * For register / unregister we need to manually call out to every path. + */ +static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + int ret; + + if (!ops || !ops->pr_register) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); + if (!ret) + return 0; + + if (!pr->ret) + pr->ret = ret; + + if (pr->fail_early) + return -1; + + return 0; +} + +static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, + u32 flags) +{ + struct dm_pr pr = { + .old_key = old_key, + .new_key = new_key, + .flags = flags, + .fail_early = true, + .ret = 0, + }; + int ret; + + ret = dm_call_pr(bdev, __dm_pr_register, &pr); + if (ret) { + /* Didn't even get to register a path */ + return ret; + } + + if (!pr.ret) + return 0; + ret = pr.ret; + + if (!new_key) + return ret; + + /* unregister all paths if we failed to register any path */ + pr.old_key = new_key; + pr.new_key = 0; + pr.flags = 0; + pr.fail_early = false; + (void) dm_call_pr(bdev, __dm_pr_register, &pr); + return ret; +} + + +static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_reserve) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags); + if (!pr->ret) + return -1; + + return 0; +} + +static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, + u32 flags) +{ + struct dm_pr pr = { + .old_key = key, + .flags = flags, + .type = type, + .fail_early = false, + .ret = 0, + }; + int ret; + + ret = dm_call_pr(bdev, __dm_pr_reserve, &pr); + if (ret) + return ret; + + return pr.ret; +} + +/* + * If there is a non-All Registrants type of reservation, the release must be + * sent down the holding path. For the cases where there is no reservation or + * the path is not the holder the device will also return success, so we must + * try each path to make sure we got the correct path. + */ +static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_release) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type); + if (pr->ret) + return -1; + + return 0; +} + +static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + struct dm_pr pr = { + .old_key = key, + .type = type, + .fail_early = false, + }; + int ret; + + ret = dm_call_pr(bdev, __dm_pr_release, &pr); + if (ret) + return ret; + + return pr.ret; +} + +static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_preempt) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type, + pr->abort); + if (!pr->ret) + return -1; + + return 0; +} + +static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, + enum pr_type type, bool abort) +{ + struct dm_pr pr = { + .new_key = new_key, + .old_key = old_key, + .type = type, + .fail_early = false, + }; + int ret; + + ret = dm_call_pr(bdev, __dm_pr_preempt, &pr); + if (ret) + return ret; + + return pr.ret; +} + +static int dm_pr_clear(struct block_device *bdev, u64 key) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + int r, srcu_idx; + + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + if (r < 0) + goto out; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_clear) + r = ops->pr_clear(bdev, key); + else + r = -EOPNOTSUPP; +out: + dm_unprepare_ioctl(md, srcu_idx); + return r; +} + +static const struct pr_ops dm_pr_ops = { + .pr_register = dm_pr_register, + .pr_reserve = dm_pr_reserve, + .pr_release = dm_pr_release, + .pr_preempt = dm_pr_preempt, + .pr_clear = dm_pr_clear, +}; + +static const struct block_device_operations dm_blk_dops = { + .submit_bio = dm_submit_bio, + .poll_bio = dm_poll_bio, + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .getgeo = dm_blk_getgeo, + .report_zones = dm_blk_report_zones, + .pr_ops = &dm_pr_ops, + .owner = THIS_MODULE +}; + +static const struct block_device_operations dm_rq_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .getgeo = dm_blk_getgeo, + .pr_ops = &dm_pr_ops, + .owner = THIS_MODULE +}; + +static const struct dax_operations dm_dax_ops = { + .direct_access = dm_dax_direct_access, + .zero_page_range = dm_dax_zero_page_range, + .recovery_write = dm_dax_recovery_write, +}; + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +module_param(major, uint, 0); +MODULE_PARM_DESC(major, "The major number of the device mapper"); + +module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); + +module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); + +module_param(swap_bios, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); + +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h new file mode 100644 index 000000000..a7917df09 --- /dev/null +++ b/drivers/md/dm.h @@ -0,0 +1,227 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-stats.h" + +/* + * Suspend feature flags + */ +#define DM_SUSPEND_LOCKFS_FLAG (1 << 0) +#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) + +/* + * Status feature flags + */ +#define DM_STATUS_NOFLUSH_FLAG (1 << 0) + +/* + * List of devices that a metadevice uses and should open/close. + */ +struct dm_dev_internal { + struct list_head list; + refcount_t count; + struct dm_dev *dm_dev; +}; + +struct dm_table; +struct dm_md_mempools; +struct dm_target_io; +struct dm_io; + +/*----------------------------------------------------------------- + * Internal table functions. + *---------------------------------------------------------------*/ +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +bool dm_table_has_no_data_devices(struct dm_table *table); +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits); +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); +struct list_head *dm_table_get_devices(struct dm_table *t); +void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_presuspend_undo_targets(struct dm_table *t); +void dm_table_postsuspend_targets(struct dm_table *t); +int dm_table_resume_targets(struct dm_table *t); +enum dm_queue_mode dm_table_get_type(struct dm_table *t); +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +struct dm_target *dm_table_get_immutable_target(struct dm_table *t); +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); +bool dm_table_bio_based(struct dm_table *t); +bool dm_table_request_based(struct dm_table *t); + +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type); +enum dm_queue_mode dm_get_md_type(struct mapped_device *md); +struct target_type *dm_get_immutable_target_type(struct mapped_device *md); + +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); + +/* + * To check whether the target type is bio-based or not (request-based). + */ +#define dm_target_bio_based(t) ((t)->type->map != NULL) + +/* + * To check whether the target type is request-based or not (bio-based). + */ +#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL) + +/* + * To check whether the target type is a hybrid (capable of being + * either request-based or bio-based). + */ +#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t)) + +/* + * Zoned targets related functions. + */ +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +void dm_zone_endio(struct dm_io *io, struct bio *clone); +#ifdef CONFIG_BLK_DEV_ZONED +void dm_cleanup_zoned_dev(struct mapped_device *md); +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); +int dm_zone_map_bio(struct dm_target_io *io); +#else +static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} +#define dm_blk_report_zones NULL +static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + return false; +} +static inline int dm_zone_map_bio(struct dm_target_io *tio) +{ + return DM_MAPIO_KILL; +} +#endif + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *tt); +int dm_target_iterate(void (*iter_func)(struct target_type *tt, + void *param), void *param); + +int dm_split_args(int *argc, char ***argvp, char *input); + +/* + * Is this mapped_device being deleted? + */ +int dm_deleting_md(struct mapped_device *md); + +/* + * Is this mapped_device suspended? + */ +int dm_suspended_md(struct mapped_device *md); + +/* + * Internal suspend and resume methods. + */ +int dm_suspended_internally_md(struct mapped_device *md); +void dm_internal_suspend_fast(struct mapped_device *md); +void dm_internal_resume_fast(struct mapped_device *md); +void dm_internal_suspend_noflush(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + +/* + * Test if the device is scheduled for deferred remove. + */ +int dm_test_deferred_remove_flag(struct mapped_device *md); + +/* + * Try to remove devices marked for deferred removal. + */ +void dm_deferred_remove(void); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. + */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * sysfs interface + */ +int dm_sysfs_init(struct mapped_device *md); +void dm_sysfs_exit(struct mapped_device *md); +struct kobject *dm_kobject(struct mapped_device *md); +struct mapped_device *dm_get_from_kobject(struct kobject *kobj); + +/* + * The kobject helper + */ +void dm_kobject_release(struct kobject *kobj); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); +int dm_open_count(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); +int dm_cancel_deferred_remove(struct mapped_device *md); +int dm_request_based(struct mapped_device *md); +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result); +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); + +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned int cookie, bool need_resize_uevent); + +void dm_internal_suspend(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + +int dm_io_init(void); +void dm_io_exit(void); + +int dm_kcopyd_init(void); +void dm_kcopyd_exit(void); + +/* + * Mempool operations + */ +void dm_free_md_mempools(struct dm_md_mempools *pools); + +/* + * Various helpers + */ +unsigned int dm_get_reserved_bio_based_ios(void); + +#endif diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c new file mode 100644 index 000000000..91836e6de --- /dev/null +++ b/drivers/md/md-autodetect.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md.h" + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. + */ + +#ifdef CONFIG_MD_AUTODETECT +static int __initdata raid_noautodetect; +#else +static int __initdata raid_noautodetect=1; +#endif +static int __initdata raid_autopart; + +static struct md_setup_args { + int minor; + int partitioned; + int level; + int chunk; + char *device_names; +} md_setup_args[256] __initdata; + +static int md_setup_ents __initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistent-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int __init md_setup(char *str) +{ + int minor, level, factor, fault, partitioned = 0; + char *pername = ""; + char *str1; + int ent; + + if (*str == 'd') { + partitioned = 1; + str++; + } + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + str1 = str; + for (ent=0 ; ent< md_setup_ents ; ent++) + if (md_setup_args[ent].minor == minor && + md_setup_args[ent].partitioned == partitioned) { + printk(KERN_WARNING "md: md=%s%d, Specified more than once. " + "Replacing previous definition.\n", partitioned?"d":"", minor); + break; + } + if (ent >= ARRAY_SIZE(md_setup_args)) { + printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); + return 0; + } + if (ent >= md_setup_ents) + md_setup_ents++; + switch (get_option(&str, &level)) { /* RAID level */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == LEVEL_LINEAR) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args[ent].level = level; + md_setup_args[ent].chunk = 1 << (factor+12); + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; + break; + } + fallthrough; + case 1: /* the first device is numeric */ + str = str1; + fallthrough; + case 0: + md_setup_args[ent].level = LEVEL_NONE; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args[ent].device_names = str; + md_setup_args[ent].partitioned = partitioned; + md_setup_args[ent].minor = minor; + + return 1; +} + +static void __init md_setup_drive(struct md_setup_args *args) +{ + char *devname = args->device_names; + dev_t devices[MD_SB_DISKS + 1], mdev; + struct mdu_array_info_s ainfo = { }; + struct mddev *mddev; + int err = 0, i; + char name[16]; + + if (args->partitioned) { + mdev = MKDEV(mdp_major, args->minor << MdpMinorShift); + sprintf(name, "md_d%d", args->minor); + } else { + mdev = MKDEV(MD_MAJOR, args->minor); + sprintf(name, "md%d", args->minor); + } + + for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { + struct kstat stat; + char *p; + char comp_name[64]; + dev_t dev; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_dev_t(devname); + if (strncmp(devname, "/dev/", 5) == 0) + devname += 5; + snprintf(comp_name, 63, "/dev/%s", devname); + if (init_stat(comp_name, &stat, 0) == 0 && S_ISBLK(stat.mode)) + dev = new_decode_dev(stat.rdev); + if (!dev) { + pr_warn("md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + devname = p; + } + devices[i] = 0; + + if (!i) + return; + + pr_info("md: Loading %s: %s\n", name, args->device_names); + + mddev = md_alloc(mdev, name); + if (IS_ERR(mddev)) { + pr_err("md: md_alloc failed - cannot start array %s\n", name); + return; + } + + err = mddev_lock(mddev); + if (err) { + pr_err("md: failed to lock array %s\n", name); + goto out_mddev_put; + } + + if (!list_empty(&mddev->disks) || mddev->raid_disks) { + pr_warn("md: Ignoring %s, already autodetected. (Use raid=noautodetect)\n", + name); + goto out_unlock; + } + + if (args->level != LEVEL_NONE) { + /* non-persistent */ + ainfo.level = args->level; + ainfo.md_minor = args->minor; + ainfo.not_persistent = 1; + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.chunk_size = args->chunk; + while (devices[ainfo.raid_disks]) + ainfo.raid_disks++; + } + + err = md_set_array_info(mddev, &ainfo); + + for (i = 0; i <= MD_SB_DISKS && devices[i]; i++) { + struct mdu_disk_info_s dinfo = { + .major = MAJOR(devices[i]), + .minor = MINOR(devices[i]), + }; + + if (args->level != LEVEL_NONE) { + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = + (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC); + } + + md_add_new_disk(mddev, &dinfo); + } + + if (!err) + err = do_md_run(mddev); + if (err) + pr_warn("md: starting %s failed\n", name); +out_unlock: + mddev_unlock(mddev); +out_mddev_put: + mddev_put(mddev); +} + +static int __init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = (comma-str)-pos; + else wlen = (len-1)-pos; + + if (!strncmp(str, "noautodetect", wlen)) + raid_noautodetect = 1; + if (!strncmp(str, "autodetect", wlen)) + raid_noautodetect = 0; + if (strncmp(str, "partitionable", wlen)==0) + raid_autopart = 1; + if (strncmp(str, "part", wlen)==0) + raid_autopart = 1; + pos += wlen+1; + } + return 1; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +static void __init autodetect_raid(void) +{ + /* + * Since we don't want to detect and use half a raid array, we need to + * wait for the known devices to complete their probing + */ + printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); + printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); + + wait_for_device_probe(); + md_autostart_arrays(raid_autopart); +} + +void __init md_run_setup(void) +{ + int ent; + + if (raid_noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); + else + autodetect_raid(); + + for (ent = 0; ent < md_setup_ents; ent++) + md_setup_drive(&md_setup_args[ent]); +} diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c new file mode 100644 index 000000000..5200bba63 --- /dev/null +++ b/drivers/md/md-bitmap.c @@ -0,0 +1,2653 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * bitmap_create - sets up the bitmap structure + * bitmap_destroy - destroys the bitmap structure + * + * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: + * - added disk storage for bitmap + * - changes to allow various bitmap chunk sizes + */ + +/* + * Still to do: + * + * flush after percent set rather than just time based. (maybe both). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "md-bitmap.h" + +static inline char *bmname(struct bitmap *bitmap) +{ + return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; +} + +/* + * check a page and, if necessary, allocate it (or hijack it if the alloc fails) + * + * 1) check to see if this page is allocated, if it's not then try to alloc + * 2) if the alloc fails, set the page's hijacked flag so we'll use the + * page pointer directly as a counter + * + * if we find our page, we increment the page's refcount so that it stays + * allocated while we're using it + */ +static int md_bitmap_checkpage(struct bitmap_counts *bitmap, + unsigned long page, int create, int no_hijack) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + unsigned char *mappage; + + WARN_ON_ONCE(page >= bitmap->pages); + if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ + return 0; + + if (bitmap->bp[page].map) /* page is already allocated, just return */ + return 0; + + if (!create) + return -ENOENT; + + /* this page has not been allocated yet */ + + spin_unlock_irq(&bitmap->lock); + /* It is possible that this is being called inside a + * prepare_to_wait/finish_wait loop from raid5c:make_request(). + * In general it is not permitted to sleep in that context as it + * can cause the loop to spin freely. + * That doesn't apply here as we can only reach this point + * once with any loop. + * When this function completes, either bp[page].map or + * bp[page].hijacked. In either case, this function will + * abort before getting to this point again. So there is + * no risk of a free-spin, and so it is safe to assert + * that sleeping here is allowed. + */ + sched_annotate_sleep(); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { + pr_debug("md/bitmap: map page allocation failed, hijacking\n"); + /* We don't support hijack for cluster raid */ + if (no_hijack) + return -ENOMEM; + /* failed - set the hijacked flag so that we can use the + * pointer as a counter */ + if (!bitmap->bp[page].map) + bitmap->bp[page].hijacked = 1; + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { + /* somebody beat us to getting the page */ + kfree(mappage); + } else { + + /* no page was in place and we have one, so install it */ + + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; + } + return 0; +} + +/* if page is completely empty, put it back on the free list, or dealloc it */ +/* if page was hijacked, unmark the flag so it might get alloced next time */ +/* Note: lock should be held when calling this */ +static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) +{ + char *ptr; + + if (bitmap->bp[page].count) /* page is still busy */ + return; + + /* page is no longer in use, it can be released */ + + if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ + bitmap->bp[page].hijacked = 0; + bitmap->bp[page].map = NULL; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + kfree(ptr); + } +} + +/* + * bitmap file handling - read and write the bitmap file and its superblock + */ + +/* + * basic page I/O operations + */ + +/* IO operations when bitmap is stored near all superblocks */ +static int read_sb_page(struct mddev *mddev, loff_t offset, + struct page *page, + unsigned long index, int size) +{ + /* choose a good rdev and read the page from there */ + + struct md_rdev *rdev; + sector_t target; + + rdev_for_each(rdev, mddev) { + if (! test_bit(In_sync, &rdev->flags) + || test_bit(Faulty, &rdev->flags) + || test_bit(Bitmap_sync, &rdev->flags)) + continue; + + target = offset + index * (PAGE_SIZE/512); + + if (sync_page_io(rdev, target, + roundup(size, bdev_logical_block_size(rdev->bdev)), + page, REQ_OP_READ, true)) { + page->index = index; + return 0; + } + } + return -EIO; +} + +static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + /* Iterate the disks of an mddev, using rcu to protect access to the + * linked list, and raising the refcount of devices we return to ensure + * they don't disappear while in use. + * As devices are only added or removed when raid_disk is < 0 and + * nr_pending is 0 and In_sync is clear, the entries we return will + * still be in the same position on the list when we re-enter + * list_for_each_entry_continue_rcu. + * + * Note that if entered with 'rdev == NULL' to start at the + * beginning, we temporarily assign 'rdev' to an address which + * isn't really an rdev, but which can be used by + * list_for_each_entry_continue_rcu() to find the first entry. + */ + rcu_read_lock(); + if (rdev == NULL) + /* start at the beginning */ + rdev = list_entry(&mddev->disks, struct md_rdev, same_set); + else { + /* release the previous rdev and start from there. */ + rdev_dec_pending(rdev, mddev); + } + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* this is a usable devices */ + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + +static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct md_rdev *rdev; + struct block_device *bdev; + struct mddev *mddev = bitmap->mddev; + struct bitmap_storage *store = &bitmap->storage; + +restart: + rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + + if (page->index == store->file_pages-1) { + int last_page_size = store->bytes & (PAGE_SIZE-1); + if (last_page_size == 0) + last_page_size = PAGE_SIZE; + size = roundup(last_page_size, + bdev_logical_block_size(bdev)); + } + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); + } + + if (wait && md_super_wait(mddev) < 0) + goto restart; + return 0; + + bad_alignment: + return -EINVAL; +} + +static void md_bitmap_file_kick(struct bitmap *bitmap); +/* + * write out a page to a file + */ +static void write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh; + + if (bitmap->storage.file == NULL) { + switch (write_sb_page(bitmap, page, wait)) { + case -EINVAL: + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + } + } else { + + bh = page_buffers(page); + + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; + } + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + } + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + md_bitmap_file_kick(bitmap); +} + +static void end_bitmap_write(struct buffer_head *bh, int uptodate) +{ + struct bitmap *bitmap = bh->b_private; + + if (!uptodate) + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + if (atomic_dec_and_test(&bitmap->pending_writes)) + wake_up(&bitmap->write_wait); +} + +static void free_buffers(struct page *page) +{ + struct buffer_head *bh; + + if (!PagePrivate(page)) + return; + + bh = page_buffers(page); + while (bh) { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } + detach_page_private(page); + put_page(page); +} + +/* read a page from a file. + * We both read the page, and attach buffers to the page to record the + * address of each block (using bmap). These addresses will be used + * to write the block later, completely bypassing the filesystem. + * This usage is similar to how swap files are handled, and allows us + * to write to a file with no concerns of memory allocation failing. + */ +static int read_page(struct file *file, unsigned long index, + struct bitmap *bitmap, + unsigned long count, + struct page *page) +{ + int ret = 0; + struct inode *inode = file_inode(file); + struct buffer_head *bh; + sector_t block, blk_cur; + unsigned long blocksize = i_blocksize(inode); + + pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT); + + bh = alloc_page_buffers(page, blocksize, false); + if (!bh) { + ret = -ENOMEM; + goto out; + } + attach_page_private(page, bh); + blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); + while (bh) { + block = blk_cur; + + if (count == 0) + bh->b_blocknr = 0; + else { + ret = bmap(inode, &block); + if (ret || !block) { + ret = -EINVAL; + bh->b_blocknr = 0; + goto out; + } + + bh->b_blocknr = block; + bh->b_bdev = inode->i_sb->s_bdev; + if (count < blocksize) + count = 0; + else + count -= blocksize; + + bh->b_end_io = end_bitmap_write; + bh->b_private = bitmap; + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_READ, bh); + } + blk_cur++; + bh = bh->b_this_page; + } + page->index = index; + + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + ret = -EIO; +out: + if (ret) + pr_err("md: bitmap read error: (%dB @ %llu): %d\n", + (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT, + ret); + return ret; +} + +/* + * bitmap file superblock operations + */ + +/* + * md_bitmap_wait_writes() should be called before writing any bitmap + * blocks, to ensure previous writes, particularly from + * md_bitmap_daemon_work(), have completed. + */ +static void md_bitmap_wait_writes(struct bitmap *bitmap) +{ + if (bitmap->storage.file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + /* Note that we ignore the return value. The writes + * might have failed, but that would just mean that + * some bits which should be cleared haven't been, + * which is safe. The relevant bitmap blocks will + * probably get written again, but there is no great + * loss if they aren't. + */ + md_super_wait(bitmap->mddev); +} + + +/* update the event counter and sync the superblock to disk */ +void md_bitmap_update_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ + return; + if (bitmap->mddev->bitmap_info.external) + return; + if (!bitmap->storage.sb_page) /* no superblock */ + return; + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events = cpu_to_le64(bitmap->mddev->events); + if (bitmap->mddev->events < bitmap->events_cleared) + /* rocking back to read-only */ + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. + */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); + /* Just in case these have been changed via sysfs: */ + sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); + sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); + /* This might have been changed by a reshape */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); + sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> + bitmap_info.space); + kunmap_atomic(sb); + write_page(bitmap, bitmap->storage.sb_page, 1); +} +EXPORT_SYMBOL(md_bitmap_update_sb); + +/* print out the bitmap file superblock */ +void md_bitmap_print_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->storage.sb_page) + return; + sb = kmap_atomic(bitmap->storage.sb_page); + pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); + pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); + pr_debug(" version: %u\n", le32_to_cpu(sb->version)); + pr_debug(" uuid: %08x.%08x.%08x.%08x\n", + le32_to_cpu(*(__le32 *)(sb->uuid+0)), + le32_to_cpu(*(__le32 *)(sb->uuid+4)), + le32_to_cpu(*(__le32 *)(sb->uuid+8)), + le32_to_cpu(*(__le32 *)(sb->uuid+12))); + pr_debug(" events: %llu\n", + (unsigned long long) le64_to_cpu(sb->events)); + pr_debug("events cleared: %llu\n", + (unsigned long long) le64_to_cpu(sb->events_cleared)); + pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); + pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" sync size: %llu KB\n", + (unsigned long long)le64_to_cpu(sb->sync_size)/2); + pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); + kunmap_atomic(sb); +} + +/* + * bitmap_new_disk_sb + * @bitmap + * + * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb + * reads and verifies the on-disk bitmap superblock and populates bitmap_info. + * This function verifies 'bitmap_info' and populates the on-disk bitmap + * structure, which is to be written to disk. + * + * Returns: 0 on success, -Exxx on error + */ +static int md_bitmap_new_disk_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + + bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; + bitmap->storage.sb_page->index = 0; + + sb = kmap_atomic(bitmap->storage.sb_page); + + sb->magic = cpu_to_le32(BITMAP_MAGIC); + sb->version = cpu_to_le32(BITMAP_MAJOR_HI); + + chunksize = bitmap->mddev->bitmap_info.chunksize; + BUG_ON(!chunksize); + if (!is_power_of_2(chunksize)) { + kunmap_atomic(sb); + pr_warn("bitmap chunksize not a power of 2\n"); + return -EINVAL; + } + sb->chunksize = cpu_to_le32(chunksize); + + daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; + if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { + pr_debug("Choosing daemon_sleep default (5 sec)\n"); + daemon_sleep = 5 * HZ; + } + sb->daemon_sleep = cpu_to_le32(daemon_sleep); + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + + /* + * FIXME: write_behind for RAID1. If not specified, what + * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. + */ + write_behind = bitmap->mddev->bitmap_info.max_write_behind; + if (write_behind > COUNTER_MAX) + write_behind = COUNTER_MAX / 2; + sb->write_behind = cpu_to_le32(write_behind); + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + memcpy(sb->uuid, bitmap->mddev->uuid, 16); + + set_bit(BITMAP_STALE, &bitmap->flags); + sb->state = cpu_to_le32(bitmap->flags); + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + bitmap->mddev->bitmap_info.nodes = 0; + + kunmap_atomic(sb); + + return 0; +} + +/* read the superblock from the bitmap file and initialize some bitmap fields */ +static int md_bitmap_read_sb(struct bitmap *bitmap) +{ + char *reason = NULL; + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + unsigned long long events; + int nodes = 0; + unsigned long sectors_reserved = 0; + int err = -EINVAL; + struct page *sb_page; + loff_t offset = bitmap->mddev->bitmap_info.offset; + + if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { + chunksize = 128 * 1024 * 1024; + daemon_sleep = 5 * HZ; + write_behind = 0; + set_bit(BITMAP_STALE, &bitmap->flags); + err = 0; + goto out_no_sb; + } + /* page 0 is the superblock, read it... */ + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) + return -ENOMEM; + bitmap->storage.sb_page = sb_page; + +re_read: + /* If cluster_slot is set, the cluster is setup */ + if (bitmap->cluster_slot >= 0) { + sector_t bm_blocks = bitmap->mddev->resync_max_sectors; + + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, + (bitmap->mddev->bitmap_info.chunksize >> 9)); + /* bits to bytes */ + bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); + /* to 4k blocks */ + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); + offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, + bitmap->cluster_slot, offset); + } + + if (bitmap->storage.file) { + loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); + int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; + + err = read_page(bitmap->storage.file, 0, + bitmap, bytes, sb_page); + } else { + err = read_sb_page(bitmap->mddev, + offset, + sb_page, + 0, sizeof(bitmap_super_t)); + } + if (err) + return err; + + err = -EINVAL; + sb = kmap_atomic(sb_page); + + chunksize = le32_to_cpu(sb->chunksize); + daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; + write_behind = le32_to_cpu(sb->write_behind); + sectors_reserved = le32_to_cpu(sb->sectors_reserved); + + /* verify that the bitmap-specific fields are valid */ + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) + reason = "bad magic"; + else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) + reason = "unrecognized superblock version"; + else if (chunksize < 512) + reason = "bitmap chunksize too small"; + else if (!is_power_of_2(chunksize)) + reason = "bitmap chunksize not a power of 2"; + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) + reason = "daemon sleep period out of range"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; + if (reason) { + pr_warn("%s: invalid bitmap file superblock: %s\n", + bmname(bitmap), reason); + goto out; + } + + /* + * Setup nodes/clustername only if bitmap version is + * cluster-compatible + */ + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { + nodes = le32_to_cpu(sb->nodes); + strscpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + pr_warn("%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (!nodes && (events < bitmap->mddev->events)) { + pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + set_bit(BITMAP_STALE, &bitmap->flags); + } + } + + /* assign fields using values from superblock */ + bitmap->flags |= le32_to_cpu(sb->state); + if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) + set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); + bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + err = 0; + +out: + kunmap_atomic(sb); + if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { + /* Assigning chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; + err = md_setup_cluster(bitmap->mddev, nodes); + if (err) { + pr_warn("%s: Could not setup cluster service (%d)\n", + bmname(bitmap), err); + goto out_no_sb; + } + bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + goto re_read; + } + +out_no_sb: + if (err == 0) { + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; + } else { + md_bitmap_print_sb(bitmap); + if (bitmap->cluster_slot < 0) + md_cluster_stop(bitmap->mddev); + } + return err; +} + +/* + * general bitmap file operations + */ + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ +/* calculate the index of the page that contains this bit */ +static inline unsigned long file_page_index(struct bitmap_storage *store, + unsigned long chunk) +{ + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; +} + +/* calculate the (bit) offset of this bit within a page */ +static inline unsigned long file_page_offset(struct bitmap_storage *store, + unsigned long chunk) +{ + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); +} + +/* + * return a pointer to the page in the filemap that contains the given bit + * + */ +static inline struct page *filemap_get_page(struct bitmap_storage *store, + unsigned long chunk) +{ + if (file_page_index(store, chunk) >= store->file_pages) + return NULL; + return store->filemap[file_page_index(store, chunk)]; +} + +static int md_bitmap_storage_alloc(struct bitmap_storage *store, + unsigned long chunks, int with_super, + int slot_number) +{ + int pnum, offset = 0; + unsigned long num_pages; + unsigned long bytes; + + bytes = DIV_ROUND_UP(chunks, 8); + if (with_super) + bytes += sizeof(bitmap_super_t); + + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + offset = slot_number * num_pages; + + store->filemap = kmalloc_array(num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!store->filemap) + return -ENOMEM; + + if (with_super && !store->sb_page) { + store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (store->sb_page == NULL) + return -ENOMEM; + } + + pnum = 0; + if (store->sb_page) { + store->filemap[0] = store->sb_page; + pnum = 1; + store->sb_page->index = offset; + } + + for ( ; pnum < num_pages; pnum++) { + store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!store->filemap[pnum]) { + store->file_pages = pnum; + return -ENOMEM; + } + store->filemap[pnum]->index = pnum + offset; + } + store->file_pages = pnum; + + /* We need 4 bits per page, rounded up to a multiple + * of sizeof(unsigned long) */ + store->filemap_attr = kzalloc( + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + GFP_KERNEL); + if (!store->filemap_attr) + return -ENOMEM; + + store->bytes = bytes; + + return 0; +} + +static void md_bitmap_file_unmap(struct bitmap_storage *store) +{ + struct page **map, *sb_page; + int pages; + struct file *file; + + file = store->file; + map = store->filemap; + pages = store->file_pages; + sb_page = store->sb_page; + + while (pages--) + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ + free_buffers(map[pages]); + kfree(map); + kfree(store->filemap_attr); + + if (sb_page) + free_buffers(sb_page); + + if (file) { + struct inode *inode = file_inode(file); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + fput(file); + } +} + +/* + * bitmap_file_kick - if an error occurs while manipulating the bitmap file + * then it is no longer reliable, so we stop using it and we mark the file + * as failed in the superblock + */ +static void md_bitmap_file_kick(struct bitmap *bitmap) +{ + char *path, *ptr = NULL; + + if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { + md_bitmap_update_sb(bitmap); + + if (bitmap->storage.file) { + path = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (path) + ptr = file_path(bitmap->storage.file, + path, PAGE_SIZE); + + pr_warn("%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + + kfree(path); + } else + pr_warn("%s: disabling internal bitmap due to errors\n", + bmname(bitmap)); + } +} + +enum bitmap_page_attr { + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. + * i.e. counter is 1 or 2. */ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ +}; + +static inline void set_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline void clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline int test_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_and_clear_bit((pnum<<2) + attr, + bitmap->storage.filemap_attr); +} +/* + * bitmap_file_set_bit -- called before performing a write to the md device + * to set (and eventually sync) a particular bit in the bitmap file + * + * we set the bit immediately, then we record the page number so that + * when an unplug occurs, we can flush the dirty pages out to disk + */ +static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *kaddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + + /* set the bit */ + kaddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set_bit(bit, kaddr); + else + set_bit_le(bit, kaddr); + kunmap_atomic(kaddr); + pr_debug("set file bit %lu page %lu\n", bit, page->index); + /* record page number so it gets flushed to disk when unplug occurs */ + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); +} + +static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + clear_bit(bit, paddr); + else + clear_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } +} + +static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + int set = 0; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return -EINVAL; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set = test_bit(bit, paddr); + else + set = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + return set; +} + +/* this gets called when the md device is ready to unplug its underlying + * (slave) device queues -- before we let any writes go down, we need to + * sync the dirty pages of the bitmap file to disk */ +void md_bitmap_unplug(struct bitmap *bitmap) +{ + unsigned long i; + int dirty, need_write; + int writing = 0; + + if (!md_bitmap_enabled(bitmap)) + return; + + /* look at each page to see if there are any set bits that need to be + * flushed out to disk */ + for (i = 0; i < bitmap->storage.file_pages; i++) { + dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + need_write = test_and_clear_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + if (dirty || need_write) { + if (!writing) { + md_bitmap_wait_writes(bitmap); + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_unplug"); + } + clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); + write_page(bitmap, bitmap->storage.filemap[i], 0); + writing = 1; + } + } + if (writing) + md_bitmap_wait_writes(bitmap); + + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + md_bitmap_file_kick(bitmap); +} +EXPORT_SYMBOL(md_bitmap_unplug); + +static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); +/* * bitmap_init_from_disk -- called at bitmap_create time to initialize + * the in-memory bitmap from the on-disk bitmap -- also, sets up the + * memory mapping of the bitmap file + * Special cases: + * if there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as + * 1's in order to cause a full resync. + * + * We ignore all bits for sectors that end earlier than 'start'. + * This is used when reading an out-of-date bitmap... + */ +static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) +{ + unsigned long i, chunks, index, oldindex, bit, node_offset = 0; + struct page *page = NULL; + unsigned long bit_cnt = 0; + struct file *file; + unsigned long offset; + int outofdate; + int ret = -ENOSPC; + void *paddr; + struct bitmap_storage *store = &bitmap->storage; + + chunks = bitmap->counts.chunks; + file = store->file; + + if (!file && !bitmap->mddev->bitmap_info.offset) { + /* No permanent bitmap - fill with '1s'. */ + store->filemap = NULL; + store->file_pages = 0; + for (i = 0; i < chunks ; i++) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) + >= start); + md_bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + } + return 0; + } + + outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + if (outofdate) + pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); + + if (file && i_size_read(file->f_mapping->host) < store->bytes) { + pr_warn("%s: bitmap file too short %lu < %lu\n", + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + store->bytes); + goto err; + } + + oldindex = ~0L; + offset = 0; + if (!bitmap->mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); + + for (i = 0; i < chunks; i++) { + int b; + index = file_page_index(&bitmap->storage, i); + bit = file_page_offset(&bitmap->storage, i); + if (index != oldindex) { /* this is a new page, read it in */ + int count; + /* unmap the old page, we're done with it */ + if (index == store->file_pages-1) + count = store->bytes - index * PAGE_SIZE; + else + count = PAGE_SIZE; + page = store->filemap[index]; + if (file) + ret = read_page(file, index, bitmap, + count, page); + else + ret = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index + node_offset, count); + + if (ret) + goto err; + + oldindex = index; + + if (outofdate) { + /* + * if bitmap is out of date, dirty the + * whole page and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, + PAGE_SIZE - offset); + kunmap_atomic(paddr); + write_page(bitmap, page, 1); + + ret = -EIO; + if (test_bit(BITMAP_WRITE_ERROR, + &bitmap->flags)) + goto err; + } + } + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + b = test_bit(bit, paddr); + else + b = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (b) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift + >= start); + md_bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + bit_cnt++; + } + offset = 0; + } + + pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", + bmname(bitmap), store->file_pages, + bit_cnt, chunks); + + return 0; + + err: + pr_warn("%s: bitmap initialisation failed: %d\n", + bmname(bitmap), ret); + return ret; +} + +void md_bitmap_write_all(struct bitmap *bitmap) +{ + /* We don't actually write all bitmap blocks here, + * just flag them as needing to be written + */ + int i; + + if (!bitmap || !bitmap->storage.filemap) + return; + if (bitmap->storage.file) + /* Only one copy, so nothing needed */ + return; + + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; +} + +static void md_bitmap_count_page(struct bitmap_counts *bitmap, + sector_t offset, int inc) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + bitmap->bp[page].count += inc; + md_bitmap_checkfree(bitmap, page); +} + +static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + struct bitmap_page *bp = &bitmap->bp[page]; + + if (!bp->pending) + bp->pending = 1; +} + +static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create); + +/* + * bitmap daemon -- periodically wakes up to clean bits and flush pages + * out to disk + */ + +void md_bitmap_daemon_work(struct mddev *mddev) +{ + struct bitmap *bitmap; + unsigned long j; + unsigned long nextpage; + sector_t blocks; + struct bitmap_counts *counts; + + /* Use a mutex to guard daemon_work against + * bitmap_destroy. + */ + mutex_lock(&mddev->bitmap_info.mutex); + bitmap = mddev->bitmap; + if (bitmap == NULL) { + mutex_unlock(&mddev->bitmap_info.mutex); + return; + } + if (time_before(jiffies, bitmap->daemon_lastrun + + mddev->bitmap_info.daemon_sleep)) + goto done; + + bitmap->daemon_lastrun = jiffies; + if (bitmap->allclean) { + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + goto done; + } + bitmap->allclean = 1; + + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_daemon_work"); + + /* Any file-page which is PENDING now needs to be written. + * So set NEEDWRITE now, then after we make any last-minute changes + * we will write it. + */ + for (j = 0; j < bitmap->storage.file_pages; j++) + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE); + + if (bitmap->need_sync && + mddev->bitmap_info.external == 0) { + /* Arrange for superblock update as well as + * other changes */ + bitmap_super_t *sb; + bitmap->need_sync = 0; + if (bitmap->storage.filemap) { + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb); + set_page_attr(bitmap, 0, + BITMAP_PAGE_NEEDWRITE); + } + } + /* Now look at the bitmap counters and if any are '2' or '1', + * decrement and handle accordingly. + */ + counts = &bitmap->counts; + spin_lock_irq(&counts->lock); + nextpage = 0; + for (j = 0; j < counts->chunks; j++) { + bitmap_counter_t *bmc; + sector_t block = (sector_t)j << counts->chunkshift; + + if (j == nextpage) { + nextpage += PAGE_COUNTER_RATIO; + if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { + j |= PAGE_COUNTER_MASK; + continue; + } + counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; + } + + bmc = md_bitmap_get_counter(counts, block, &blocks, 0); + if (!bmc) { + j |= PAGE_COUNTER_MASK; + continue; + } + if (*bmc == 1 && !bitmap->need_sync) { + /* We can clear the bit */ + *bmc = 0; + md_bitmap_count_page(counts, block, -1); + md_bitmap_file_clear_bit(bitmap, block); + } else if (*bmc && *bmc <= 2) { + *bmc = 1; + md_bitmap_set_pending(counts, block); + bitmap->allclean = 0; + } + } + spin_unlock_irq(&counts->lock); + + md_bitmap_wait_writes(bitmap); + /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. + * DIRTY pages need to be written by bitmap_unplug so it can wait + * for them. + * If we find any DIRTY page we stop there and let bitmap_unplug + * handle all the rest. This is important in the case where + * the first blocking holds the superblock and it has been updated. + * We mustn't write any other blocks before the superblock. + */ + for (j = 0; + j < bitmap->storage.file_pages + && !test_bit(BITMAP_STALE, &bitmap->flags); + j++) { + if (test_page_attr(bitmap, j, + BITMAP_PAGE_DIRTY)) + /* bitmap_unplug will handle the rest */ + break; + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE)) { + write_page(bitmap, bitmap->storage.filemap[j], 0); + } + } + + done: + if (bitmap->allclean == 0) + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + /* If 'create', we might release the lock and reclaim it. + * The lock must have been taken with interrupts enabled. + * If !create, we don't release the lock. + */ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; + sector_t csize; + int err; + + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } + err = md_bitmap_checkpage(bitmap, page, create, 0); + + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (bitmap->chunkshift + + PAGE_COUNTER_SHIFT); + else + csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) + return NULL; + + /* now locked ... */ + + if (bitmap->bp[page].hijacked) { /* hijacked pointer */ + /* should we use the first or second counter field + * of the hijacked pointer? */ + int hi = (pageoff > PAGE_COUNTER_MASK); + return &((bitmap_counter_t *) + &bitmap->bp[page].map)[hi]; + } else /* page is allocated */ + return (bitmap_counter_t *) + &(bitmap->bp[page].map[pageoff]); +} + +int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +{ + if (!bitmap) + return 0; + + if (behind) { + int bw; + atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + bitmap_counter_t *bmc; + + spin_lock_irq(&bitmap->counts.lock); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->counts.lock); + return 0; + } + + if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { + DEFINE_WAIT(__wait); + /* note that it is safe to do the prepare_to_wait + * after the test as long as we do it before dropping + * the spinlock. + */ + prepare_to_wait(&bitmap->overflow_wait, &__wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&bitmap->counts.lock); + schedule(); + finish_wait(&bitmap->overflow_wait, &__wait); + continue; + } + + switch (*bmc) { + case 0: + md_bitmap_file_set_bit(bitmap, offset); + md_bitmap_count_page(&bitmap->counts, offset, 1); + fallthrough; + case 1: + *bmc = 2; + } + + (*bmc)++; + + spin_unlock_irq(&bitmap->counts.lock); + + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } + return 0; +} +EXPORT_SYMBOL(md_bitmap_startwrite); + +void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind) +{ + if (!bitmap) + return; + if (behind) { + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + unsigned long flags; + bitmap_counter_t *bmc; + + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); + if (!bmc) { + spin_unlock_irqrestore(&bitmap->counts.lock, flags); + return; + } + + if (success && !bitmap->mddev->degraded && + bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); + } + + if (!success && !NEEDED(*bmc)) + *bmc |= NEEDED_MASK; + + if (COUNTER(*bmc) == COUNTER_MAX) + wake_up(&bitmap->overflow_wait); + + (*bmc)--; + if (*bmc <= 2) { + md_bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + spin_unlock_irqrestore(&bitmap->counts.lock, flags); + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } +} +EXPORT_SYMBOL(md_bitmap_endwrite); + +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + bitmap_counter_t *bmc; + int rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ + *blocks = 1024; + return 1; /* always resync if no bitmap */ + } + spin_lock_irq(&bitmap->counts.lock); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + rv = 0; + if (bmc) { + /* locked */ + if (RESYNC(*bmc)) + rv = 1; + else if (NEEDED(*bmc)) { + rv = 1; + if (!degraded) { /* don't set/clear bits if degraded */ + *bmc |= RESYNC_MASK; + *bmc &= ~NEEDED_MASK; + } + } + } + spin_unlock_irq(&bitmap->counts.lock); + return rv; +} + +int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + sector_t blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} +EXPORT_SYMBOL(md_bitmap_start_sync); + +void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +{ + bitmap_counter_t *bmc; + unsigned long flags; + + if (bitmap == NULL) { + *blocks = 1024; + return; + } + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + if (bmc == NULL) + goto unlock; + /* locked */ + if (RESYNC(*bmc)) { + *bmc &= ~RESYNC_MASK; + + if (!NEEDED(*bmc) && aborted) + *bmc |= NEEDED_MASK; + else { + if (*bmc <= 2) { + md_bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + } + } + unlock: + spin_unlock_irqrestore(&bitmap->counts.lock, flags); +} +EXPORT_SYMBOL(md_bitmap_end_sync); + +void md_bitmap_close_sync(struct bitmap *bitmap) +{ + /* Sync has finished, and any bitmap chunks that weren't synced + * properly have been aborted. It remains to us to clear the + * RESYNC bit wherever it is still on + */ + sector_t sector = 0; + sector_t blocks; + if (!bitmap) + return; + while (sector < bitmap->mddev->resync_max_sectors) { + md_bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } +} +EXPORT_SYMBOL(md_bitmap_close_sync); + +void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +{ + sector_t s = 0; + sector_t blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap->last_end_sync = jiffies; + return; + } + if (!force && time_before(jiffies, (bitmap->last_end_sync + + bitmap->mddev->bitmap_info.daemon_sleep))) + return; + wait_event(bitmap->mddev->recovery_wait, + atomic_read(&bitmap->mddev->recovery_active) == 0); + + bitmap->mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); + sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); + s = 0; + while (s < sector && s < bitmap->mddev->resync_max_sectors) { + md_bitmap_end_sync(bitmap, s, &blocks, 0); + s += blocks; + } + bitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); +} +EXPORT_SYMBOL(md_bitmap_cond_end_sync); + +void md_bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) +{ + struct bitmap *bitmap = mddev->bitmap; + sector_t sector, blocks = 0; + + for (sector = old_lo; sector < new_lo; ) { + md_bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); + + for (sector = old_hi; sector < new_hi; ) { + md_bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); +} +EXPORT_SYMBOL(md_bitmap_sync_with_cluster); + +static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) +{ + /* For each chunk covered by any of these sectors, set the + * counter to 2 and possibly set resync_needed. They should all + * be 0 at this point + */ + + sector_t secs; + bitmap_counter_t *bmc; + spin_lock_irq(&bitmap->counts.lock); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->counts.lock); + return; + } + if (!*bmc) { + *bmc = 2; + md_bitmap_count_page(&bitmap->counts, offset, 1); + md_bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + if (needed) + *bmc |= NEEDED_MASK; + spin_unlock_irq(&bitmap->counts.lock); +} + +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; + md_bitmap_set_memory_bits(bitmap, sec, 1); + md_bitmap_file_set_bit(bitmap, sec); + if (sec < bitmap->mddev->recovery_cp) + /* We are asserting that the array is dirty, + * so move the recovery_cp address back so + * that it is obvious that it is dirty + */ + bitmap->mddev->recovery_cp = sec; + } +} + +/* + * flush out any pending updates + */ +void md_bitmap_flush(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + long sleep; + + if (!bitmap) /* there was no bitmap */ + return; + + /* run the daemon_work three time to ensure everything is flushed + * that can be + */ + sleep = mddev->bitmap_info.daemon_sleep * 2; + bitmap->daemon_lastrun -= sleep; + md_bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + md_bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + md_bitmap_daemon_work(mddev); + if (mddev->bitmap_info.external) + md_super_wait(mddev); + md_bitmap_update_sb(bitmap); +} + +/* + * free memory that was allocated + */ +void md_bitmap_free(struct bitmap *bitmap) +{ + unsigned long k, pages; + struct bitmap_page *bp; + + if (!bitmap) /* there was no bitmap */ + return; + + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && + bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + md_cluster_stop(bitmap->mddev); + + /* Shouldn't be needed - but just in case.... */ + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); + + /* release the bitmap file */ + md_bitmap_file_unmap(&bitmap->storage); + + bp = bitmap->counts.bp; + pages = bitmap->counts.pages; + + /* free all allocated memory */ + + if (bp) /* deallocate the page memory */ + for (k = 0; k < pages; k++) + if (bp[k].map && !bp[k].hijacked) + kfree(bp[k].map); + kfree(bp); + kfree(bitmap); +} +EXPORT_SYMBOL(md_bitmap_free); + +void md_bitmap_wait_behind_writes(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + pr_debug("md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } +} + +void md_bitmap_destroy(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) /* there was no bitmap */ + return; + + md_bitmap_wait_behind_writes(mddev); + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); + + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); + mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); + if (mddev->thread) + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + + md_bitmap_free(bitmap); +} + +/* + * initialize the bitmap structure + * if this returns an error, bitmap_destroy must be called to do clean up + * once mddev->bitmap is set + */ +struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) +{ + struct bitmap *bitmap; + sector_t blocks = mddev->resync_max_sectors; + struct file *file = mddev->bitmap_info.file; + int err; + struct kernfs_node *bm = NULL; + + BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); + + BUG_ON(file && mddev->bitmap_info.offset); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&bitmap->counts.lock); + atomic_set(&bitmap->pending_writes, 0); + init_waitqueue_head(&bitmap->write_wait); + init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); + + bitmap->mddev = mddev; + bitmap->cluster_slot = slot; + + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + + bitmap->storage.file = file; + if (file) { + get_file(file); + /* As future accesses to this file will use bmap, + * and bypass the page cache, we must sync the file + * first. + */ + vfs_fsync(file, 1); + } + /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ + if (!mddev->bitmap_info.external) { + /* + * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is + * instructing us to create a new on-disk bitmap instance. + */ + if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) + err = md_bitmap_new_disk_sb(bitmap); + else + err = md_bitmap_read_sb(bitmap); + } else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. */ + err = -EINVAL; + } + if (err) + goto error; + + bitmap->daemon_lastrun = jiffies; + err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + if (err) + goto error; + + pr_debug("created bitmap (%lu pages) for device %s\n", + bitmap->counts.pages, bmname(bitmap)); + + err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + if (err) + goto error; + + return bitmap; + error: + md_bitmap_free(bitmap); + return ERR_PTR(err); +} + +int md_bitmap_load(struct mddev *mddev) +{ + int err = 0; + sector_t start = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; + struct md_rdev *rdev; + + if (!bitmap) + goto out; + + rdev_for_each(rdev, mddev) + mddev_create_serial_pool(mddev, rdev, true); + + if (mddev_is_clustered(mddev)) + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. + */ + while (sector < mddev->resync_max_sectors) { + sector_t blocks; + md_bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + md_bitmap_close_sync(bitmap); + + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + mutex_lock(&mddev->bitmap_info.mutex); + err = md_bitmap_init_from_disk(bitmap, start); + mutex_unlock(&mddev->bitmap_info.mutex); + + if (err) + goto out; + clear_bit(BITMAP_STALE, &bitmap->flags); + + /* Kick recovery in case any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); + + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); + + md_bitmap_update_sb(bitmap); + + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + err = -EIO; +out: + return err; +} +EXPORT_SYMBOL_GPL(md_bitmap_load); + +/* caller need to free returned bitmap with md_bitmap_free() */ +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +{ + int rv = 0; + struct bitmap *bitmap; + + bitmap = md_bitmap_create(mddev, slot); + if (IS_ERR(bitmap)) { + rv = PTR_ERR(bitmap); + return ERR_PTR(rv); + } + + rv = md_bitmap_init_from_disk(bitmap, 0); + if (rv) { + md_bitmap_free(bitmap); + return ERR_PTR(rv); + } + + return bitmap; +} +EXPORT_SYMBOL(get_bitmap_from_slot); + +/* Loads the bitmap associated with slot and copies the resync information + * to our bitmap + */ +int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *low, sector_t *high, bool clear_bits) +{ + int rv = 0, i, j; + sector_t block, lo = 0, hi = 0; + struct bitmap_counts *counts; + struct bitmap *bitmap; + + bitmap = get_bitmap_from_slot(mddev, slot); + if (IS_ERR(bitmap)) { + pr_err("%s can't get bitmap from slot %d\n", __func__, slot); + return -1; + } + + counts = &bitmap->counts; + for (j = 0; j < counts->chunks; j++) { + block = (sector_t)j << counts->chunkshift; + if (md_bitmap_file_test_bit(bitmap, block)) { + if (!lo) + lo = block; + hi = block; + md_bitmap_file_clear_bit(bitmap, block); + md_bitmap_set_memory_bits(mddev->bitmap, block, 1); + md_bitmap_file_set_bit(mddev->bitmap, block); + } + } + + if (clear_bits) { + md_bitmap_update_sb(bitmap); + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ + for (i = 0; i < bitmap->storage.file_pages; i++) + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); + md_bitmap_unplug(bitmap); + } + md_bitmap_unplug(mddev->bitmap); + *low = lo; + *high = hi; + md_bitmap_free(bitmap); + + return rv; +} +EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); + + +void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + struct bitmap_counts *counts; + + if (!bitmap) + return; + + counts = &bitmap->counts; + + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + counts->pages - counts->missing_pages, + counts->pages, + (counts->pages - counts->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->storage.file) { + seq_printf(seq, ", file: "); + seq_file_path(seq, bitmap->storage.file, " \t\n"); + } + + seq_printf(seq, "\n"); +} + +int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init) +{ + /* If chunk_size is 0, choose an appropriate chunk size. + * Then possibly allocate new storage space. + * Then quiesce, copy bits, replace bitmap, and re-start + * + * This function is called both to set up the initial bitmap + * and to resize the bitmap while the array is active. + * If this happens as a result of the array being resized, + * chunksize will be zero, and we need to choose a suitable + * chunksize, otherwise we use what we are given. + */ + struct bitmap_storage store; + struct bitmap_counts old_counts; + unsigned long chunks; + sector_t block; + sector_t old_blocks, new_blocks; + int chunkshift; + int ret = 0; + long pages; + struct bitmap_page *new_bp; + + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + + if (chunksize == 0) { + /* If there is enough space, leave the chunk size unchanged, + * else increase by factor of two until there is enough space. + */ + long bytes; + long space = bitmap->mddev->bitmap_info.space; + + if (space == 0) { + /* We don't know how much space there is, so limit + * to current size - in sectors. + */ + bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + space = DIV_ROUND_UP(bytes, 512); + bitmap->mddev->bitmap_info.space = space; + } + chunkshift = bitmap->counts.chunkshift; + chunkshift--; + do { + /* 'chunkshift' is shift from block size to chunk size */ + chunkshift++; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + bytes = DIV_ROUND_UP(chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < + (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); + } else + chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; + + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + memset(&store, 0, sizeof(store)); + if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) + ret = md_bitmap_storage_alloc(&store, chunks, + !bitmap->mddev->bitmap_info.external, + mddev_is_clustered(bitmap->mddev) + ? bitmap->cluster_slot : 0); + if (ret) { + md_bitmap_file_unmap(&store); + goto err; + } + + pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); + + new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL); + ret = -ENOMEM; + if (!new_bp) { + md_bitmap_file_unmap(&store); + goto err; + } + + if (!init) + bitmap->mddev->pers->quiesce(bitmap->mddev, 1); + + store.file = bitmap->storage.file; + bitmap->storage.file = NULL; + + if (store.sb_page && bitmap->storage.sb_page) + memcpy(page_address(store.sb_page), + page_address(bitmap->storage.sb_page), + sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); + md_bitmap_file_unmap(&bitmap->storage); + bitmap->storage = store; + + old_counts = bitmap->counts; + bitmap->counts.bp = new_bp; + bitmap->counts.pages = pages; + bitmap->counts.missing_pages = pages; + bitmap->counts.chunkshift = chunkshift; + bitmap->counts.chunks = chunks; + bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + + BITMAP_BLOCK_SHIFT); + + blocks = min(old_counts.chunks << old_counts.chunkshift, + chunks << chunkshift); + + /* For cluster raid, need to pre-allocate bitmap */ + if (mddev_is_clustered(bitmap->mddev)) { + unsigned long page; + for (page = 0; page < pages; page++) { + ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1); + if (ret) { + unsigned long k; + + /* deallocate the page memory */ + for (k = 0; k < page; k++) { + kfree(new_bp[k].map); + } + kfree(new_bp); + + /* restore some fields from old_counts */ + bitmap->counts.bp = old_counts.bp; + bitmap->counts.pages = old_counts.pages; + bitmap->counts.missing_pages = old_counts.pages; + bitmap->counts.chunkshift = old_counts.chunkshift; + bitmap->counts.chunks = old_counts.chunks; + bitmap->mddev->bitmap_info.chunksize = + 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); + blocks = old_counts.chunks << old_counts.chunkshift; + pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); + break; + } else + bitmap->counts.bp[page].count += 1; + } + } + + for (block = 0; block < blocks; ) { + bitmap_counter_t *bmc_old, *bmc_new; + int set; + + bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0); + set = bmc_old && NEEDED(*bmc_old); + + if (set) { + bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); + if (bmc_new) { + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + + start <<= chunkshift; + while (start < end) { + md_bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); + } + *bmc_new |= NEEDED_MASK; + } + if (new_blocks < old_blocks) + old_blocks = new_blocks; + } + block += old_blocks; + } + + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + + if (!init) { + int i; + while (block < (chunks << chunkshift)) { + bitmap_counter_t *bmc; + bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); + if (bmc) { + /* new space. It needs to be resynced, so + * we set NEEDED_MASK. + */ + if (*bmc == 0) { + *bmc = NEEDED_MASK | 2; + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); + } + } + block += new_blocks; + } + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + } + spin_unlock_irq(&bitmap->counts.lock); + + if (!init) { + md_bitmap_unplug(bitmap); + bitmap->mddev->pers->quiesce(bitmap->mddev, 0); + } + ret = 0; +err: + return ret; +} +EXPORT_SYMBOL_GPL(md_bitmap_resize); + +static ssize_t +location_show(struct mddev *mddev, char *page) +{ + ssize_t len; + if (mddev->bitmap_info.file) + len = sprintf(page, "file"); + else if (mddev->bitmap_info.offset) + len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); + else + len = sprintf(page, "none"); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +location_store(struct mddev *mddev, const char *buf, size_t len) +{ + int rv; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers) { + if (!mddev->pers->quiesce) { + rv = -EBUSY; + goto out; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto out; + } + } + + if (mddev->bitmap || mddev->bitmap_info.file || + mddev->bitmap_info.offset) { + /* bitmap already configured. Only option is to clear it */ + if (strncmp(buf, "none", 4) != 0) { + rv = -EBUSY; + goto out; + } + if (mddev->pers) { + mddev_suspend(mddev); + md_bitmap_destroy(mddev); + mddev_resume(mddev); + } + mddev->bitmap_info.offset = 0; + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + mddev->bitmap_info.file = NULL; + fput(f); + } + } else { + /* No bitmap, OK to set a location */ + long long offset; + if (strncmp(buf, "none", 4) == 0) + /* nothing to be done */; + else if (strncmp(buf, "file:", 5) == 0) { + /* Not supported yet */ + rv = -EINVAL; + goto out; + } else { + if (buf[0] == '+') + rv = kstrtoll(buf+1, 10, &offset); + else + rv = kstrtoll(buf, 10, &offset); + if (rv) + goto out; + if (offset == 0) { + rv = -EINVAL; + goto out; + } + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && + offset != mddev->bitmap_info.default_offset) { + rv = -EINVAL; + goto out; + } + mddev->bitmap_info.offset = offset; + if (mddev->pers) { + struct bitmap *bitmap; + bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); + if (IS_ERR(bitmap)) + rv = PTR_ERR(bitmap); + else { + mddev->bitmap = bitmap; + rv = md_bitmap_load(mddev); + if (rv) + mddev->bitmap_info.offset = 0; + } + if (rv) { + md_bitmap_destroy(mddev); + mddev_resume(mddev); + goto out; + } + mddev_resume(mddev); + } + } + } + if (!mddev->external) { + /* Ensure new bitmap info is stored in + * metadata promptly. + */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + } + rv = 0; +out: + mddev_unlock(mddev); + if (rv) + return rv; + return len; +} + +static struct md_sysfs_entry bitmap_location = +__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); + +/* 'bitmap/space' is the space available at 'location' for the + * bitmap. This allows the kernel to know when it is safe to + * resize the bitmap to match a resized array. + */ +static ssize_t +space_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.space); +} + +static ssize_t +space_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long sectors; + int rv; + + rv = kstrtoul(buf, 10, §ors); + if (rv) + return rv; + + if (sectors == 0) + return -EINVAL; + + if (mddev->bitmap && + sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + return -EFBIG; /* Bitmap is too big for this small space */ + + /* could make sure it isn't too big, but that isn't really + * needed - user-space should be careful. + */ + mddev->bitmap_info.space = sectors; + return len; +} + +static struct md_sysfs_entry bitmap_space = +__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); + +static ssize_t +timeout_show(struct mddev *mddev, char *page) +{ + ssize_t len; + unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; + unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; + + len = sprintf(page, "%lu", secs); + if (jifs) + len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +timeout_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* timeout can be set at any time */ + unsigned long timeout; + int rv = strict_strtoul_scaled(buf, &timeout, 4); + if (rv) + return rv; + + /* just to make sure we don't overflow... */ + if (timeout >= LONG_MAX / HZ) + return -EINVAL; + + timeout = timeout * HZ / 10000; + + if (timeout >= MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT-1; + if (timeout < 1) + timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; + if (mddev->thread) { + /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then + * the bitmap is all clean and we don't need to + * adjust the timeout right now + */ + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + mddev->thread->timeout = timeout; + md_wakeup_thread(mddev->thread); + } + } + return len; +} + +static struct md_sysfs_entry bitmap_timeout = +__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); + +static ssize_t +backlog_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); +} + +static ssize_t +backlog_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long backlog; + unsigned long old_mwb = mddev->bitmap_info.max_write_behind; + struct md_rdev *rdev; + bool has_write_mostly = false; + int rv = kstrtoul(buf, 10, &backlog); + if (rv) + return rv; + if (backlog > COUNTER_MAX) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + + /* + * Without write mostly device, it doesn't make sense to set + * backlog for max_write_behind. + */ + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags)) { + has_write_mostly = true; + break; + } + } + if (!has_write_mostly) { + pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", + mdname(mddev)); + mddev_unlock(mddev); + return -EINVAL; + } + + mddev->bitmap_info.max_write_behind = backlog; + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is needed since backlog is not zero */ + rdev_for_each(rdev, mddev) + mddev_create_serial_pool(mddev, rdev, false); + } + if (old_mwb != backlog) + md_bitmap_update_sb(mddev->bitmap); + + mddev_unlock(mddev); + return len; +} + +static struct md_sysfs_entry bitmap_backlog = +__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); + +static ssize_t +chunksize_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); +} + +static ssize_t +chunksize_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* Can only be changed when no bitmap is active */ + int rv; + unsigned long csize; + if (mddev->bitmap) + return -EBUSY; + rv = kstrtoul(buf, 10, &csize); + if (rv) + return rv; + if (csize < 512 || + !is_power_of_2(csize)) + return -EINVAL; + if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * + sizeof(((bitmap_super_t *)0)->chunksize)))) + return -EOVERFLOW; + mddev->bitmap_info.chunksize = csize; + return len; +} + +static struct md_sysfs_entry bitmap_chunksize = +__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + if (mddev_is_clustered(mddev)) + return sprintf(page, "clustered\n"); + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? "external" : "internal")); +} + +static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if ((strncmp(buf, "internal", 8) == 0) || + (strncmp(buf, "clustered", 9) == 0)) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(struct mddev *mddev, char *page) +{ + int len; + spin_lock(&mddev->lock); + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? + "false" : "true")); + else + len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); + return len; +} + +static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + +static ssize_t +behind_writes_used_show(struct mddev *mddev, char *page) +{ + ssize_t ret; + spin_lock(&mddev->lock); + if (mddev->bitmap == NULL) + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; +} + +static ssize_t +behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + +static struct attribute *md_bitmap_attrs[] = { + &bitmap_location.attr, + &bitmap_space.attr, + &bitmap_timeout.attr, + &bitmap_backlog.attr, + &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, + &max_backlog_used.attr, + NULL +}; +const struct attribute_group md_bitmap_group = { + .name = "bitmap", + .attrs = md_bitmap_attrs, +}; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h new file mode 100644 index 000000000..3a4750952 --- /dev/null +++ b/drivers/md/md-bitmap.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_HOSTENDIAN =15, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + __le32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + __le32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + unsigned long pages; /* total number of pages + * in the bitmap */ + unsigned long missing_pages; /* number of pages + * not yet allocated */ + unsigned long chunkshift; /* chunksize = 2^chunkshift + * (for bitops) */ + unsigned long chunks; /* Total number of data + * chunks for the array */ + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap + * file superblock */ + struct page **filemap; /* list of cache pages for + * the file */ + unsigned long *filemap_attr; /* attributes associated + * w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file*/ + unsigned long bytes; /* total bytes in the bitmap */ + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; + int cluster_slot; /* Slot offset for clustered env */ +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); +int md_bitmap_load(struct mddev *mddev); +void md_bitmap_flush(struct mddev *mddev); +void md_bitmap_destroy(struct mddev *mddev); + +void md_bitmap_print_sb(struct bitmap *bitmap); +void md_bitmap_update_sb(struct bitmap *bitmap); +void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); + +int md_bitmap_setallbits(struct bitmap *bitmap); +void md_bitmap_write_all(struct bitmap *bitmap); + +void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void md_bitmap_close_sync(struct bitmap *bitmap); +void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); +void md_bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); + +void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_daemon_work(struct mddev *mddev); + +int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init); +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); +int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *lo, sector_t *hi, bool clear_bits); +void md_bitmap_free(struct bitmap *bitmap); +void md_bitmap_wait_behind_writes(struct mddev *mddev); + +static inline bool md_bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap && bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + +#endif + +#endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c new file mode 100644 index 000000000..10e0c5381 --- /dev/null +++ b/drivers/md/md-cluster.c @@ -0,0 +1,1602 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2015, SUSE + */ + + +#include +#include +#include +#include +#include +#include "md.h" +#include "md-bitmap.h" +#include "md-cluster.h" + +#define LVB_SIZE 64 +#define NEW_DEV_TIMEOUT 5000 + +struct dlm_lock_resource { + dlm_lockspace_t *ls; + struct dlm_lksb lksb; + char *name; /* lock name. */ + uint32_t flags; /* flags to pass to dlm_lock() */ + wait_queue_head_t sync_locking; /* wait queue for synchronized locking */ + bool sync_locking_done; + void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ + struct mddev *mddev; /* pointing back to mddev. */ + int mode; +}; + +struct resync_info { + __le64 lo; + __le64 hi; +}; + +/* md_cluster_info flags */ +#define MD_CLUSTER_WAITING_FOR_NEWDISK 1 +#define MD_CLUSTER_SUSPEND_READ_BALANCING 2 +#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 + +/* Lock the send communication. This is done through + * bit manipulation as opposed to a mutex in order to + * accommodate lock and hold. See next comment. + */ +#define MD_CLUSTER_SEND_LOCK 4 +/* If cluster operations (such as adding a disk) must lock the + * communication channel, so as to perform extra operations + * (update metadata) and no other operation is allowed on the + * MD. Token needs to be locked and held until the operation + * completes witha md_update_sb(), which would eventually release + * the lock. + */ +#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 +/* We should receive message after node joined cluster and + * set up all the related infos such as bitmap and personality */ +#define MD_CLUSTER_ALREADY_IN_CLUSTER 6 +#define MD_CLUSTER_PENDING_RECV_EVENT 7 +#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 + +struct md_cluster_info { + struct mddev *mddev; /* the md device which md_cluster_info belongs to */ + /* dlm lock space and resources for clustered raid. */ + dlm_lockspace_t *lockspace; + int slot_number; + struct completion completion; + struct mutex recv_mutex; + struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource **other_bitmap_lockres; + struct dlm_lock_resource *resync_lockres; + struct list_head suspend_list; + + spinlock_t suspend_lock; + /* record the region which write should be suspended */ + sector_t suspend_lo; + sector_t suspend_hi; + int suspend_from; /* the slot which broadcast suspend_lo/hi */ + + struct md_thread *recovery_thread; + unsigned long recovery_map; + /* communication loc resources */ + struct dlm_lock_resource *ack_lockres; + struct dlm_lock_resource *message_lockres; + struct dlm_lock_resource *token_lockres; + struct dlm_lock_resource *no_new_dev_lockres; + struct md_thread *recv_thread; + struct completion newdisk_completion; + wait_queue_head_t wait; + unsigned long state; + /* record the region in RESYNCING message */ + sector_t sync_low; + sector_t sync_hi; +}; + +enum msg_type { + METADATA_UPDATED = 0, + RESYNCING, + NEWDISK, + REMOVE, + RE_ADD, + BITMAP_NEEDS_SYNC, + CHANGE_CAPACITY, + BITMAP_RESIZE, +}; + +struct cluster_msg { + __le32 type; + __le32 slot; + /* TODO: Unionize this for smaller footprint */ + __le64 low; + __le64 high; + char uuid[16]; + __le32 raid_slot; +}; + +static void sync_ast(void *arg) +{ + struct dlm_lock_resource *res; + + res = arg; + res->sync_locking_done = true; + wake_up(&res->sync_locking); +} + +static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) +{ + int ret = 0; + + ret = dlm_lock(res->ls, mode, &res->lksb, + res->flags, res->name, strlen(res->name), + 0, sync_ast, res, res->bast); + if (ret) + return ret; + wait_event(res->sync_locking, res->sync_locking_done); + res->sync_locking_done = false; + if (res->lksb.sb_status == 0) + res->mode = mode; + return res->lksb.sb_status; +} + +static int dlm_unlock_sync(struct dlm_lock_resource *res) +{ + return dlm_lock_sync(res, DLM_LOCK_NL); +} + +/* + * An variation of dlm_lock_sync, which make lock request could + * be interrupted + */ +static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode, + struct mddev *mddev) +{ + int ret = 0; + + ret = dlm_lock(res->ls, mode, &res->lksb, + res->flags, res->name, strlen(res->name), + 0, sync_ast, res, res->bast); + if (ret) + return ret; + + wait_event(res->sync_locking, res->sync_locking_done + || kthread_should_stop() + || test_bit(MD_CLOSING, &mddev->flags)); + if (!res->sync_locking_done) { + /* + * the convert queue contains the lock request when request is + * interrupted, and sync_ast could still be run, so need to + * cancel the request and reset completion + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL, + &res->lksb, res); + res->sync_locking_done = false; + if (unlikely(ret != 0)) + pr_info("failed to cancel previous lock request " + "%s return %d\n", res->name, ret); + return -EPERM; + } else + res->sync_locking_done = false; + if (res->lksb.sb_status == 0) + res->mode = mode; + return res->lksb.sb_status; +} + +static struct dlm_lock_resource *lockres_init(struct mddev *mddev, + char *name, void (*bastfn)(void *arg, int mode), int with_lvb) +{ + struct dlm_lock_resource *res = NULL; + int ret, namelen; + struct md_cluster_info *cinfo = mddev->cluster_info; + + res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + if (!res) + return NULL; + init_waitqueue_head(&res->sync_locking); + res->sync_locking_done = false; + res->ls = cinfo->lockspace; + res->mddev = mddev; + res->mode = DLM_LOCK_IV; + namelen = strlen(name); + res->name = kzalloc(namelen + 1, GFP_KERNEL); + if (!res->name) { + pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); + goto out_err; + } + strscpy(res->name, name, namelen + 1); + if (with_lvb) { + res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); + if (!res->lksb.sb_lvbptr) { + pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); + goto out_err; + } + res->flags = DLM_LKF_VALBLK; + } + + if (bastfn) + res->bast = bastfn; + + res->flags |= DLM_LKF_EXPEDITE; + + ret = dlm_lock_sync(res, DLM_LOCK_NL); + if (ret) { + pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); + goto out_err; + } + res->flags &= ~DLM_LKF_EXPEDITE; + res->flags |= DLM_LKF_CONVERT; + + return res; +out_err: + kfree(res->lksb.sb_lvbptr); + kfree(res->name); + kfree(res); + return NULL; +} + +static void lockres_free(struct dlm_lock_resource *res) +{ + int ret = 0; + + if (!res) + return; + + /* + * use FORCEUNLOCK flag, so we can unlock even the lock is on the + * waiting or convert queue + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK, + &res->lksb, res); + if (unlikely(ret != 0)) + pr_err("failed to unlock %s return %d\n", res->name, ret); + else + wait_event(res->sync_locking, res->sync_locking_done); + + kfree(res->name); + kfree(res->lksb.sb_lvbptr); + kfree(res); +} + +static void add_resync_info(struct dlm_lock_resource *lockres, + sector_t lo, sector_t hi) +{ + struct resync_info *ri; + + ri = (struct resync_info *)lockres->lksb.sb_lvbptr; + ri->lo = cpu_to_le64(lo); + ri->hi = cpu_to_le64(hi); +} + +static int read_resync_info(struct mddev *mddev, + struct dlm_lock_resource *lockres) +{ + struct resync_info ri; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; + + dlm_lock_sync(lockres, DLM_LOCK_CR); + memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) > 0) { + cinfo->suspend_hi = le64_to_cpu(ri.hi); + cinfo->suspend_lo = le64_to_cpu(ri.lo); + ret = 1; + } + dlm_unlock_sync(lockres); + return ret; +} + +static void recover_bitmaps(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct md_cluster_info *cinfo = mddev->cluster_info; + struct dlm_lock_resource *bm_lockres; + char str[64]; + int slot, ret; + sector_t lo, hi; + + while (cinfo->recovery_map) { + slot = fls64((u64)cinfo->recovery_map) - 1; + + snprintf(str, 64, "bitmap%04d", slot); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("md-cluster: Cannot initialize bitmaps\n"); + goto clear_bit; + } + + ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev); + if (ret) { + pr_err("md-cluster: Could not DLM lock %s: %d\n", + str, ret); + goto clear_bit; + } + ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + if (ret) { + pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); + goto clear_bit; + } + + /* Clear suspend_area associated with the bitmap */ + spin_lock_irq(&cinfo->suspend_lock); + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; + cinfo->suspend_from = -1; + spin_unlock_irq(&cinfo->suspend_lock); + + /* Kick off a reshape if needed */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->reshape_position != MaxSector) + md_wakeup_thread(mddev->sync_thread); + + if (hi > 0) { + if (lo < mddev->recovery_cp) + mddev->recovery_cp = lo; + /* wake up thread to continue resync in case resync + * is not finished */ + if (mddev->recovery_cp != MaxSector) { + /* + * clear the REMOTE flag since we will launch + * resync thread in current node. + */ + clear_bit(MD_RESYNCING_REMOTE, + &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + } +clear_bit: + lockres_free(bm_lockres); + clear_bit(slot, &cinfo->recovery_map); + } +} + +static void recover_prep(void *arg) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); +} + +static void __recover_slot(struct mddev *mddev, int slot) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + set_bit(slot, &cinfo->recovery_map); + if (!cinfo->recovery_thread) { + cinfo->recovery_thread = md_register_thread(recover_bitmaps, + mddev, "recover"); + if (!cinfo->recovery_thread) { + pr_warn("md-cluster: Could not create recovery thread\n"); + return; + } + } + md_wakeup_thread(cinfo->recovery_thread); +} + +static void recover_slot(void *arg, struct dlm_slot *slot) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", + mddev->bitmap_info.cluster_name, + slot->nodeid, slot->slot, + cinfo->slot_number); + /* deduct one since dlm slot starts from one while the num of + * cluster-md begins with 0 */ + __recover_slot(mddev, slot->slot - 1); +} + +static void recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->slot_number = our_slot; + /* completion is only need to be complete when node join cluster, + * it doesn't need to run during another node's failure */ + if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) { + complete(&cinfo->completion); + clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + } + clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); +} + +/* the ops is called when node join the cluster, and do lock recovery + * if node failure occurs */ +static const struct dlm_lockspace_ops md_ls_ops = { + .recover_prep = recover_prep, + .recover_slot = recover_slot, + .recover_done = recover_done, +}; + +/* + * The BAST function for the ack lock resource + * This function wakes up the receive thread in + * order to receive and process the message. + */ +static void ack_bast(void *arg, int mode) +{ + struct dlm_lock_resource *res = arg; + struct md_cluster_info *cinfo = res->mddev->cluster_info; + + if (mode == DLM_LOCK_EX) { + if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); + else + set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); + } +} + +static void remove_suspend_info(struct mddev *mddev, int slot) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); + spin_lock_irq(&cinfo->suspend_lock); + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; + spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 0); +} + +static void process_suspend_info(struct mddev *mddev, + int slot, sector_t lo, sector_t hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct mdp_superblock_1 *sb = NULL; + struct md_rdev *rdev; + + if (!hi) { + /* + * clear the REMOTE flag since resync or recovery is finished + * in remote node. + */ + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); + remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + return; + } + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + sb = page_address(rdev->sb_page); + break; + } + + /* + * The bitmaps are not same for different nodes + * if RESYNCING is happening in one node, then + * the node which received the RESYNCING message + * probably will perform resync with the region + * [lo, hi] again, so we could reduce resync time + * a lot if we can ensure that the bitmaps among + * different nodes are match up well. + * + * sync_low/hi is used to record the region which + * arrived in the previous RESYNCING message, + * + * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK + * and set RESYNC_MASK since resync thread is running + * in another node, so we don't need to do the resync + * again with the same section. + * + * Skip md_bitmap_sync_with_cluster in case reshape + * happening, because reshaping region is small and + * we don't want to trigger lots of WARN. + */ + if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) + md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); + cinfo->sync_low = lo; + cinfo->sync_hi = hi; + + mddev->pers->quiesce(mddev, 1); + spin_lock_irq(&cinfo->suspend_lock); + cinfo->suspend_from = slot; + cinfo->suspend_lo = lo; + cinfo->suspend_hi = hi; + spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 0); +} + +static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +{ + char disk_uuid[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + char event_name[] = "EVENT=ADD_DEVICE"; + char raid_slot[16]; + char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; + int len; + + len = snprintf(disk_uuid, 64, "DEVICE_UUID="); + sprintf(disk_uuid + len, "%pU", cmsg->uuid); + snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); + pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); + init_completion(&cinfo->newdisk_completion); + set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); + wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT); + clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); +} + + +static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) +{ + int got_lock = 0; + struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->good_device_nr = le32_to_cpu(msg->raid_slot); + + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + wait_event(mddev->thread->wqueue, + (got_lock = mddev_trylock(mddev)) || + test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); + md_reload_sb(mddev, mddev->good_device_nr); + if (got_lock) + mddev_unlock(mddev); +} + +static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_rdev *rdev; + + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); + if (rdev) { + set_bit(ClusterRemove, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + else + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); +} + +static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_rdev *rdev; + + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); + if (rdev && test_bit(Faulty, &rdev->flags)) + clear_bit(Faulty, &rdev->flags); + else + pr_warn("%s: %d Could not find disk(%d) which is faulty", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); +} + +static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) +{ + int ret = 0; + + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), + "node %d received its own msg\n", le32_to_cpu(msg->slot))) + return -1; + switch (le32_to_cpu(msg->type)) { + case METADATA_UPDATED: + process_metadata_update(mddev, msg); + break; + case CHANGE_CAPACITY: + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + break; + case RESYNCING: + set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); + process_suspend_info(mddev, le32_to_cpu(msg->slot), + le64_to_cpu(msg->low), + le64_to_cpu(msg->high)); + break; + case NEWDISK: + process_add_new_disk(mddev, msg); + break; + case REMOVE: + process_remove_disk(mddev, msg); + break; + case RE_ADD: + process_readd_disk(mddev, msg); + break; + case BITMAP_NEEDS_SYNC: + __recover_slot(mddev, le32_to_cpu(msg->slot)); + break; + case BITMAP_RESIZE: + if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) + ret = md_bitmap_resize(mddev->bitmap, + le64_to_cpu(msg->high), 0, 0); + break; + default: + ret = -1; + pr_warn("%s:%d Received unknown message from %d\n", + __func__, __LINE__, msg->slot); + } + return ret; +} + +/* + * thread for receiving message + */ +static void recv_daemon(struct md_thread *thread) +{ + struct md_cluster_info *cinfo = thread->mddev->cluster_info; + struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; + struct dlm_lock_resource *message_lockres = cinfo->message_lockres; + struct cluster_msg msg; + int ret; + + mutex_lock(&cinfo->recv_mutex); + /*get CR on Message*/ + if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { + pr_err("md/raid1:failed to get CR on MESSAGE\n"); + mutex_unlock(&cinfo->recv_mutex); + return; + } + + /* read lvb and wake up thread to process this message_lockres */ + memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); + ret = process_recvd_msg(thread->mddev, &msg); + if (ret) + goto out; + + /*release CR on ack_lockres*/ + ret = dlm_unlock_sync(ack_lockres); + if (unlikely(ret != 0)) + pr_info("unlock ack failed return %d\n", ret); + /*up-convert to PR on message_lockres*/ + ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR); + if (unlikely(ret != 0)) + pr_info("lock PR on msg failed return %d\n", ret); + /*get CR on ack_lockres again*/ + ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + if (unlikely(ret != 0)) + pr_info("lock CR on ack failed return %d\n", ret); +out: + /*release CR on message_lockres*/ + ret = dlm_unlock_sync(message_lockres); + if (unlikely(ret != 0)) + pr_info("unlock msg failed return %d\n", ret); + mutex_unlock(&cinfo->recv_mutex); +} + +/* lock_token() + * Takes the lock on the TOKEN lock resource so no other + * node can communicate while the operation is underway. + */ +static int lock_token(struct md_cluster_info *cinfo) +{ + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; + struct mddev *mddev = cinfo->mddev; + + /* + * If resync thread run after raid1d thread, then process_metadata_update + * could not continue if raid1d held reconfig_mutex (and raid1d is blocked + * since another node already got EX on Token and waiting the EX of Ack), + * so let resync wake up thread in case flag is set. + */ + if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state)) { + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state); + WARN_ON_ONCE(rv); + md_wakeup_thread(mddev->thread); + set_bit = 1; + } + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; +} + +static void unlock_comm(struct md_cluster_info *cinfo) +{ + WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); + mutex_unlock(&cinfo->recv_mutex); + dlm_unlock_sync(cinfo->token_lockres); + clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); + wake_up(&cinfo->wait); +} + +/* __sendmsg() + * This function performs the actual sending of the message. This function is + * usually called after performing the encompassing operation + * The function: + * 1. Grabs the message lockresource in EX mode + * 2. Copies the message to the message LVB + * 3. Downconverts message lockresource to CW + * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes + * and the other nodes read the message. The thread will wait here until all other + * nodes have released ack lock resource. + * 5. Downconvert ack lockresource to CR + */ +static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +{ + int error; + int slot = cinfo->slot_number - 1; + + cmsg->slot = cpu_to_le32(slot); + /*get EX on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); + goto failed_message; + } + + memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, + sizeof(struct cluster_msg)); + /*down-convert EX to CW on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW); + if (error) { + pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n", + error); + goto failed_ack; + } + + /*up-convert CR to EX on Ack*/ + error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", + error); + goto failed_ack; + } + + /*down-convert EX to CR on Ack*/ + error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); + if (error) { + pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", + error); + goto failed_ack; + } + +failed_ack: + error = dlm_unlock_sync(cinfo->message_lockres); + if (unlikely(error != 0)) { + pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n", + error); + /* in case the message can't be released due to some reason */ + goto failed_ack; + } +failed_message: + return error; +} + +static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, + bool mddev_locked) +{ + int ret; + + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } + return ret; +} + +static int gather_all_resync_info(struct mddev *mddev, int total_slots) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i, ret = 0; + struct dlm_lock_resource *bm_lockres; + char str[64]; + sector_t lo, hi; + + + for (i = 0; i < total_slots; i++) { + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) + return -ENOMEM; + if (i == (cinfo->slot_number - 1)) { + lockres_free(bm_lockres); + continue; + } + + bm_lockres->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (ret == -EAGAIN) { + if (read_resync_info(mddev, bm_lockres)) { + pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", + __func__, __LINE__, + (unsigned long long) cinfo->suspend_lo, + (unsigned long long) cinfo->suspend_hi, + i); + cinfo->suspend_from = i; + } + ret = 0; + lockres_free(bm_lockres); + continue; + } + if (ret) { + lockres_free(bm_lockres); + goto out; + } + + /* Read the disk bitmap sb and check if it needs recovery */ + ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + if (ret) { + pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); + lockres_free(bm_lockres); + continue; + } + if ((hi > 0) && (lo < mddev->recovery_cp)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + mddev->recovery_cp = lo; + md_check_recovery(mddev); + } + + lockres_free(bm_lockres); + } +out: + return ret; +} + +static int join(struct mddev *mddev, int nodes) +{ + struct md_cluster_info *cinfo; + int ret, ops_rv; + char str[64]; + + cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); + if (!cinfo) + return -ENOMEM; + + INIT_LIST_HEAD(&cinfo->suspend_list); + spin_lock_init(&cinfo->suspend_lock); + init_completion(&cinfo->completion); + set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + init_waitqueue_head(&cinfo->wait); + mutex_init(&cinfo->recv_mutex); + + mddev->cluster_info = cinfo; + cinfo->mddev = mddev; + + memset(str, 0, 64); + sprintf(str, "%pU", mddev->uuid); + ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, + 0, LVB_SIZE, &md_ls_ops, mddev, + &ops_rv, &cinfo->lockspace); + if (ret) + goto err; + wait_for_completion(&cinfo->completion); + if (nodes < cinfo->slot_number) { + pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).", + cinfo->slot_number, nodes); + ret = -ERANGE; + goto err; + } + /* Initiate the communication resources */ + ret = -ENOMEM; + cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + if (!cinfo->recv_thread) { + pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); + goto err; + } + cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1); + if (!cinfo->message_lockres) + goto err; + cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); + if (!cinfo->token_lockres) + goto err; + cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); + if (!cinfo->no_new_dev_lockres) + goto err; + + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (ret) { + ret = -EAGAIN; + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); + goto err; + } + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); + if (!cinfo->ack_lockres) { + ret = -ENOMEM; + goto err; + } + /* get sync CR lock on ACK. */ + if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) + pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", + ret); + dlm_unlock_sync(cinfo->token_lockres); + /* get sync CR lock on no-new-dev. */ + if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) + pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); + + + pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); + snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); + cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); + if (!cinfo->bitmap_lockres) { + ret = -ENOMEM; + goto err; + } + if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { + pr_err("Failed to get bitmap lock\n"); + ret = -EINVAL; + goto err; + } + + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + md_unregister_thread(&cinfo->recovery_thread); + md_unregister_thread(&cinfo->recv_thread); + lockres_free(cinfo->message_lockres); + lockres_free(cinfo->token_lockres); + lockres_free(cinfo->ack_lockres); + lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); + lockres_free(cinfo->bitmap_lockres); + if (cinfo->lockspace) + dlm_release_lockspace(cinfo->lockspace, 2); + mddev->cluster_info = NULL; + kfree(cinfo); + return ret; +} + +static void load_bitmaps(struct mddev *mddev, int total_slots) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + /* load all the node's bitmap info for resync */ + if (gather_all_resync_info(mddev, total_slots)) + pr_err("md-cluster: failed to gather all resyn infos\n"); + set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); + /* wake up recv thread in case something need to be handled */ + if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); +} + +static void resync_bitmap(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int err; + + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + err = sendmsg(cinfo, &cmsg, 1); + if (err) + pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", + __func__, __LINE__, err); +} + +static void unlock_all_bitmaps(struct mddev *mddev); +static int leave(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + if (!cinfo) + return 0; + + /* + * BITMAP_NEEDS_SYNC message should be sent when node + * is leaving the cluster with dirty bitmap, also we + * can only deliver it when dlm connection is available. + * + * Also, we should send BITMAP_NEEDS_SYNC message in + * case reshaping is interrupted. + */ + if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + (mddev->reshape_position != MaxSector && + test_bit(MD_CLOSING, &mddev->flags))) + resync_bitmap(mddev); + + set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + md_unregister_thread(&cinfo->recovery_thread); + md_unregister_thread(&cinfo->recv_thread); + lockres_free(cinfo->message_lockres); + lockres_free(cinfo->token_lockres); + lockres_free(cinfo->ack_lockres); + lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); + lockres_free(cinfo->bitmap_lockres); + unlock_all_bitmaps(mddev); + dlm_release_lockspace(cinfo->lockspace, 2); + kfree(cinfo); + return 0; +} + +/* slot_number(): Returns the MD slot number to use + * DLM starts the slot numbers from 1, wheras cluster-md + * wants the number to be from zero, so we deduct one + */ +static int slot_number(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + return cinfo->slot_number - 1; +} + +/* + * Check if the communication is already locked, else lock the communication + * channel. + * If it is already locked, token is in EX mode, and hence lock_token() + * should not be called. + */ +static int metadata_update_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret; + + /* + * metadata_update_start is always called with the protection of + * reconfig_mutex, so set WAITING_FOR_TOKEN here. + */ + ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state); + WARN_ON_ONCE(ret); + md_wakeup_thread(mddev->thread); + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || + test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); + + /* If token is already locked, return 0 */ + if (cinfo->token_lockres->mode == DLM_LOCK_EX) { + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return 0; + } + + ret = lock_token(cinfo); + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return ret; +} + +static int metadata_update_finish(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(METADATA_UPDATED); + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + ret = __sendmsg(cinfo, &cmsg); + } else + pr_warn("md-cluster: No good device id found to send\n"); + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + unlock_comm(cinfo); + return ret; +} + +static void metadata_update_cancel(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + unlock_comm(cinfo); +} + +static int update_bitmap_size(struct mddev *mddev, sector_t size) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int ret; + + cmsg.type = cpu_to_le32(BITMAP_RESIZE); + cmsg.high = cpu_to_le64(size); + ret = sendmsg(cinfo, &cmsg, 0); + if (ret) + pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n", + __func__, __LINE__, ret); + return ret; +} + +static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) +{ + struct bitmap_counts *counts; + char str[64]; + struct dlm_lock_resource *bm_lockres; + struct bitmap *bitmap = mddev->bitmap; + unsigned long my_pages = bitmap->counts.pages; + int i, rv; + + /* + * We need to ensure all the nodes can grow to a larger + * bitmap size before make the reshaping. + */ + rv = update_bitmap_size(mddev, newsize); + if (rv) + return rv; + + for (i = 0; i < mddev->bitmap_info.nodes; i++) { + if (i == md_cluster_ops->slot_number(mddev)) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + bitmap = NULL; + goto out; + } + counts = &bitmap->counts; + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the pages. + */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("Cannot initialize %s lock\n", str); + goto out; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + counts->pages = my_pages; + lockres_free(bm_lockres); + + if (my_pages != counts->pages) + /* + * Let's revert the bitmap size if one node + * can't resize bitmap + */ + goto out; + md_bitmap_free(bitmap); + } + + return 0; +out: + md_bitmap_free(bitmap); + update_bitmap_size(mddev, oldsize); + return -1; +} + +/* + * return 0 if all the bitmaps have the same sync_size + */ +static int cluster_check_sync_size(struct mddev *mddev) +{ + int i, rv; + bitmap_super_t *sb; + unsigned long my_sync_size, sync_size = 0; + int node_num = mddev->bitmap_info.nodes; + int current_slot = md_cluster_ops->slot_number(mddev); + struct bitmap *bitmap = mddev->bitmap; + char str[64]; + struct dlm_lock_resource *bm_lockres; + + sb = kmap_atomic(bitmap->storage.sb_page); + my_sync_size = sb->sync_size; + kunmap_atomic(sb); + + for (i = 0; i < node_num; i++) { + if (i == current_slot) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + return -1; + } + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the sb. + */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("md-cluster: Cannot initialize %s\n", str); + md_bitmap_free(bitmap); + return -1; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + md_bitmap_update_sb(bitmap); + lockres_free(bm_lockres); + + sb = kmap_atomic(bitmap->storage.sb_page); + if (sync_size == 0) + sync_size = sb->sync_size; + else if (sync_size != sb->sync_size) { + kunmap_atomic(sb); + md_bitmap_free(bitmap); + return -1; + } + kunmap_atomic(sb); + md_bitmap_free(bitmap); + } + + return (my_sync_size == sync_size) ? 0 : -1; +} + +/* + * Update the size for cluster raid is a little more complex, we perform it + * by the steps: + * 1. hold token lock and update superblock in initiator node. + * 2. send METADATA_UPDATED msg to other nodes. + * 3. The initiator node continues to check each bitmap's sync_size, if all + * bitmaps have the same value of sync_size, then we can set capacity and + * let other nodes to perform it. If one node can't update sync_size + * accordingly, we need to revert to previous value. + */ +static void update_size(struct mddev *mddev, sector_t old_dev_sectors) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; + + md_update_sb(mddev, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(METADATA_UPDATED); + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + /* + * We can only change capiticy after all the nodes can do it, + * so need to wait after other nodes already received the msg + * and handled the change + */ + ret = __sendmsg(cinfo, &cmsg); + if (ret) { + pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", + __func__, __LINE__); + unlock_comm(cinfo); + return; + } + } else { + pr_err("md-cluster: No good device id found to send\n"); + unlock_comm(cinfo); + return; + } + + /* + * check the sync_size from other node's bitmap, if sync_size + * have already updated in other nodes as expected, send an + * empty metadata msg to permit the change of capacity + */ + if (cluster_check_sync_size(mddev) == 0) { + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(CHANGE_CAPACITY); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", + __func__, __LINE__); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + } else { + /* revert to previous sectors */ + ret = mddev->pers->resize(mddev, old_dev_sectors); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", + __func__, __LINE__); + } + unlock_comm(cinfo); +} + +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); +} + +static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + spin_lock_irq(&cinfo->suspend_lock); + *lo = cinfo->suspend_lo; + *hi = cinfo->suspend_hi; + spin_unlock_irq(&cinfo->suspend_lock); +} + +static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct resync_info ri; + struct cluster_msg cmsg = {0}; + + /* do not send zero again, if we have sent before */ + if (hi == 0) { + memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) == 0) + return 0; + } + + add_resync_info(cinfo->bitmap_lockres, lo, hi); + /* Re-acquire the lock to refresh LVB */ + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); + cmsg.type = cpu_to_le32(RESYNCING); + cmsg.low = cpu_to_le64(lo); + cmsg.high = cpu_to_le64(hi); + + /* + * mddev_lock is held if resync_info_update is called from + * resync_finish (md_reap_sync_thread -> resync_finish) + */ + if (lo == 0 && hi == 0) + return sendmsg(cinfo, &cmsg, 1); + else + return sendmsg(cinfo, &cmsg, 0); +} + +static int resync_finish(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; + + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); + + /* + * If resync thread is interrupted so we can't say resync is finished, + * another node will launch resync thread to continue. + */ + if (!test_bit(MD_CLOSING, &mddev->flags)) + ret = resync_info_update(mddev, 0, 0); + dlm_unlock_sync(cinfo->resync_lockres); + return ret; +} + +static int area_resyncing(struct mddev *mddev, int direction, + sector_t lo, sector_t hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; + + if ((direction == READ) && + test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) + return 1; + + spin_lock_irq(&cinfo->suspend_lock); + if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi) + ret = 1; + spin_unlock_irq(&cinfo->suspend_lock); + return ret; +} + +/* add_new_disk() - initiates a disk add + * However, if this fails before writing md_update_sb(), + * add_new_disk_cancel() must be called to release token lock + */ +static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + int ret = 0; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + char *uuid = sb->device_uuid; + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(NEWDISK); + memcpy(cmsg.uuid, uuid, 16); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); + if (lock_comm(cinfo, 1)) + return -EAGAIN; + ret = __sendmsg(cinfo, &cmsg); + if (ret) { + unlock_comm(cinfo); + return ret; + } + cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); + cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; + /* Some node does not "see" the device */ + if (ret == -EAGAIN) + ret = -ENOENT; + if (ret) + unlock_comm(cinfo); + else { + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which + * will run soon after add_new_disk, the below path will be + * invoked: + * md_wakeup_thread(mddev->thread) + * -> conf->thread (raid1d) + * -> md_check_recovery -> md_update_sb + * -> metadata_update_start/finish + * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. + * + * For other failure cases, metadata_update_cancel and + * add_new_disk_cancel also clear below bit as well. + * */ + set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + wake_up(&cinfo->wait); + } + return ret; +} + +static void add_new_disk_cancel(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + unlock_comm(cinfo); +} + +static int new_disk_ack(struct mddev *mddev, bool ack) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { + pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); + return -EINVAL; + } + + if (ack) + dlm_unlock_sync(cinfo->no_new_dev_lockres); + complete(&cinfo->newdisk_completion); + return 0; +} + +static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct cluster_msg cmsg = {0}; + struct md_cluster_info *cinfo = mddev->cluster_info; + cmsg.type = cpu_to_le32(REMOVE); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); + return sendmsg(cinfo, &cmsg, 1); +} + +static int lock_all_bitmaps(struct mddev *mddev) +{ + int slot, my_slot, ret, held = 1, i = 0; + char str[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->other_bitmap_lockres = + kcalloc(mddev->bitmap_info.nodes - 1, + sizeof(struct dlm_lock_resource *), GFP_KERNEL); + if (!cinfo->other_bitmap_lockres) { + pr_err("md: can't alloc mem for other bitmap locks\n"); + return 0; + } + + my_slot = slot_number(mddev); + for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { + if (slot == my_slot) + continue; + + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", slot); + cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); + if (!cinfo->other_bitmap_lockres[i]) + return -ENOMEM; + + cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); + if (ret) + held = -1; + i++; + } + + return held; +} + +static void unlock_all_bitmaps(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i; + + /* release other node's bitmap lock if they are existed */ + if (cinfo->other_bitmap_lockres) { + for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { + if (cinfo->other_bitmap_lockres[i]) { + lockres_free(cinfo->other_bitmap_lockres[i]); + } + } + kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; + } +} + +static int gather_bitmaps(struct md_rdev *rdev) +{ + int sn, err; + sector_t lo, hi; + struct cluster_msg cmsg = {0}; + struct mddev *mddev = rdev->mddev; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cmsg.type = cpu_to_le32(RE_ADD); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); + err = sendmsg(cinfo, &cmsg, 1); + if (err) + goto out; + + for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { + if (sn == (cinfo->slot_number - 1)) + continue; + err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + if (err) { + pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); + goto out; + } + if ((hi > 0) && (lo < mddev->recovery_cp)) + mddev->recovery_cp = lo; + } +out: + return err; +} + +static struct md_cluster_operations cluster_ops = { + .join = join, + .leave = leave, + .slot_number = slot_number, + .resync_start = resync_start, + .resync_finish = resync_finish, + .resync_info_update = resync_info_update, + .resync_info_get = resync_info_get, + .metadata_update_start = metadata_update_start, + .metadata_update_finish = metadata_update_finish, + .metadata_update_cancel = metadata_update_cancel, + .area_resyncing = area_resyncing, + .add_new_disk = add_new_disk, + .add_new_disk_cancel = add_new_disk_cancel, + .new_disk_ack = new_disk_ack, + .remove_disk = remove_disk, + .load_bitmaps = load_bitmaps, + .gather_bitmaps = gather_bitmaps, + .resize_bitmaps = resize_bitmaps, + .lock_all_bitmaps = lock_all_bitmaps, + .unlock_all_bitmaps = unlock_all_bitmaps, + .update_size = update_size, +}; + +static int __init cluster_init(void) +{ + pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); + pr_info("Registering Cluster MD functions\n"); + register_md_cluster_operations(&cluster_ops, THIS_MODULE); + return 0; +} + +static void cluster_exit(void) +{ + unregister_md_cluster_operations(); +} + +module_init(cluster_init); +module_exit(cluster_exit); +MODULE_AUTHOR("SUSE"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h new file mode 100644 index 000000000..a78e30217 --- /dev/null +++ b/drivers/md/md-cluster.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + +#ifndef _MD_CLUSTER_H +#define _MD_CLUSTER_H + +#include "md.h" + +struct mddev; +struct md_rdev; + +struct md_cluster_operations { + int (*join)(struct mddev *mddev, int nodes); + int (*leave)(struct mddev *mddev); + int (*slot_number)(struct mddev *mddev); + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); + int (*metadata_update_start)(struct mddev *mddev); + int (*metadata_update_finish)(struct mddev *mddev); + void (*metadata_update_cancel)(struct mddev *mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); + int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); + int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*add_new_disk_cancel)(struct mddev *mddev); + int (*new_disk_ack)(struct mddev *mddev, bool ack); + int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*load_bitmaps)(struct mddev *mddev, int total_slots); + int (*gather_bitmaps)(struct md_rdev *rdev); + int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize); + int (*lock_all_bitmaps)(struct mddev *mddev); + void (*unlock_all_bitmaps)(struct mddev *mddev); + void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); +}; + +#endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c new file mode 100644 index 000000000..50ad81897 --- /dev/null +++ b/drivers/md/md-faulty.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * faulty.c : Multiple Devices driver for Linux + * + * Copyright (C) 2004 Neil Brown + * + * fautly-device-simulator personality for md + */ + + +/* + * The "faulty" personality causes some requests to fail. + * + * Possible failure modes are: + * reads fail "randomly" but succeed on retry + * writes fail "randomly" but succeed on retry + * reads for some address fail and then persist until a write + * reads for some address fail and then persist irrespective of write + * writes for some address fail and persist + * all writes fail + * + * Different modes can be active at a time, but only + * one can be set at array creation. Others can be added later. + * A mode can be one-shot or recurrent with the recurrence being + * once in every N requests. + * The bottom 5 bits of the "layout" indicate the mode. The + * remainder indicate a period, or 0 for one-shot. + * + * There is an implementation limit on the number of concurrently + * persisting-faulty blocks. When a new fault is requested that would + * exceed the limit, it is ignored. + * All current faults can be clear using a layout of "0". + * + * Requests are always sent to the device. If they are to fail, + * we clone the bio and insert a new b_end_io into the chain. + */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#define MaxFault 50 +#include +#include +#include +#include +#include "md.h" +#include + + +static void faulty_fail(struct bio *bio) +{ + struct bio *b = bio->bi_private; + + b->bi_iter.bi_size = bio->bi_iter.bi_size; + b->bi_iter.bi_sector = bio->bi_iter.bi_sector; + + bio_put(bio); + + bio_io_error(b); +} + +struct faulty_conf { + int period[Modes]; + atomic_t counters[Modes]; + sector_t faults[MaxFault]; + int modes[MaxFault]; + int nfaults; + struct md_rdev *rdev; +}; + +static int check_mode(struct faulty_conf *conf, int mode) +{ + if (conf->period[mode] == 0 && + atomic_read(&conf->counters[mode]) <= 0) + return 0; /* no failure, no decrement */ + + + if (atomic_dec_and_test(&conf->counters[mode])) { + if (conf->period[mode]) + atomic_set(&conf->counters[mode], conf->period[mode]); + return 1; + } + return 0; +} + +static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) +{ + /* If we find a ReadFixable sector, we fix it ... */ + int i; + for (i=0; infaults; i++) + if (conf->faults[i] >= start && + conf->faults[i] < end) { + /* found it ... */ + switch (conf->modes[i] * 2 + dir) { + case WritePersistent*2+WRITE: return 1; + case ReadPersistent*2+READ: return 1; + case ReadFixable*2+READ: return 1; + case ReadFixable*2+WRITE: + conf->modes[i] = NoPersist; + return 0; + case AllPersist*2+READ: + case AllPersist*2+WRITE: return 1; + default: + return 0; + } + } + return 0; +} + +static void add_sector(struct faulty_conf *conf, sector_t start, int mode) +{ + int i; + int n = conf->nfaults; + for (i=0; infaults; i++) + if (conf->faults[i] == start) { + switch(mode) { + case NoPersist: conf->modes[i] = mode; return; + case WritePersistent: + if (conf->modes[i] == ReadPersistent || + conf->modes[i] == ReadFixable) + conf->modes[i] = AllPersist; + else + conf->modes[i] = WritePersistent; + return; + case ReadPersistent: + if (conf->modes[i] == WritePersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadPersistent; + return; + case ReadFixable: + if (conf->modes[i] == WritePersistent || + conf->modes[i] == ReadPersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadFixable; + return; + } + } else if (conf->modes[i] == NoPersist) + n = i; + + if (n >= MaxFault) + return; + conf->faults[n] = start; + conf->modes[n] = mode; + if (conf->nfaults == n) + conf->nfaults = n+1; +} + +static bool faulty_make_request(struct mddev *mddev, struct bio *bio) +{ + struct faulty_conf *conf = mddev->private; + int failit = 0; + + if (bio_data_dir(bio) == WRITE) { + /* write request */ + if (atomic_read(&conf->counters[WriteAll])) { + /* special case - don't decrement, don't submit_bio_noacct, + * just fail immediately + */ + bio_io_error(bio); + return true; + } + + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), WRITE)) + failit = 1; + if (check_mode(conf, WritePersistent)) { + add_sector(conf, bio->bi_iter.bi_sector, + WritePersistent); + failit = 1; + } + if (check_mode(conf, WriteTransient)) + failit = 1; + } else { + /* read request */ + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), READ)) + failit = 1; + if (check_mode(conf, ReadTransient)) + failit = 1; + if (check_mode(conf, ReadPersistent)) { + add_sector(conf, bio->bi_iter.bi_sector, + ReadPersistent); + failit = 1; + } + if (check_mode(conf, ReadFixable)) { + add_sector(conf, bio->bi_iter.bi_sector, + ReadFixable); + failit = 1; + } + } + if (failit) { + struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); + + b->bi_private = bio; + b->bi_end_io = faulty_fail; + bio = b; + } else + bio_set_dev(bio, conf->rdev->bdev); + + submit_bio_noacct(bio); + return true; +} + +static void faulty_status(struct seq_file *seq, struct mddev *mddev) +{ + struct faulty_conf *conf = mddev->private; + int n; + + if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) + seq_printf(seq, " WriteTransient=%d(%d)", + n, conf->period[WriteTransient]); + + if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) + seq_printf(seq, " ReadTransient=%d(%d)", + n, conf->period[ReadTransient]); + + if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) + seq_printf(seq, " WritePersistent=%d(%d)", + n, conf->period[WritePersistent]); + + if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) + seq_printf(seq, " ReadPersistent=%d(%d)", + n, conf->period[ReadPersistent]); + + + if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) + seq_printf(seq, " ReadFixable=%d(%d)", + n, conf->period[ReadFixable]); + + if ((n=atomic_read(&conf->counters[WriteAll])) != 0) + seq_printf(seq, " WriteAll"); + + seq_printf(seq, " nfaults=%d", conf->nfaults); +} + + +static int faulty_reshape(struct mddev *mddev) +{ + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; + struct faulty_conf *conf = mddev->private; + + if (mddev->new_layout < 0) + return 0; + + /* new layout */ + if (mode == ClearFaults) + conf->nfaults = 0; + else if (mode == ClearErrors) { + int i; + for (i=0 ; i < Modes ; i++) { + conf->period[i] = 0; + atomic_set(&conf->counters[i], 0); + } + } else if (mode < Modes) { + conf->period[mode] = count; + if (!count) count++; + atomic_set(&conf->counters[mode], count); + } else + return -EINVAL; + mddev->new_layout = -1; + mddev->layout = -1; /* makes sure further changes come through */ + return 0; +} + +static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + +static int faulty_run(struct mddev *mddev) +{ + struct md_rdev *rdev; + int i; + struct faulty_conf *conf; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + return -ENOMEM; + + for (i=0; icounters[i], 0); + conf->period[i] = 0; + } + conf->nfaults = 0; + + rdev_for_each(rdev, mddev) { + conf->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); + mddev->private = conf; + + faulty_reshape(mddev); + + return 0; +} + +static void faulty_free(struct mddev *mddev, void *priv) +{ + struct faulty_conf *conf = priv; + + kfree(conf); +} + +static struct md_personality faulty_personality = +{ + .name = "faulty", + .level = LEVEL_FAULTY, + .owner = THIS_MODULE, + .make_request = faulty_make_request, + .run = faulty_run, + .free = faulty_free, + .status = faulty_status, + .check_reshape = faulty_reshape, + .size = faulty_size, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&faulty_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&faulty_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)"); +MODULE_ALIAS("md-personality-10"); /* faulty */ +MODULE_ALIAS("md-faulty"); +MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c new file mode 100644 index 000000000..4eb72b9dd --- /dev/null +++ b/drivers/md/md-linear.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + linear.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + or + + + Linear mode management functions. + +*/ + +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "md-linear.h" + +/* + * find which device holds a particular offset + */ +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) +{ + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = mddev->private; + + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; +} + +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + sector_t array_sectors; + + conf = mddev->private; + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + + return array_sectors; +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) +{ + struct linear_conf *conf; + struct md_rdev *rdev; + int i, cnt; + + conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); + if (!conf) + return NULL; + + cnt = 0; + conf->array_sectors = 0; + + rdev_for_each(rdev, mddev) { + int j = rdev->raid_disk; + struct dev_info *disk = conf->disks + j; + sector_t sectors; + + if (j < 0 || j >= raid_disks || disk->rdev) { + pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); + goto out; + } + + disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + conf->array_sectors += rdev->sectors; + cnt++; + } + if (cnt != raid_disks) { + pr_warn("md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); + goto out; + } + + /* + * Here we calculate the device offsets. + */ + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + + for (i = 1; i < raid_disks; i++) + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; + + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + + return conf; + +out: + kfree(conf); + return NULL; +} + +static int linear_run (struct mddev *mddev) +{ + struct linear_conf *conf; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + conf = linear_conf(mddev, mddev->raid_disks); + + if (!conf) + return 1; + mddev->private = conf; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; +} + +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. + */ + struct linear_conf *newconf, *oldconf; + + if (rdev->saved_raid_disk != mddev->raid_disks) + return -EINVAL; + + rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; + + newconf = linear_conf(mddev,mddev->raid_disks+1); + + if (!newconf) + return -ENOMEM; + + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ + mddev_suspend(mddev); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); + mddev->raid_disks++; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); + kfree_rcu(oldconf, rcu); + return 0; +} + +static void linear_free(struct mddev *mddev, void *priv) +{ + struct linear_conf *conf = priv; + + kfree(conf); +} + +static bool linear_make_request(struct mddev *mddev, struct bio *bio) +{ + struct dev_info *tmp_dev; + sector_t start_sector, end_sector, data_offset; + sector_t bio_sector = bio->bi_iter.bi_sector; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + tmp_dev = which_dev(mddev, bio_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(is_rdev_broken(tmp_dev->rdev))) { + md_error(mddev, tmp_dev->rdev); + bio_io_error(bio); + return true; + } + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to split it */ + struct bio *split = bio_split(bio, end_sector - bio_sector, + GFP_NOIO, &mddev->bio_set); + bio_chain(split, bio); + submit_bio_noacct(bio); + bio = split; + } + + bio_set_dev(bio, tmp_dev->rdev->bdev); + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !bdev_max_discard_sectors(bio->bi_bdev))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_write_zeroes(mddev, bio); + submit_bio_noacct(bio); + } + return true; + +out_of_bounds: + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + tmp_dev->rdev->bdev, + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); + return true; +} + +static void linear_status (struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); +} + +static void linear_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + +static struct md_personality linear_personality = +{ + .name = "linear", + .level = LEVEL_LINEAR, + .owner = THIS_MODULE, + .make_request = linear_make_request, + .run = linear_run, + .free = linear_free, + .status = linear_status, + .hot_add_disk = linear_add, + .size = linear_size, + .quiesce = linear_quiesce, + .error_handler = linear_error, +}; + +static int __init linear_init (void) +{ + return register_md_personality (&linear_personality); +} + +static void linear_exit (void) +{ + unregister_md_personality (&linear_personality); +} + +module_init(linear_init); +module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); +MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h new file mode 100644 index 000000000..24e97db50 --- /dev/null +++ b/drivers/md/md-linear.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf +{ + struct rcu_head rcu; + sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ + struct dev_info disks[]; +}; +#endif diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c new file mode 100644 index 000000000..66edf5e72 --- /dev/null +++ b/drivers/md/md-multipath.c @@ -0,0 +1,470 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * multipath.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * MULTIPATH management functions. + * + * derived from raid1.c. + */ + +#include +#include +#include +#include +#include +#include "md.h" +#include "md-multipath.h" + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + +static int multipath_map (struct mpconf *conf) +{ + int i, disks = conf->raid_disks; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. + */ + + rcu_read_lock(); + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return i; + } + } + rcu_read_unlock(); + + pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); + return (-1); +} + +static void multipath_reschedule_retry (struct multipath_bh *mp_bh) +{ + unsigned long flags; + struct mddev *mddev = mp_bh->mddev; + struct mpconf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&mp_bh->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(mddev->thread); +} + +/* + * multipath_end_bh_io() is called when we have finished servicing a multipathed + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) +{ + struct bio *bio = mp_bh->master_bio; + struct mpconf *conf = mp_bh->mddev->private; + + bio->bi_status = status; + bio_endio(bio); + mempool_free(mp_bh, &conf->pool); +} + +static void multipath_end_request(struct bio *bio) +{ + struct multipath_bh *mp_bh = bio->bi_private; + struct mpconf *conf = mp_bh->mddev->private; + struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; + + if (!bio->bi_status) + multipath_end_bh_io(mp_bh, 0); + else if (!(bio->bi_opf & REQ_RAHEAD)) { + /* + * oops, IO error: + */ + md_error (mp_bh->mddev, rdev); + pr_info("multipath: %pg: rescheduling sector %llu\n", + rdev->bdev, + (unsigned long long)bio->bi_iter.bi_sector); + multipath_reschedule_retry(mp_bh); + } else + multipath_end_bh_io(mp_bh, bio->bi_status); + rdev_dec_pending(rdev, conf->mddev); +} + +static bool multipath_make_request(struct mddev *mddev, struct bio * bio) +{ + struct mpconf *conf = mddev->private; + struct multipath_bh * mp_bh; + struct multipath_info *multipath; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); + + mp_bh->master_bio = bio; + mp_bh->mddev = mddev; + + mp_bh->path = multipath_map(conf); + if (mp_bh->path < 0) { + bio_io_error(bio); + mempool_free(mp_bh, &conf->pool); + return true; + } + multipath = conf->multipaths + mp_bh->path; + + bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO); + + mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; + mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; + mp_bh->bio.bi_end_io = multipath_end_request; + mp_bh->bio.bi_private = mp_bh; + mddev_check_write_zeroes(mddev, &mp_bh->bio); + submit_bio_noacct(&mp_bh->bio); + return true; +} + +static void multipath_status(struct seq_file *seq, struct mddev *mddev) +{ + struct mpconf *conf = mddev->private; + int i; + + seq_printf (seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_putc(seq, ']'); +} + +/* + * Careful, this can execute in IRQ contexts as well! + */ +static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + + if (conf->raid_disks - mddev->degraded <= 1) { + /* + * Uh oh, we can do nothing if this is our last path, but + * first check if this is a queued request for a device + * which has just failed. + */ + pr_warn("multipath: only one IO path left and IO error.\n"); + /* leave it active... it's all we have */ + return; + } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + set_bit(Faulty, &rdev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("multipath: IO failure on %pg, disabling IO path.\n" + "multipath: Operation continuing on %d IO paths.\n", + rdev->bdev, + conf->raid_disks - mddev->degraded); +} + +static void print_multipath_conf (struct mpconf *conf) +{ + int i; + struct multipath_info *tmp; + + pr_debug("MULTIPATH conf printout:\n"); + if (!conf) { + pr_debug("(conf==NULL)\n"); + return; + } + pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->multipaths + i; + if (tmp->rdev) + pr_debug(" disk%d, o:%d, dev:%pg\n", + i,!test_bit(Faulty, &tmp->rdev->flags), + tmp->rdev->bdev); + } +} + +static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + int err = -EEXIST; + int path; + struct multipath_info *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + print_multipath_conf(conf); + + for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; + spin_lock_irq(&conf->device_lock); + mddev->degraded--; + rdev->raid_disk = path; + set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); + rcu_assign_pointer(p->rdev, rdev); + err = 0; + break; + } + + print_multipath_conf(conf); + + return err; +} + +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct multipath_info *p = conf->multipaths + number; + + print_multipath_conf(conf); + + if (rdev == p->rdev) { + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + err = md_integrity_register(mddev); + } +abort: + + print_multipath_conf(conf); + return err; +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working multipaths. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. + */ + +static void multipathd(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct multipath_bh *mp_bh; + struct bio *bio; + unsigned long flags; + struct mpconf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + + md_check_recovery(mddev); + for (;;) { + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) + break; + mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); + list_del(head->prev); + spin_unlock_irqrestore(&conf->device_lock, flags); + + bio = &mp_bh->bio; + bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; + + if ((mp_bh->path = multipath_map (conf))<0) { + pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n", + bio->bi_bdev, + (unsigned long long)bio->bi_iter.bi_sector); + multipath_end_bh_io(mp_bh, BLK_STS_IOERR); + } else { + pr_err("multipath: %pg: redirecting sector %llu to another IO path\n", + bio->bi_bdev, + (unsigned long long)bio->bi_iter.bi_sector); + *bio = *(mp_bh->master_bio); + bio->bi_iter.bi_sector += + conf->multipaths[mp_bh->path].rdev->data_offset; + bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; + bio->bi_end_io = multipath_end_request; + bio->bi_private = mp_bh; + submit_bio_noacct(bio); + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + +static int multipath_run (struct mddev *mddev) +{ + struct mpconf *conf; + int disk_idx; + struct multipath_info *disk; + struct md_rdev *rdev; + int working_disks; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + if (mddev->level != LEVEL_MULTIPATH) { + pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", + mdname(mddev), mddev->level); + goto out; + } + /* + * copy the already verified devices into our private MULTIPATH + * bookkeeping area. [whatever we allocate in multipath_run(), + * should be freed in multipath_free()] + */ + + conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); + mddev->private = conf; + if (!conf) + goto out; + + conf->multipaths = kcalloc(mddev->raid_disks, + sizeof(struct multipath_info), + GFP_KERNEL); + if (!conf->multipaths) + goto out_free_conf; + + working_disks = 0; + rdev_for_each(rdev, mddev) { + disk_idx = rdev->raid_disk; + if (disk_idx < 0 || + disk_idx >= mddev->raid_disks) + continue; + + disk = conf->multipaths + disk_idx; + disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + if (!test_bit(Faulty, &rdev->flags)) + working_disks++; + } + + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + if (!working_disks) { + pr_warn("multipath: no operational IO paths for %s\n", + mdname(mddev)); + goto out_free_conf; + } + mddev->degraded = conf->raid_disks - working_disks; + + ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS, + sizeof(struct multipath_bh)); + if (ret) + goto out_free_conf; + + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); + if (!mddev->thread) + goto out_free_conf; + + pr_info("multipath: array %s active with %d out of %d IO paths\n", + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); + /* + * Ok, everything is just fine now + */ + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); + + if (md_integrity_register(mddev)) + goto out_free_conf; + + return 0; + +out_free_conf: + mempool_exit(&conf->pool); + kfree(conf->multipaths); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static void multipath_free(struct mddev *mddev, void *priv) +{ + struct mpconf *conf = priv; + + mempool_exit(&conf->pool); + kfree(conf->multipaths); + kfree(conf); +} + +static struct md_personality multipath_personality = +{ + .name = "multipath", + .level = LEVEL_MULTIPATH, + .owner = THIS_MODULE, + .make_request = multipath_make_request, + .run = multipath_run, + .free = multipath_free, + .status = multipath_status, + .error_handler = multipath_error, + .hot_add_disk = multipath_add_disk, + .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, +}; + +static int __init multipath_init (void) +{ + return register_md_personality (&multipath_personality); +} + +static void __exit multipath_exit (void) +{ + unregister_md_personality (&multipath_personality); +} + +module_init(multipath_init); +module_exit(multipath_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)"); +MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ +MODULE_ALIAS("md-multipath"); +MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h new file mode 100644 index 000000000..b3099e5fc --- /dev/null +++ b/drivers/md/md-multipath.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +struct multipath_info { + struct md_rdev *rdev; +}; + +struct mpconf { + struct mddev *mddev; + struct multipath_info *multipaths; + int raid_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t pool; +}; + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + struct mddev *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 000000000..6120f26a7 --- /dev/null +++ b/drivers/md/md.c @@ -0,0 +1,10020 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - RAID-6 extensions by H. Peter Anvin + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + - persistent bitmap code + Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + + + Errors, Warnings, etc. + Please use: + pr_crit() for error conditions that risk data loss + pr_err() for error conditions that are unexpected, like an IO error + or internal inconsistency + pr_warn() for error conditions that could have been predicated, like + adding a device to an array when it has incompatible metadata + pr_info() for every interesting, very rare events, like an array starting + or stopping, or resync starting or stopping + pr_debug() for everything else. + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "md.h" +#include "md-bitmap.h" +#include "md-cluster.h" + +/* pers_list is a list of registered personalities protected + * by pers_lock. + * pers_lock does extra service to protect accesses to + * mddev->thread when the mutex cannot be held. + */ +static LIST_HEAD(pers_list); +static DEFINE_SPINLOCK(pers_lock); + +static struct kobj_type md_ktype; + +struct md_cluster_operations *md_cluster_ops; +EXPORT_SYMBOL(md_cluster_ops); +static struct module *md_cluster_mod; + +static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; +static struct workqueue_struct *md_rdev_misc_wq; + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); +static void mddev_detach(struct mddev *mddev); + +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +/* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + * or /sys/block/mdX/md/sync_speed_{min,max} + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; +static inline int speed_min(struct mddev *mddev) +{ + return mddev->sync_speed_min ? + mddev->sync_speed_min : sysctl_speed_limit_min; +} + +static inline int speed_max(struct mddev *mddev) +{ + return mddev->sync_speed_max ? + mddev->sync_speed_max : sysctl_speed_limit_max; +} + +static void rdev_uninit_serial(struct md_rdev *rdev) +{ + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; + + kvfree(rdev->serial); + rdev->serial = NULL; +} + +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + +static int rdev_init_serial(struct md_rdev *rdev) +{ + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) + return 0; + + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); + if (!serial) + return -ENOMEM; + + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + + rdev->serial = serial; + set_bit(CollisionCheck, &rdev->flags); + + return 0; +} + +static int rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + int ret = 0; + + rdev_for_each(rdev, mddev) { + ret = rdev_init_serial(rdev); + if (ret) + break; + } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; +} + +/* + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. + */ +static int rdev_need_serial(struct md_rdev *rdev) +{ + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs. + */ +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + int ret = 0; + + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) + return; + + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + ret = rdevs_init_serial(mddev); + else + ret = rdev_init_serial(rdev); + if (ret) + goto abort; + + if (mddev->serial_info_pool == NULL) { + /* + * already in memalloc noio context by + * mddev_suspend() + */ + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); + pr_err("can't alloc memory pool for serialization\n"); + } + } + +abort: + if (!is_suspend) + mddev_resume(mddev); +} + +/* + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. + */ +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) + return; + + if (mddev->serial_info_pool) { + struct md_rdev *temp; + int num = 0; /* used to track if other rdevs need the pool */ + + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) + num++; + } + + if (rdev) + rdev_uninit_serial(rdev); + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } + if (!is_suspend) + mddev_resume(mddev); + } +} + +static struct ctl_table_header *raid_table_header; + +static struct ctl_table raid_table[] = { + { + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = S_IRUGO|S_IWUSR, + .proc_handler = proc_dointvec, + }, + { + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = S_IRUGO|S_IWUSR, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table raid_dir_table[] = { + { + .procname = "raid", + .maxlen = 0, + .mode = S_IRUGO|S_IXUGO, + .child = raid_table, + }, + { } +}; + +static struct ctl_table raid_root_table[] = { + { + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = raid_dir_table, + }, + { } +}; + +static int start_readonly; + +/* + * The original mechanism for creating an md device is to create + * a device node in /dev and to open it. This causes races with device-close. + * The preferred method is to write to the "new_array" module parameter. + * This can avoid races. + * Setting create_on_open to false disables the original mechanism + * so all the races disappear. + */ +static bool create_on_open = true; + +/* + * We have a system wide 'event count' that is incremented + * on any 'interesting' event, and readers of /proc/mdstat + * can use 'poll' or 'select' to find out when the event + * count increases. + * + * Events are: + * start array, stop array, error, add device, remove device, + * start build, activate spare + */ +static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); +static atomic_t md_event_count; +void md_new_event(void) +{ + atomic_inc(&md_event_count); + wake_up(&md_event_waiters); +} +EXPORT_SYMBOL_GPL(md_new_event); + +/* + * Enables to iterate over all existing md arrays + * all_mddevs_lock protects this list. + */ +static LIST_HEAD(all_mddevs); +static DEFINE_SPINLOCK(all_mddevs_lock); + +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (is_md_suspended(mddev)) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + if (is_suspended(mddev, bio)) { + DEFINE_WAIT(__wait); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!is_suspended(mddev, bio)) + break; + schedule(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + if (!percpu_ref_tryget_live(&mddev->active_io)) + goto check_suspended; + + if (!mddev->pers->make_request(mddev, bio)) { + percpu_ref_put(&mddev->active_io); + goto check_suspended; + } + + percpu_ref_put(&mddev->active_io); +} +EXPORT_SYMBOL(md_handle_request); + +static void md_submit_bio(struct bio *bio) +{ + const int rw = bio_data_dir(bio); + struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; + + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return; + } + + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + bio_io_error(bio); + return; + } + + bio = bio_split_to_limits(bio); + if (!bio) + return; + + if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { + if (bio_sectors(bio) != 0) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + + /* bio could be mergeable after passing to underlayer */ + bio->bi_opf &= ~REQ_NOMERGE; + + md_handle_request(mddev, bio); +} + +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once mddev_detach() is called and completes, the module will be + * completely unused. + */ +void mddev_suspend(struct mddev *mddev) +{ + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); + if (mddev->suspended++) + return; + wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + percpu_ref_kill(&mddev->active_io); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); + mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); + + del_timer_sync(&mddev->safemode_timer); + /* restrict memory reclaim I/O during raid array is suspend */ + mddev->noio_flag = memalloc_noio_save(); +} +EXPORT_SYMBOL_GPL(mddev_suspend); + +void mddev_resume(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + if (--mddev->suspended) + return; + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + + percpu_ref_resurrect(&mddev->active_io); + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ +} +EXPORT_SYMBOL_GPL(mddev_resume); + +/* + * Generic flush handling for md + */ + +static void md_end_flush(struct bio *bio) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + bio_put(bio); + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); + + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); + } +} + +static void md_submit_flush_data(struct work_struct *ws); + +static void submit_flushes(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct md_rdev *rdev; + + mddev->start_flush = ktime_get_boottime(); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + struct bio *bi; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &mddev->bio_set); + bi->bi_end_io = md_end_flush; + bi->bi_private = rdev; + atomic_inc(&mddev->flush_pending); + submit_bio(bi); + rcu_read_lock(); + } + rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} + +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + spin_lock_irq(&mddev->lock); + mddev->prev_flush_start = mddev->start_flush; + mddev->flush_bio = NULL; + spin_unlock_irq(&mddev->lock); + wake_up(&mddev->sb_wait); + + if (bio->bi_iter.bi_size == 0) { + /* an empty barrier - all done */ + bio_endio(bio); + } else { + bio->bi_opf &= ~REQ_PREFLUSH; + md_handle_request(mddev, bio); + } +} + +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) +{ + ktime_t req_start = ktime_get_boottime(); + spin_lock_irq(&mddev->lock); + /* flush requests wait until ongoing flush completes, + * hence coalescing all the pending requests. + */ + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio || + ktime_before(req_start, mddev->prev_flush_start), + mddev->lock); + /* new request after previous flush is completed */ + if (ktime_after(req_start, mddev->prev_flush_start)) { + WARN_ON(mddev->flush_bio); + /* + * Grab a reference to make sure mddev_suspend() will wait for + * this flush to be done. + * + * md_flush_reqeust() is called under md_handle_request() and + * 'active_io' is already grabbed, hence percpu_ref_is_zero() + * won't pass, percpu_ref_tryget_live() can't be used because + * percpu_ref_kill() can be called by mddev_suspend() + * concurrently. + */ + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); + percpu_ref_get(&mddev->active_io); + mddev->flush_bio = bio; + bio = NULL; + } + spin_unlock_irq(&mddev->lock); + + if (!bio) { + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); + } else { + /* flush was performed for some other bio while we waited. */ + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio); + else { + bio->bi_opf &= ~REQ_PREFLUSH; + return false; + } + } + return true; +} +EXPORT_SYMBOL(md_flush_request); + +static inline struct mddev *mddev_get(struct mddev *mddev) +{ + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_delayed_delete(struct work_struct *ws); + +void mddev_put(struct mddev *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks) && + mddev->ctime == 0 && !mddev->hold_active) { + /* Array is not configured at all, and not held active, + * so destroy it */ + set_bit(MD_DELETED, &mddev->flags); + + /* + * Call queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will succeed in waiting + * for the work to be done. + */ + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + queue_work(md_misc_wq, &mddev->del_work); + } + spin_unlock(&all_mddevs_lock); +} + +static void md_safemode_timeout(struct timer_list *t); + +void mddev_init(struct mddev *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + spin_lock_init(&mddev->lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; +} +EXPORT_SYMBOL_GPL(mddev_init); + +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; + + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; + + return NULL; +} + +/* find an unused unit number */ +static dev_t mddev_alloc_unit(void) +{ + static int next_minor = 512; + int start = next_minor; + bool is_free = 0; + dev_t dev = 0; + + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) + return 0; /* Oh dear, all in use. */ + is_free = !mddev_find_locked(dev); + } + + return dev; +} + +static struct mddev *mddev_alloc(dev_t unit) +{ + struct mddev *new; + int error; + + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + mddev_init(new); + + spin_lock(&all_mddevs_lock); + if (unit) { + error = -EEXIST; + if (mddev_find_locked(unit)) + goto out_free_new; + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + new->hold_active = UNTIL_IOCTL; + } else { + error = -ENODEV; + new->unit = mddev_alloc_unit(); + if (!new->unit) + goto out_free_new; + new->md_minor = MINOR(new->unit); + new->hold_active = UNTIL_STOP; + } + + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; +out_free_new: + spin_unlock(&all_mddevs_lock); + kfree(new); + return ERR_PTR(error); +} + +static void mddev_free(struct mddev *mddev) +{ + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); + + kfree(mddev); +} + +static const struct attribute_group md_redundancy_group; + +void mddev_unlock(struct mddev *mddev) +{ + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. + */ + const struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mddev->sysfs_active = 1; + mutex_unlock(&mddev->reconfig_mutex); + + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + mddev->sysfs_action = NULL; + mddev->sysfs_completed = NULL; + mddev->sysfs_degraded = NULL; + } + } + mddev->sysfs_active = 0; + } else + mutex_unlock(&mddev->reconfig_mutex); + + /* As we've dropped the mutex we need a spinlock to + * make sure the thread doesn't disappear + */ + spin_lock(&pers_lock); + md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); + spin_unlock(&pers_lock); +} +EXPORT_SYMBOL_GPL(mddev_unlock); + +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} +EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + +struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} +EXPORT_SYMBOL_GPL(md_find_rdev_rcu); + +static struct md_personality *find_pers(int level, char *clevel) +{ + struct md_personality *pers; + list_for_each_entry(pers, &pers_list, list) { + if (level != LEVEL_NONE && pers->level == level) + return pers; + if (strcmp(pers->name, clevel)==0) + return pers; + } + return NULL; +} + +/* return the offset of the super block in 512byte sectors */ +static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) +{ + return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); +} + +static int alloc_disk_sb(struct md_rdev *rdev) +{ + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) + return -ENOMEM; + return 0; +} + +void md_rdev_clear(struct md_rdev *rdev) +{ + if (rdev->sb_page) { + put_page(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_start = 0; + rdev->sectors = 0; + } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; + } + badblocks_exit(&rdev->badblocks); +} +EXPORT_SYMBOL_GPL(md_rdev_clear); + +static void super_written(struct bio *bio) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + if (bio->bi_status) { + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags) + && (bio->bi_opf & MD_FAILFAST)) { + set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); + set_bit(LastDev, &rdev->flags); + } + } else + clear_bit(LastDev, &rdev->flags); + + bio_put(bio); + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->pending_writes)) + wake_up(&mddev->sb_wait); +} + +void md_super_write(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page) +{ + /* write first size bytes of page to sector of rdev + * Increment mddev->pending_writes before returning + * and decrement it on completion, waking up sb_wait + * if zero is reached. + * If an error occurred, call md_error + */ + struct bio *bio; + + if (!page) + return; + + if (test_bit(Faulty, &rdev->flags)) + return; + + bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); + + atomic_inc(&rdev->nr_pending); + + bio->bi_iter.bi_sector = sector; + bio_add_page(bio, page, size, 0); + bio->bi_private = rdev; + bio->bi_end_io = super_written; + + if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && + test_bit(FailFast, &rdev->flags) && + !test_bit(LastDev, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + + atomic_inc(&mddev->pending_writes); + submit_bio(bio); +} + +int md_super_wait(struct mddev *mddev) +{ + /* wait for all superblock writes that were scheduled to complete */ + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) + return -EAGAIN; + return 0; +} + +int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, blk_opf_t opf, bool metadata_op) +{ + struct bio bio; + struct bio_vec bvec; + + if (metadata_op && rdev->meta_bdev) + bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); + else + bio_init(&bio, rdev->bdev, &bvec, 1, opf); + + if (metadata_op) + bio.bi_iter.bi_sector = sector + rdev->sb_start; + else if (rdev->mddev->reshape_position != MaxSector && + (rdev->mddev->reshape_backwards == + (sector >= rdev->mddev->reshape_position))) + bio.bi_iter.bi_sector = sector + rdev->new_data_offset; + else + bio.bi_iter.bi_sector = sector + rdev->data_offset; + bio_add_page(&bio, page, size, 0); + + submit_bio_wait(&bio); + + return !bio.bi_status; +} +EXPORT_SYMBOL_GPL(sync_page_io); + +static int read_disk_sb(struct md_rdev *rdev, int size) +{ + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + pr_err("md: disabled device %pg, could not read superblock.\n", + rdev->bdev); + return -EINVAL; +} + +static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + return sb1->set_uuid0 == sb2->set_uuid0 && + sb1->set_uuid1 == sb2->set_uuid1 && + sb1->set_uuid2 == sb2->set_uuid2 && + sb1->set_uuid3 == sb2->set_uuid3; +} + +static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); +abort: + kfree(tmp1); + kfree(tmp2); + return ret; +} + +static u32 md_csum_fold(u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + return (csum & 0xffff) + (csum >> 16); +} + +static unsigned int calc_sb_csum(mdp_super_t *sb) +{ + u64 newcsum = 0; + u32 *sb32 = (u32*)sb; + int i; + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + + for (i = 0; i < MD_SB_BYTES/4 ; i++) + newcsum += sb32[i]; + csum = (newcsum & 0xffffffff) + (newcsum>>32); + +#ifdef CONFIG_ALPHA + /* This used to use csum_partial, which was wrong for several + * reasons including that different results are returned on + * different architectures. It isn't critical that we get exactly + * the same return value as before (we always csum_fold before + * testing, and that removes any differences). However as we + * know that csum_partial always returned a 16bit value on + * alphas, do a fold to maximise conformity to previous behaviour. + */ + sb->sb_csum = md_csum_fold(disk_csum); +#else + sb->sb_csum = disk_csum; +#endif + return csum; +} + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(struct mddev *mddev, struct md_rdev *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(struct mddev *mddev, struct md_rdev *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. + * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, + int minor_version); + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); + unsigned long long (*rdev_size_change)(struct md_rdev *rdev, + sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); +}; + +/* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(struct mddev *mddev) +{ + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) + return 0; + pr_warn("%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* + * load_super for 0.90.0 + */ +static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) +{ + mdp_super_t *sb; + int ret; + bool spare_disk = true; + + /* + * Calculate the position of the superblock (512byte sectors), + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. + */ + rdev->sb_start = calc_dev_sboffset(rdev); + + ret = read_disk_sb(rdev, MD_SB_BYTES); + if (ret) + return ret; + + ret = -EINVAL; + + sb = page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + pr_warn("md: invalid raid superblock magic on %pg\n", + rdev->bdev); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version < 90 || + sb->minor_version > 91) { + pr_warn("Bad version number %d.%d on %pg\n", + sb->major_version, sb->minor_version, rdev->bdev); + goto abort; + } + + if (sb->raid_disks <= 0) + goto abort; + + if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { + pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + rdev->new_data_offset = 0; + rdev->sb_size = MD_SB_BYTES; + rdev->badblocks.shift = -1; + + if (sb->level == LEVEL_MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & + ((1<sb_page); + if (!md_uuid_equal(refsb, sb)) { + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, refdev->bdev); + goto abort; + } + if (!md_sb_equal(refsb, sb)) { + pr_warn("md: %pg has same UUID but different superblock to %pg\n", + rdev->bdev, refdev->bdev); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + + if (!spare_disk && ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * (not needed for Linear and RAID0 as metadata doesn't + * record this size) + */ + if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; + + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) + /* "this cannot possibly happen" ... */ + ret = -EINVAL; + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = page_address(rdev->sb_page); + __u64 ev1 = md_event(sb); + + rdev->raid_disk = -1; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->external = 0; + mddev->chunk_sectors = sb->chunk_size >> 9; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->clevel[0] = 0; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->dev_sectors = ((sector_t)sb->size) * 2; + mddev->events = ev1; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* bitmap can use 60 K after the 4K superblocks */ + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->reshape_backwards = 0; + + if (mddev->minor_version >= 91) { + mddev->reshape_position = sb->reshape_position; + mddev->delta_disks = sb->delta_disks; + mddev->new_level = sb->new_level; + mddev->new_layout = sb->new_layout; + mddev->new_chunk_sectors = sb->new_chunk >> 9; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + } + if (mddev->level == 0) + mddev->layout = -1; + + if (sb->state & (1<recovery_cp = MaxSector; + else { + if (sb->events_hi == sb->cp_events_hi && + sb->events_lo == sb->cp_events_lo) { + mddev->recovery_cp = sb->recovery_cp; + } else + mddev->recovery_cp = 0; + } + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); + memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + + if (sb->state & (1<bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; + } + + } else if (mddev->pers == NULL) { + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ + ++ev1; + if (sb->disks[rdev->desc_nr].state & ( + (1<events) + return -EINVAL; + } else if (mddev->bitmap) { + /* if adding to array with a bitmap, then we can accept an + * older device ... but not too old. + */ + if (ev1 < mddev->bitmap->events_cleared) + return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } + + if (mddev->level != LEVEL_MULTIPATH) { + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<flags); + else if (desc->state & (1<raid_disk < mddev->raid_disks */) { + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & (1<minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } + } + if (desc->state & (1<flags); + if (desc->state & (1<flags); + } else /* MULTIPATH are always insync */ + set_bit(In_sync, &rdev->flags); + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + mdp_super_t *sb; + struct md_rdev *rdev2; + int next_spare = mddev->raid_disks; + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); + * 3/ any empty disks < next_spare become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. + */ + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + rdev->sb_size = MD_SB_BYTES; + + sb = page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); + sb->level = mddev->level; + sb->size = mddev->dev_sectors / 2; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->md_minor; + sb->not_persistent = 0; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->reshape_position == MaxSector) + sb->minor_version = 90; + else { + sb->minor_version = 91; + sb->reshape_position = mddev->reshape_position; + sb->new_level = mddev->new_level; + sb->delta_disks = mddev->delta_disks; + sb->new_layout = mddev->new_layout; + sb->new_chunk = mddev->new_chunk_sectors << 9; + } + mddev->minor_version = sb->minor_version; + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_sectors << 9; + + if (mddev->bitmap && mddev->bitmap_info.file == NULL) + sb->state |= (1<disks[0].state = (1<flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. + */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) + desc_nr = rdev2->raid_disk; + else + desc_nr = next_spare++; + rdev2->desc_nr = desc_nr; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (is_active) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (test_bit(Faulty, &rdev2->flags)) + d->state = (1<state = (1<flags)) + d->state |= (1<state = 0; + spare++; + working++; + } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<flags)) + d->state |= (1<raid_disks ; i++) { + mdp_disk_t *d = &sb->disks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<state |= (1<nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +/* + * rdev_size_change for 0.90.0 + */ +static unsigned long long +super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) +{ + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) + return 0; /* component must fit device */ + if (rdev->mddev->bitmap_info.offset) + return 0; /* can't move bitmap */ + rdev->sb_start = calc_dev_sboffset(rdev); + if (!num_sectors || num_sectors > rdev->sb_start) + num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; + do { + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + } while (md_super_wait(rdev->mddev) < 0); + return num_sectors; +} + +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} + +/* + * version 1 superblock + */ + +static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) +{ + __le32 disk_csum; + u32 csum; + unsigned long long newcsum; + int size = 256 + le32_to_cpu(sb->max_dev)*2; + __le32 *isuper = (__le32*)sb; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (; size >= 4; size -= 4) + newcsum += le32_to_cpu(*isuper++); + + if (size == 2) + newcsum += le16_to_cpu(*(__le16*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return cpu_to_le32(csum); +} + +static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) +{ + struct mdp_superblock_1 *sb; + int ret; + sector_t sb_start; + sector_t sectors; + int bmask; + bool spare_disk = true; + + /* + * Calculate the position of the superblock in 512byte sectors. + * It is always aligned to a 4K boundary and + * depeding on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(minor_version) { + case 0: + sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; + sb_start &= ~(sector_t)(4*2-1); + break; + case 1: + sb_start = 0; + break; + case 2: + sb_start = 8; + break; + default: + return -EINVAL; + } + rdev->sb_start = sb_start; + + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); + if (ret) return ret; + + sb = page_address(rdev->sb_page); + + if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || + sb->major_version != cpu_to_le32(1) || + le32_to_cpu(sb->max_dev) > (4096-256)/2 || + le64_to_cpu(sb->super_offset) != rdev->sb_start || + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) + return -EINVAL; + + if (calc_sb_1_csum(sb) != sb->sb_csum) { + pr_warn("md: invalid superblock checksum on %pg\n", + rdev->bdev); + return -EINVAL; + } + if (le64_to_cpu(sb->data_size) < 10) { + pr_warn("md: data_size too small on %pg\n", + rdev->bdev); + return -EINVAL; + } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; + + rdev->preferred_minor = 0xffff; + rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); + atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); + + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + + if (minor_version + && rdev->data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; + + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) + rdev->desc_nr = -1; + else + rdev->desc_nr = le32_to_cpu(sb->dev_number); + + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. + */ + s32 offset; + sector_t bb_sector; + __le64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, REQ_OP_READ, true)) + return -EIO; + bbp = (__le64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (badblocks_set(&rdev->badblocks, sector, count, 1)) + return -EINVAL; + } + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; + + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { + rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); + rdev->ppl.size = le16_to_cpu(sb->ppl.size); + rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; + } + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; + + if (!refdev) { + if (!spare_disk) + ret = 1; + else + ret = 0; + } else { + __u64 ev1, ev2; + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); + + if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || + sb->level != refsb->level || + sb->layout != refsb->layout || + sb->chunksize != refsb->chunksize) { + pr_warn("md: %pg has strangely different superblock to %pg\n", + rdev->bdev, + refdev->bdev); + return -EINVAL; + } + ev1 = le64_to_cpu(sb->events); + ev2 = le64_to_cpu(refsb->events); + + if (!spare_disk && ev1 > ev2) + ret = 1; + else + ret = 0; + } + if (minor_version) + sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; + else + sectors = rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) + return -EINVAL; + rdev->sectors = le64_to_cpu(sb->data_size); + return ret; +} + +static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + __u64 ev1 = le64_to_cpu(sb->events); + + rdev->raid_disk = -1; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + + if (mddev->raid_disks == 0) { + mddev->major_version = 1; + mddev->patch_version = 0; + mddev->external = 0; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); + mddev->level = le32_to_cpu(sb->level); + mddev->clevel[0] = 0; + mddev->layout = le32_to_cpu(sb->layout); + mddev->raid_disks = le32_to_cpu(sb->raid_disks); + mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->events = ev1; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* Default location for bitmap is 1K after superblock + * using 3K - total of 4K + */ + mddev->bitmap_info.default_offset = 1024 >> 9; + mddev->bitmap_info.default_space = (4096-1024) >> 9; + mddev->reshape_backwards = 0; + + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + memcpy(mddev->uuid, sb->set_uuid, 16); + + mddev->max_disks = (4096-256)/2; + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && + mddev->bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); + /* Metadata doesn't record how much space is available. + * For 1.0, we assume we can use up to the superblock + * if before, else to 4K beyond superblock. + * For others, assume no change is possible. + */ + if (mddev->minor_version > 0) + mddev->bitmap_info.space = 0; + else if (mddev->bitmap_info.offset > 0) + mddev->bitmap_info.space = + 8 - mddev->bitmap_info.offset; + else + mddev->bitmap_info.space = + -mddev->bitmap_info.offset; + } + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + if (mddev->delta_disks < 0 || + (mddev->delta_disks == 0 && + (le32_to_cpu(sb->feature_map) + & MD_FEATURE_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + } + + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); + + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) + return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; + set_bit(MD_HAS_PPL, &mddev->flags); + } + } else if (mddev->pers == NULL) { + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ + ++ev1; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) + if (ev1 < mddev->events) + return -EINVAL; + } else if (mddev->bitmap) { + /* If adding to array with a bitmap, then we can accept an + * older device, but not too old. + */ + if (ev1 < mddev->bitmap->events_cleared) + return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } + if (mddev->level != LEVEL_MULTIPATH) { + int role; + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = MD_DISK_ROLE_SPARE; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + switch(role) { + case MD_DISK_ROLE_SPARE: /* spare */ + break; + case MD_DISK_ROLE_FAULTY: /* faulty */ + set_bit(Faulty, &rdev->flags); + break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + pr_warn("md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + rdev->raid_disk = 0; + break; + default: + rdev->saved_raid_disk = role; + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) { + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. + */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) + set_bit(In_sync, &rdev->flags); + } + rdev->raid_disk = role; + break; + } + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); + if (sb->devflags & FailFast1) + set_bit(FailFast, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); + } else /* MULTIPATH are always insync */ + set_bit(In_sync, &rdev->flags); + + return 0; +} + +static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb; + struct md_rdev *rdev2; + int max_dev, i; + /* make rdev->sb match mddev and rdev data. */ + + sb = page_address(rdev->sb_page); + + sb->feature_map = 0; + sb->pad0 = 0; + sb->recovery_offset = cpu_to_le64(0); + memset(sb->pad3, 0, sizeof(sb->pad3)); + + sb->utime = cpu_to_le64((__u64)mddev->utime); + sb->events = cpu_to_le64(mddev->events); + if (mddev->in_sync) + sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); + else + sb->resync_offset = cpu_to_le64(0); + + sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); + + sb->raid_disks = cpu_to_le32(mddev->raid_disks); + sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + if (test_bit(FailFast, &rdev->flags)) + sb->devflags |= FailFast1; + else + sb->devflags &= ~FailFast1; + + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); + + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); + } + + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); + } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); + + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + if (mddev->delta_disks == 0 && + mddev->reshape_backwards) + sb->feature_map + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } + } + + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct badblocks *bb = &rdev->badblocks; + __le64 *bbp = (__le64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = p[i]; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + bbp[i] = cpu_to_le64(store_bb); + } + bb->changed = 0; + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + } + } + + max_dev = 0; + rdev_for_each(rdev2, mddev) + if (rdev2->desc_nr+1 > max_dev) + max_dev = rdev2->desc_nr+1; + + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; + sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } else + max_dev = le32_to_cpu(sb->max_dev); + + for (i=0; idev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + + if (test_bit(MD_HAS_PPL, &mddev->flags)) { + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); + sb->ppl.size = cpu_to_le16(rdev->ppl.size); + } + + rdev_for_each(rdev2, mddev) { + i = rdev2->desc_nr; + if (test_bit(Faulty, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + else if (test_bit(In_sync, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); + else if (rdev2->raid_disk >= 0) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); + } + + sb->sb_csum = calc_sb_1_csum(sb); +} + +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + +static unsigned long long +super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) +{ + struct mdp_superblock_1 *sb; + sector_t max_sectors; + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) + return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ + if (rdev->sb_start < rdev->data_offset) { + /* minor versions 1 and 2; superblock before data */ + max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + } else if (rdev->mddev->bitmap_info.offset) { + /* minor version 0 with bitmap we can't move */ + return 0; + } else { + /* minor version 0; superblock after data */ + sector_t sb_start, bm_space; + sector_t dev_size = bdev_nr_sectors(rdev->bdev); + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; + sb_start &= ~(sector_t)(4*2 - 1); + + bm_space = super_1_choose_bm_space(dev_size); + + /* Space that can be used to store date needs to decrease + * superblock bitmap space and bad block space(4K) + */ + max_sectors = sb_start - bm_space - 4*2; + + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + rdev->sb_start = sb_start; + } + sb = page_address(rdev->sb_page); + sb->data_size = cpu_to_le64(num_sectors); + sb->super_offset = cpu_to_le64(rdev->sb_start); + sb->sb_csum = calc_sb_1_csum(sb); + do { + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + } while (md_super_wait(rdev->mddev) < 0); + return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; +} + +static struct super_type super_types[] = { + [0] = { + .name = "0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, + }, + [1] = { + .name = "md-1", + .owner = THIS_MODULE, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, + }, +}; + +static void sync_super(struct mddev *mddev, struct md_rdev *rdev) +{ + if (mddev->sync_super) { + mddev->sync_super(mddev, rdev); + return; + } + + BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); + + super_types[mddev->major_version].sync_super(mddev, rdev); +} + +static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) +{ + struct md_rdev *rdev, *rdev2; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; + if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { + rcu_read_unlock(); + return 1; + } + } + } + rcu_read_unlock(); + return 0; +} + +static LIST_HEAD(pending_raid_disks); + +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(struct mddev *mddev) +{ + struct md_rdev *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register, or already is */ + rdev_for_each(rdev, mddev) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)); + + pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || + (mddev->level != 1 && mddev->level != 10 && + bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + /* + * No need to handle the failure of bioset_integrity_create, + * because the function is called by md_run() -> pers->run(), + * md_run calls bioset_exit -> bioset_integrity_free in case + * of failure case. + */ + pr_err("md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return 0; + + bi_mddev = blk_get_integrity(mddev->gendisk); + + if (!bi_mddev) /* nothing to do */ + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + return -ENXIO; + } + + return 0; +} +EXPORT_SYMBOL(md_integrity_add_rdev); + +static bool rdev_read_only(struct md_rdev *rdev) +{ + return bdev_read_only(rdev->bdev) || + (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); +} + +static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) +{ + char b[BDEVNAME_SIZE]; + int err; + + /* prevent duplicates */ + if (find_rdev(mddev, rdev->bdev->bd_dev)) + return -EEXIST; + + if (rdev_read_only(rdev) && mddev->pers) + return -EROFS; + + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { + if (mddev->pers) { + /* Cannot change size, so fail + * If mddev->level <= 0, then we don't care + * about aligning sizes (e.g. linear) + */ + if (mddev->level > 0) + return -ENOSPC; + } else + mddev->dev_sectors = rdev->sectors; + } + + /* Verify rdev->desc_nr is unique. + * If it is -1, assign a free number, else + * check number is not in use + */ + rcu_read_lock(); + if (rdev->desc_nr < 0) { + int choice = 0; + if (mddev->pers) + choice = mddev->raid_disks; + while (md_find_rdev_nr_rcu(mddev, choice)) + choice++; + rdev->desc_nr = choice; + } else { + if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + pr_warn("md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); + return -EBUSY; + } + snprintf(b, sizeof(b), "%pg", rdev->bdev); + strreplace(b, '/', '!'); + + rdev->mddev = mddev; + pr_debug("md: bind<%s>\n", b); + + if (mddev->raid_disks) + mddev_create_serial_pool(mddev, rdev, false); + + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) + goto fail; + + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); + + list_add_rcu(&rdev->same_set, &mddev->disks); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); + + /* May as well allow recovery to be retried once */ + mddev->recovery_disabled++; + + return 0; + + fail: + pr_warn("md: failed to register dev-%s for %s\n", + b, mdname(mddev)); + return err; +} + +static void rdev_delayed_delete(struct work_struct *ws) +{ + struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); + kobject_del(&rdev->kobj); + kobject_put(&rdev->kobj); +} + +static void unbind_rdev_from_array(struct md_rdev *rdev) +{ + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); + list_del_rcu(&rdev->same_set); + pr_debug("md: unbind<%pg>\n", rdev->bdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); + rdev->mddev = NULL; + sysfs_remove_link(&rdev->kobj, "block"); + sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); + rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; + rdev->badblocks.count = 0; + /* We need to delay this, otherwise we can deadlock when + * writing to 'remove' to "dev/state". We also need + * to delay it due to rcu usage. + */ + synchronize_rcu(); + INIT_WORK(&rdev->del_work, rdev_delayed_delete); + kobject_get(&rdev->kobj); + queue_work(md_rdev_misc_wq, &rdev->del_work); +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by bd_claiming the device. + */ +static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) +{ + int err = 0; + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (struct md_rdev *)lock_rdev : rdev); + if (IS_ERR(bdev)) { + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); + return PTR_ERR(bdev); + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(struct md_rdev *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(struct md_rdev *rdev) +{ + pr_debug("md: export_rdev(%pg)\n", rdev->bdev); + md_rdev_clear(rdev); +#ifndef MODULE + if (test_bit(AutoDetected, &rdev->flags)) + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kobject_put(&rdev->kobj); +} + +void md_kick_rdev_from_array(struct md_rdev *rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} +EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); + +static void export_array(struct mddev *mddev) +{ + struct md_rdev *rdev; + + while (!list_empty(&mddev->disks)) { + rdev = list_first_entry(&mddev->disks, struct md_rdev, + same_set); + md_kick_rdev_from_array(rdev); + } + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static bool set_in_sync(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->lock); + if (!mddev->in_sync) { + mddev->sync_checkers++; + spin_unlock(&mddev->lock); + percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); + spin_lock(&mddev->lock); + if (!mddev->in_sync && + percpu_ref_is_zero(&mddev->writes_pending)) { + mddev->in_sync = 1; + /* + * Ensure ->in_sync is visible before we clear + * ->sync_checkers. + */ + smp_mb(); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + if (--mddev->sync_checkers == 0) + percpu_ref_switch_to_percpu(&mddev->writes_pending); + } + if (mddev->safemode == 1) + mddev->safemode = 0; + return mddev->in_sync; +} + +static void sync_sbs(struct mddev *mddev, int nospares) +{ + /* Update each superblock (in-memory image), but + * if we are allowed to, skip spares which already + * have the right event counter, or have one earlier + * (which would mean they aren't being marked as dirty + * with the rest of the array) + */ + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) { + if (rdev->sb_events == mddev->events || + (nospares && + rdev->raid_disk < 0 && + rdev->sb_events+1 == mddev->events)) { + /* Don't update this superblock */ + rdev->sb_loaded = 2; + } else { + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } + } +} + +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev = NULL, *iter; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(iter, mddev) + if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { + rdev = iter; + break; + } + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le32_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + +void md_update_sb(struct mddev *mddev, int force_change) +{ + struct md_rdev *rdev; + int sync_req; + int nospares = 0; + int any_badblocks_changed = 0; + int ret = -1; + + if (!md_is_rdwr(mddev)) { + if (force_change) + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return; + } + +repeat: + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) + force_change = 1; + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) + nospares = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | + BIT(MD_SB_CHANGE_CLEAN)); + return; + } + } + + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->external) { + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; + ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } + wake_up(&mddev->sb_wait); + return; + } + + spin_lock(&mddev->lock); + + mddev->utime = ktime_get_real_seconds(); + + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) + force_change = 1; + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) + /* If the array is degraded, then skipping spares is both + * dangerous and fairly pointless. + * Dangerous because a device that was removed from the array + * might have a event_count that still looks up-to-date, + * so it can be re-added without a resync. + * Pointless because if there are any spares to skip, + * then a recovery will happen and soon that array won't + * be degraded any more and the spare can go back to sleep then. + */ + nospares = 0; + + sync_req = mddev->in_sync; + + /* If this is just a dirty<->clean transition, and the array is clean + * and 'events' is odd, we can roll back to the previous clean state */ + if (nospares + && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && mddev->can_decrease_events + && mddev->events != 1) { + mddev->events--; + mddev->can_decrease_events = 0; + } else { + /* otherwise we have to go forward and ... */ + mddev->events ++; + mddev->can_decrease_events = nospares; + } + + /* + * This 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug... + */ + WARN_ON(mddev->events == 0); + + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); + } + + sync_sbs(mddev, nospares); + spin_unlock(&mddev->lock); + + pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", + mdname(mddev), mddev->in_sync); + + if (mddev->queue) + blk_add_trace_msg(mddev->queue, "md md_update_sb"); +rewrite: + md_bitmap_update_sb(mddev->bitmap); + rdev_for_each(rdev, mddev) { + if (rdev->sb_loaded != 1) + continue; /* no noise on spare devices */ + + if (!test_bit(Faulty, &rdev->flags)) { + md_super_write(mddev,rdev, + rdev->sb_start, rdev->sb_size, + rdev->sb_page); + pr_debug("md: (write) %pg's sb offset: %llu\n", + rdev->bdev, + (unsigned long long)rdev->sb_start); + rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } + + } else + pr_debug("md: %pg (skipping faulty)\n", + rdev->bdev); + + if (mddev->level == LEVEL_MULTIPATH) + /* only need to write one superblock... */ + break; + } + if (md_super_wait(mddev) < 0) + goto rewrite; + /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); + + if (mddev->in_sync != sync_req || + !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) + /* have to write it out again */ + goto repeat; + wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify_dirent_safe(mddev->sysfs_completed); + + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + + if (any_badblocks_changed) + ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } +} +EXPORT_SYMBOL(md_update_sb); + +static int add_bound_rdev(struct md_rdev *rdev) +{ + struct mddev *mddev = rdev->mddev; + int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); + + if (!mddev->pers->hot_remove_disk || add_journal) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. + validate_super(mddev, rdev); + if (add_journal) + mddev_suspend(mddev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (add_journal) + mddev_resume(mddev); + if (err) { + md_kick_rdev_from_array(rdev); + return err; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_new_event(); + md_wakeup_thread(mddev->thread); + return 0; +} + +/* words written to sysfs files may, or may not, be \n terminated. + * We want to accept with case. For this we use cmd_match. + */ +static int cmd_match(const char *cmd, const char *str) +{ + /* See if cmd, written into a sysfs file, matches + * str. They must either be the same, or cmd can + * have a trailing newline + */ + while (*cmd && *str && *cmd == *str) { + cmd++; + str++; + } + if (*cmd == '\n') + cmd++; + if (*str || *cmd) + return 0; + return 1; +} + +struct rdev_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct md_rdev *, char *); + ssize_t (*store)(struct md_rdev *, const char *, size_t); +}; + +static ssize_t +state_show(struct md_rdev *rdev, char *page) +{ + char *sep = ","; + size_t len = 0; + unsigned long flags = READ_ONCE(rdev->flags); + + if (test_bit(Faulty, &flags) || + (!test_bit(ExternalBbl, &flags) && + rdev->badblocks.unacked_exist)) + len += sprintf(page+len, "faulty%s", sep); + if (test_bit(In_sync, &flags)) + len += sprintf(page+len, "in_sync%s", sep); + if (test_bit(Journal, &flags)) + len += sprintf(page+len, "journal%s", sep); + if (test_bit(WriteMostly, &flags)) + len += sprintf(page+len, "write_mostly%s", sep); + if (test_bit(Blocked, &flags) || + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &flags))) + len += sprintf(page+len, "blocked%s", sep); + if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && + !test_bit(In_sync, &flags)) + len += sprintf(page+len, "spare%s", sep); + if (test_bit(WriteErrorSeen, &flags)) + len += sprintf(page+len, "write_error%s", sep); + if (test_bit(WantReplacement, &flags)) + len += sprintf(page+len, "want_replacement%s", sep); + if (test_bit(Replacement, &flags)) + len += sprintf(page+len, "replacement%s", sep); + if (test_bit(ExternalBbl, &flags)) + len += sprintf(page+len, "external_bbl%s", sep); + if (test_bit(FailFast, &flags)) + len += sprintf(page+len, "failfast%s", sep); + + if (len) + len -= strlen(sep); + + return len+sprintf(page+len, "\n"); +} + +static ssize_t +state_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + /* can write + * faulty - simulates an error + * remove - disconnects the device + * writemostly - sets write_mostly + * -writemostly - clears write_mostly + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error + * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen + * {,-}failfast - set/clear FailFast + */ + + struct mddev *mddev = rdev->mddev; + int err = -EINVAL; + bool need_update_sb = false; + + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { + md_error(rdev->mddev, rdev); + + if (test_bit(MD_BROKEN, &rdev->mddev->flags)) + err = -EBUSY; + else + err = 0; + } else if (cmd_match(buf, "remove")) { + if (rdev->mddev->pers) { + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + } + if (rdev->raid_disk >= 0) + err = -EBUSY; + else { + err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) { + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + } + md_new_event(); + } + } + } else if (cmd_match(buf, "writemostly")) { + set_bit(WriteMostly, &rdev->flags); + mddev_create_serial_pool(rdev->mddev, rdev, false); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_serial_pool(rdev->mddev, rdev, false); + clear_bit(WriteMostly, &rdev->flags); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "blocked")) { + set_bit(Blocked, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + !test_bit(ExternalBbl, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + + err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "failfast")) { + set_bit(FailFast, &rdev->flags); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "-failfast")) { + clear_bit(FailFast, &rdev->flags); + need_update_sb = true; + err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { + if (rdev->mddev->pers == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } + } else if (cmd_match(buf, "write_error")) { + set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "re-add")) { + if (!rdev->mddev->pers) + err = -EINVAL; + else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && + rdev->saved_raid_disk >= 0) { + /* clear_bit is performed _after_ all the devices + * have their local Faulty bit cleared. If any writes + * happen in the meantime in the local node, they + * will land in the local bitmap, which will be synced + * by this node eventually + */ + if (!mddev_is_clustered(rdev->mddev) || + (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + clear_bit(Faulty, &rdev->flags); + err = add_bound_rdev(rdev); + } + } else + err = -EBUSY; + } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { + set_bit(ExternalBbl, &rdev->flags); + rdev->badblocks.shift = 0; + err = 0; + } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { + clear_bit(ExternalBbl, &rdev->flags); + err = 0; + } + if (need_update_sb) + md_update_sb(mddev, 1); + if (!err) + sysfs_notify_dirent_safe(rdev->sysfs_state); + return err ? err : len; +} +static struct rdev_sysfs_entry rdev_state = +__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); + +static ssize_t +errors_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); +} + +static ssize_t +errors_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned int n; + int rv; + + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + atomic_set(&rdev->corrected_errors, n); + return len; +} +static struct rdev_sysfs_entry rdev_errors = +__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); + +static ssize_t +slot_show(struct md_rdev *rdev, char *page) +{ + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) + return sprintf(page, "none\n"); + else + return sprintf(page, "%d\n", rdev->raid_disk); +} + +static ssize_t +slot_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + int slot; + int err; + + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; + if (strncmp(buf, "none", 4)==0) + slot = -1; + else { + err = kstrtouint(buf, 10, (unsigned int *)&slot); + if (err < 0) + return err; + if (slot < 0) + /* overflow */ + return -ENOSPC; + } + if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. + * For now we only support removing + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev->mddev->pers->hot_remove_disk == NULL) + return -EINVAL; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + } else if (rdev->mddev->pers) { + /* Activating a spare .. or possibly reactivating + * if we ever get bitmaps working here. + */ + int err; + + if (rdev->raid_disk != -1) + return -EBUSY; + + if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) + return -EBUSY; + + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + + rdev->raid_disk = slot; + if (test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = slot; + else + rdev->saved_raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); + if (err) { + rdev->raid_disk = -1; + return err; + } else + sysfs_notify_dirent_safe(rdev->sysfs_state); + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); + /* don't wakeup anyone, leave that to userspace. */ + } else { + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; + /* assume it is working */ + clear_bit(Faulty, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + set_bit(In_sync, &rdev->flags); + sysfs_notify_dirent_safe(rdev->sysfs_state); + } + return len; +} + +static struct rdev_sysfs_entry rdev_slot = +__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); + +static ssize_t +offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); +} + +static ssize_t +offset_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long offset; + if (kstrtoull(buf, 10, &offset) < 0) + return -EINVAL; + if (rdev->mddev->pers && rdev->raid_disk >= 0) + return -EBUSY; + if (rdev->sectors && rdev->mddev->external) + /* Must set offset before size, so overlap checks + * can be sane */ + return -EBUSY; + rdev->data_offset = offset; + rdev->new_data_offset = offset; + return len; +} + +static struct rdev_sysfs_entry rdev_offset = +__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); + +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (kstrtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. + */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + +static ssize_t +rdev_size_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); +} + +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) +{ + /* check if two start/length pairs overlap */ + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; +} + +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) +{ + unsigned long long blocks; + sector_t new; + + if (kstrtoull(buf, 10, &blocks) < 0) + return -EINVAL; + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + +static ssize_t +rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + struct mddev *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + sector_t sectors; + + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; + if (strict_blocks_to_sectors(buf, §ors) < 0) + return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ + if (my_mddev->pers && rdev->raid_disk >= 0) { + if (my_mddev->persistent) { + sectors = super_types[my_mddev->major_version]. + rdev_size_change(rdev, sectors); + if (!sectors) + return -EBUSY; + } else if (!sectors) + sectors = bdev_nr_sectors(rdev->bdev) - + rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; + } + if (sectors < my_mddev->dev_sectors) + return -EINVAL; /* component must fit device */ + + rdev->sectors = sectors; + + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; + } + return len; +} + +static struct rdev_sysfs_entry rdev_size = +__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + +static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (kstrtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ +static ssize_t bb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + +static ssize_t ubb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + +static ssize_t +ppl_sector_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); +} + +static ssize_t +ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long sector; + + if (kstrtoull(buf, 10, §or) < 0) + return -EINVAL; + if (sector != (sector_t)sector) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if ((sector > rdev->sb_start && + sector - rdev->sb_start > S16_MAX) || + (sector < rdev->sb_start && + rdev->sb_start - sector > -S16_MIN)) + return -EINVAL; + rdev->ppl.offset = sector - rdev->sb_start; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.sector = sector; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_sector = +__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); + +static ssize_t +ppl_size_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%u\n", rdev->ppl.size); +} + +static ssize_t +ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned int size; + + if (kstrtouint(buf, 10, &size) < 0) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if (size > U16_MAX) + return -EINVAL; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.size = size; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_size = +__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); + +static struct attribute *rdev_default_attrs[] = { + &rdev_state.attr, + &rdev_errors.attr, + &rdev_slot.attr, + &rdev_offset.attr, + &rdev_new_offset.attr, + &rdev_size.attr, + &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, + &rdev_ppl_sector.attr, + &rdev_ppl_size.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rdev_default); +static ssize_t +rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + + if (!entry->show) + return -EIO; + if (!rdev->mddev) + return -ENODEV; + return entry->show(rdev, page); +} + +static ssize_t +rdev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + ssize_t rv; + struct mddev *mddev = rdev->mddev; + + if (!entry->store) + return -EIO; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + rv = mddev ? mddev_lock(mddev) : -ENODEV; + if (!rv) { + if (rdev->mddev == NULL) + rv = -ENODEV; + else + rv = entry->store(rdev, page, length); + mddev_unlock(mddev); + } + return rv; +} + +static void rdev_free(struct kobject *ko) +{ + struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); + kfree(rdev); +} +static const struct sysfs_ops rdev_sysfs_ops = { + .show = rdev_attr_show, + .store = rdev_attr_store, +}; +static struct kobj_type rdev_ktype = { + .release = rdev_free, + .sysfs_ops = &rdev_sysfs_ops, + .default_groups = rdev_default_groups, +}; + +int md_rdev_init(struct md_rdev *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->new_data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + return badblocks_init(&rdev->badblocks, 0); +} +EXPORT_SYMBOL_GPL(md_rdev_init); +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. + */ +static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + int err; + struct md_rdev *rdev; + sector_t size; + + rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) + return ERR_PTR(-ENOMEM); + + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) + goto abort_free; + + err = lock_rdev(rdev, newdev, super_format == -2); + if (err) + goto abort_free; + + kobject_init(&rdev->kobj, &rdev_ktype); + + size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; + if (!size) { + pr_warn("md: %pg has zero or unknown size, marking faulty!\n", + rdev->bdev); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", + rdev->bdev, + super_format, super_minor); + goto abort_free; + } + if (err < 0) { + pr_warn("md: could not read %pg's sb, not importing!\n", + rdev->bdev); + goto abort_free; + } + } + + return rdev; + +abort_free: + if (rdev->bdev) + unlock_rdev(rdev); + md_rdev_clear(rdev); + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + +static int analyze_sbs(struct mddev *mddev) +{ + int i; + struct md_rdev *rdev, *freshest, *tmp; + + freshest = NULL; + rdev_for_each_safe(rdev, tmp, mddev) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", + rdev->bdev); + md_kick_rdev_from_array(rdev); + } + + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + rdev_for_each_safe(rdev, tmp, mddev) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { + pr_warn("md: %s: %pg: only %d devices permitted\n", + mdname(mddev), rdev->bdev, + mddev->max_disks); + md_kick_rdev_from_array(rdev); + continue; + } + if (rdev != freshest) { + if (super_types[mddev->major_version]. + validate_super(mddev, rdev)) { + pr_warn("md: kicking non-fresh %pg from array!\n", + rdev->bdev); + md_kick_rdev_from_array(rdev); + continue; + } + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + set_bit(In_sync, &rdev->flags); + } else if (rdev->raid_disk >= + (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } + } + + return 0; +} + +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. + * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. + */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + *res = result * int_pow(10, scale - decimals); + return 0; +} + +static ssize_t +safe_delay_show(struct mddev *mddev, char *page) +{ + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); +} +static ssize_t +safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) +{ + unsigned long msec; + + if (mddev_is_clustered(mddev)) { + pr_warn("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) + return -EINVAL; + if (msec == 0) + mddev->safemode_delay = 0; + else { + unsigned long old_delay = mddev->safemode_delay; + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); + } + return len; +} +static struct md_sysfs_entry md_safe_delay = +__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); + +static ssize_t +level_show(struct mddev *mddev, char *page) +{ + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; + if (p) + ret = sprintf(page, "%s\n", p->name); + else if (mddev->clevel[0]) + ret = sprintf(page, "%s\n", mddev->clevel); + else if (mddev->level != LEVEL_NONE) + ret = sprintf(page, "%d\n", mddev->level); + else + ret = 0; + spin_unlock(&mddev->lock); + return ret; +} + +static ssize_t +level_store(struct mddev *mddev, const char *buf, size_t len) +{ + char clevel[16]; + ssize_t rv; + size_t slen = len; + struct md_personality *pers, *oldpers; + long level; + void *priv, *oldpriv; + struct md_rdev *rdev; + + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + + if (mddev->pers == NULL) { + strncpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; + mddev->level = LEVEL_NONE; + rv = len; + goto out_unlock; + } + rv = -EROFS; + if (!md_is_rdwr(mddev)) + goto out_unlock; + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + rv = -EBUSY; + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) + goto out_unlock; + + rv = -EINVAL; + if (!mddev->pers->quiesce) { + pr_warn("md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + goto out_unlock; + } + + /* Now find the new personality */ + strncpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; + if (kstrtol(clevel, 10, &level)) + level = LEVEL_NONE; + + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); + spin_lock(&pers_lock); + pers = find_pers(level, clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + pr_warn("md: personality %s not loaded\n", clevel); + rv = -EINVAL; + goto out_unlock; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + rv = len; + goto out_unlock; + } + if (!pers->takeover) { + module_put(pers->owner); + pr_warn("md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); + rv = -EINVAL; + goto out_unlock; + } + + rdev_for_each(rdev, mddev) + rdev->new_raid_disk = rdev->raid_disk; + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + module_put(pers->owner); + pr_warn("md: %s: %s would not accept array\n", + mdname(mddev), clevel); + rv = PTR_ERR(priv); + goto out_unlock; + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev_detach(mddev); + + spin_lock(&mddev->lock); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + spin_unlock(&mddev->lock); + + if (oldpers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + + oldpers->free(mddev, oldpriv); + + if (oldpers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + } + if (oldpers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + module_put(oldpers->owner); + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk >= mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sysfs_unlink_rdev(mddev, rdev); + } + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) + clear_bit(In_sync, &rdev->flags); + else { + if (sysfs_link_rdev(mddev, rdev)) + pr_warn("md: cannot register rd%d for %s after level change\n", + rdev->raid_disk, mdname(mddev)); + } + } + + if (pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } + blk_set_stacking_limits(&mddev->queue->limits); + pers->run(mddev); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); + sysfs_notify_dirent_safe(mddev->sysfs_level); + md_new_event(); + rv = len; +out_unlock: + mddev_unlock(mddev); + return rv; +} + +static struct md_sysfs_entry md_level = +__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); + +static ssize_t +layout_show(struct mddev *mddev, char *page) +{ + /* just a number, not meaningful for all levels */ + if (mddev->reshape_position != MaxSector && + mddev->layout != mddev->new_layout) + return sprintf(page, "%d (%d)\n", + mddev->new_layout, mddev->layout); + return sprintf(page, "%d\n", mddev->layout); +} + +static ssize_t +layout_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + if (mddev->pers) { + if (mddev->pers->check_reshape == NULL) + err = -EBUSY; + else if (!md_is_rdwr(mddev)) + err = -EROFS; + else { + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; + } + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_layout = +__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); + +static ssize_t +raid_disks_show(struct mddev *mddev, char *page) +{ + if (mddev->raid_disks == 0) + return 0; + if (mddev->reshape_position != MaxSector && + mddev->delta_disks != 0) + return sprintf(page, "%d (%d)\n", mddev->raid_disks, + mddev->raid_disks - mddev->delta_disks); + return sprintf(page, "%d\n", mddev->raid_disks); +} + +static int update_raid_disks(struct mddev *mddev, int raid_disks); + +static ssize_t +raid_disks_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers) + err = update_raid_disks(mddev, n); + else if (mddev->reshape_position != MaxSector) { + struct md_rdev *rdev; + int olddisks = mddev->raid_disks - mddev->delta_disks; + + err = -EINVAL; + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + goto out_unlock; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + goto out_unlock; + } + err = 0; + mddev->delta_disks = n - olddisks; + mddev->raid_disks = n; + mddev->reshape_backwards = (mddev->delta_disks < 0); + } else + mddev->raid_disks = n; +out_unlock: + mddev_unlock(mddev); + return err ? err : len; +} +static struct md_sysfs_entry md_raid_disks = +__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); + +static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t +chunk_size_show(struct mddev *mddev, char *page) +{ + if (mddev->reshape_position != MaxSector && + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); +} + +static ssize_t +chunk_size_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long n; + int err; + + err = kstrtoul(buf, 10, &n); + if (err < 0) + return err; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers) { + if (mddev->pers->check_reshape == NULL) + err = -EBUSY; + else if (!md_is_rdwr(mddev)) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; + } + } else { + mddev->new_chunk_sectors = n >> 9; + if (mddev->reshape_position == MaxSector) + mddev->chunk_sectors = n >> 9; + } + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_chunk_size = +__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); + +static ssize_t +resync_start_show(struct mddev *mddev, char *page) +{ + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); + return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); +} + +static ssize_t +resync_start_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long long n; + int err; + + if (cmd_match(buf, "none")) + n = MaxSector; + else { + err = kstrtoull(buf, 10, &n); + if (err < 0) + return err; + if (n != (sector_t)n) + return -EINVAL; + } + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + err = -EBUSY; + + if (!err) { + mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + } + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_resync_start = +__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, + resync_start_show, resync_start_store); + +/* + * The array state can be: + * + * clear + * No devices, no size, no level + * Equivalent to STOP_ARRAY ioctl + * inactive + * May have some settings, but array is not active + * all IO results in error + * When written, doesn't tear down array, but just stops it + * suspended (not supported yet) + * All IO requests will block. The array can be reconfigured. + * Writing this, if accepted, will block until array is quiescent + * readonly + * no resync can happen. no superblocks get written. + * write requests fail + * read-auto + * like readonly, but behaves like 'clean' on a write request. + * + * clean - no pending writes, but otherwise active. + * When written to inactive array, starts without resync + * If a write request arrives then + * if metadata is known, mark 'dirty' and switch to 'active'. + * if not known, block and switch to write-pending + * If written to an active array that has pending writes, then fails. + * active + * fully active: IO and resync can be happening. + * When written to inactive array, starts with resync + * + * write-pending + * clean, but writes are blocked waiting for 'active' to be written. + * + * active-idle + * like active, but no writes have been seen for a while (100msec). + * + * broken +* Array is failed. It's useful because mounted-arrays aren't stopped +* when array is failed, so this state will at least alert the user that +* something is wrong. + */ +enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, + write_pending, active_idle, broken, bad_word}; +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", + "write-pending", "active-idle", "broken", NULL }; + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (cmd_match(word, list[n])) + break; + return n; +} + +static ssize_t +array_state_show(struct mddev *mddev, char *page) +{ + enum array_state st = inactive; + + if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { + switch(mddev->ro) { + case MD_RDONLY: + st = readonly; + break; + case MD_AUTO_READ: + st = read_auto; + break; + case MD_RDWR: + spin_lock(&mddev->lock); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + st = write_pending; + else if (mddev->in_sync) + st = clean; + else if (mddev->safemode) + st = active_idle; + else + st = active; + spin_unlock(&mddev->lock); + } + + if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) + st = broken; + } else { + if (list_empty(&mddev->disks) && + mddev->raid_disks == 0 && + mddev->dev_sectors == 0) + st = clear; + else + st = inactive; + } + return sprintf(page, "%s\n", array_states[st]); +} + +static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); +static int restart_array(struct mddev *mddev); + +static ssize_t +array_state_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err = 0; + enum array_state st = match_word(buf, array_states); + + if (mddev->pers && (st == active || st == clean) && + mddev->ro != MD_RDONLY) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); + } else /* st == clean */ { + restart_array(mddev); + if (!set_in_sync(mddev)) + err = -EBUSY; + } + if (!err) + sysfs_notify_dirent_safe(mddev->sysfs_state); + spin_unlock(&mddev->lock); + return err ?: len; + } + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + switch(st) { + case bad_word: + break; + case clear: + /* stopping an active array */ + err = do_md_stop(mddev, 0, NULL); + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) + err = do_md_stop(mddev, 2, NULL); + else + err = 0; /* already inactive */ + break; + case suspended: + break; /* not supported yet */ + case readonly: + if (mddev->pers) + err = md_set_readonly(mddev, NULL); + else { + mddev->ro = MD_RDONLY; + set_disk_ro(mddev->gendisk, 1); + err = do_md_run(mddev); + } + break; + case read_auto: + if (mddev->pers) { + if (md_is_rdwr(mddev)) + err = md_set_readonly(mddev, NULL); + else if (mddev->ro == MD_RDONLY) + err = restart_array(mddev); + if (err == 0) { + mddev->ro = MD_AUTO_READ; + set_disk_ro(mddev->gendisk, 0); + } + } else { + mddev->ro = MD_AUTO_READ; + err = do_md_run(mddev); + } + break; + case clean: + if (mddev->pers) { + err = restart_array(mddev); + if (err) + break; + spin_lock(&mddev->lock); + if (!set_in_sync(mddev)) + err = -EBUSY; + spin_unlock(&mddev->lock); + } else + err = -EINVAL; + break; + case active: + if (mddev->pers) { + err = restart_array(mddev); + if (err) + break; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + wake_up(&mddev->sb_wait); + err = 0; + } else { + mddev->ro = MD_RDWR; + set_disk_ro(mddev->gendisk, 0); + err = do_md_run(mddev); + } + break; + case write_pending: + case active_idle: + case broken: + /* these cannot be set */ + break; + } + + if (!err) { + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_array_state = +__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); + +static ssize_t +max_corrected_read_errors_show(struct mddev *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int rv; + + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + if (n > INT_MAX) + return -EINVAL; + atomic_set(&mddev->max_corr_read_errors, n); + return len; +} + +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + +static ssize_t +null_show(struct mddev *mddev, char *page) +{ + return -EINVAL; +} + +/* need to ensure rdev_delayed_delete() has completed */ +static void flush_rdev_wq(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (work_pending(&rdev->del_work)) { + flush_workqueue(md_rdev_misc_wq); + break; + } + rcu_read_unlock(); +} + +static ssize_t +new_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* buf must be %d:%d\n? giving major and minor numbers */ + /* The new device is added to the array. + * If the array has a persistent superblock, we read the + * superblock to initialise info and check validity. + * Otherwise, only checking done is that in bind_rdev_to_array, + * which mainly checks size. + */ + char *e; + int major = simple_strtoul(buf, &e, 10); + int minor; + dev_t dev; + struct md_rdev *rdev; + int err; + + if (!*buf || *e != ':' || !e[1] || e[1] == '\n') + return -EINVAL; + minor = simple_strtoul(e+1, &e, 10); + if (*e && *e != '\n') + return -EINVAL; + dev = MKDEV(major, minor); + if (major != MAJOR(dev) || + minor != MINOR(dev)) + return -EOVERFLOW; + + flush_rdev_wq(mddev); + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->persistent) { + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) + goto out; + } + } else if (mddev->external) + rdev = md_import_device(dev, -2, -1); + else + rdev = md_import_device(dev, -1, -1); + + if (IS_ERR(rdev)) { + mddev_unlock(mddev); + return PTR_ERR(rdev); + } + err = bind_rdev_to_array(rdev, mddev); + out: + if (err) + export_rdev(rdev); + mddev_unlock(mddev); + if (!err) + md_new_event(); + return err ? err : len; +} + +static struct md_sysfs_entry md_new_device = +__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); + +static ssize_t +bitmap_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *end; + unsigned long chunk, end_chunk; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + if (!mddev->bitmap) + goto out; + /* buf should be ... or - ... (range) */ + while (*buf) { + chunk = end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + if (*end == '-') { /* range */ + buf = end + 1; + end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + } + if (*end && !isspace(*end)) break; + md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + buf = skip_spaces(end); + } + md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ +out: + mddev_unlock(mddev); + return len; +} + +static struct md_sysfs_entry md_bitmap = +__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); + +static ssize_t +size_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); +} + +static int update_size(struct mddev *mddev, sector_t num_sectors); + +static ssize_t +size_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* If array is inactive, we can reduce the component size, but + * not increase it (except from 0). + * If array is active, we can try an on-line resize + */ + sector_t sectors; + int err = strict_blocks_to_sectors(buf, §ors); + + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers) { + err = update_size(mddev, sectors); + if (err == 0) + md_update_sb(mddev, 1); + } else { + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; + else + err = -ENOSPC; + } + mddev_unlock(mddev); + return err ? err : len; +} + +static struct md_sysfs_entry md_size = +__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); + +/* Metadata version. + * This is one of + * 'none' for arrays with no metadata (good luck...) + * 'external' for arrays with externally managed metadata, + * or N.M for internally known formats + */ +static ssize_t +metadata_show(struct mddev *mddev, char *page) +{ + if (mddev->persistent) + return sprintf(page, "%d.%d\n", + mddev->major_version, mddev->minor_version); + else if (mddev->external) + return sprintf(page, "external:%s\n", mddev->metadata_type); + else + return sprintf(page, "none\n"); +} + +static ssize_t +metadata_store(struct mddev *mddev, const char *buf, size_t len) +{ + int major, minor; + char *e; + int err; + /* Changing the details of 'external' metadata is + * always permitted. Otherwise there must be + * no devices attached to the array. + */ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->external && strncmp(buf, "external:", 9) == 0) + ; + else if (!list_empty(&mddev->disks)) + goto out_unlock; + + err = 0; + if (cmd_match(buf, "none")) { + mddev->persistent = 0; + mddev->external = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + goto out_unlock; + } + if (strncmp(buf, "external:", 9) == 0) { + size_t namelen = len-9; + if (namelen >= sizeof(mddev->metadata_type)) + namelen = sizeof(mddev->metadata_type)-1; + strncpy(mddev->metadata_type, buf+9, namelen); + mddev->metadata_type[namelen] = 0; + if (namelen && mddev->metadata_type[namelen-1] == '\n') + mddev->metadata_type[--namelen] = 0; + mddev->persistent = 0; + mddev->external = 1; + mddev->major_version = 0; + mddev->minor_version = 90; + goto out_unlock; + } + major = simple_strtoul(buf, &e, 10); + err = -EINVAL; + if (e==buf || *e != '.') + goto out_unlock; + buf = e+1; + minor = simple_strtoul(buf, &e, 10); + if (e==buf || (*e && *e != '\n') ) + goto out_unlock; + err = -ENOENT; + if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) + goto out_unlock; + mddev->major_version = major; + mddev->minor_version = minor; + mddev->persistent = 1; + mddev->external = 0; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_metadata = +__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t +action_show(struct mddev *mddev, char *page) +{ + char *type = "idle"; + unsigned long recovery = mddev->recovery; + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) + type = "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) + type = "check"; + else + type = "repair"; + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + type = "recover"; + else if (mddev->reshape_position != MaxSector) + type = "reshape"; + } + return sprintf(page, "%s\n", type); +} + +static ssize_t +action_store(struct mddev *mddev, const char *page, size_t len) +{ + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + mddev_lock(mddev) == 0) { + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery. + */ + mddev->reshape_position = save_rp; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + mddev_unlock(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + else if (cmd_match(page, "resync")) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else if (cmd_match(page, "recover")) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if (cmd_match(page, "reshape")) { + int err; + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + err = mddev_lock(mddev); + if (!err) { + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + err = -EBUSY; + else { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + err = mddev->pers->start_reshape(mddev); + } + mddev_unlock(mddev); + } + if (err) + return err; + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + } else { + if (cmd_match(page, "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!cmd_match(page, "repair")) + return -EINVAL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + if (mddev->ro == MD_AUTO_READ) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = MD_RDWR; + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + return len; +} + +static struct md_sysfs_entry md_scan_mode = +__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + +static ssize_t +last_sync_action_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", mddev->last_sync_action); +} + +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); + +static ssize_t +mismatch_cnt_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); +} + +static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); + +static ssize_t +sync_min_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_min(mddev), + mddev->sync_speed_min ? "local": "system"); +} + +static ssize_t +sync_min_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int min; + int rv; + + if (strncmp(buf, "system", 6)==0) { + min = 0; + } else { + rv = kstrtouint(buf, 10, &min); + if (rv < 0) + return rv; + if (min == 0) + return -EINVAL; + } + mddev->sync_speed_min = min; + return len; +} + +static struct md_sysfs_entry md_sync_min = +__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); + +static ssize_t +sync_max_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_max(mddev), + mddev->sync_speed_max ? "local": "system"); +} + +static ssize_t +sync_max_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6)==0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_speed_max = max; + return len; +} + +static struct md_sysfs_entry md_sync_max = +__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); + +static ssize_t +degraded_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->degraded); +} +static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); + +static ssize_t +sync_force_parallel_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->parallel_resync); +} + +static ssize_t +sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) +{ + long n; + + if (kstrtol(buf, 10, &n)) + return -EINVAL; + + if (n != 0 && n != 1) + return -EINVAL; + + mddev->parallel_resync = n; + + if (mddev->sync_thread) + wake_up(&resync_wait); + + return len; +} + +/* force parallel resync, even with shared block devices */ +static struct md_sysfs_entry md_sync_force_parallel = +__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, + sync_force_parallel_show, sync_force_parallel_store); + +static ssize_t +sync_speed_show(struct mddev *mddev, char *page) +{ + unsigned long resync, dt, db; + if (mddev->curr_resync == MD_RESYNC_NONE) + return sprintf(page, "none\n"); + resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); + dt = (jiffies - mddev->resync_mark) / HZ; + if (!dt) dt++; + db = resync - mddev->resync_mark_cnt; + return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ +} + +static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); + +static ssize_t +sync_completed_show(struct mddev *mddev, char *page) +{ + unsigned long long max_sectors, resync; + + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); + + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) + return sprintf(page, "delayed\n"); + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; + else + max_sectors = mddev->dev_sectors; + + resync = mddev->curr_resync_completed; + return sprintf(page, "%llu / %llu\n", resync, max_sectors); +} + +static struct md_sysfs_entry md_sync_completed = + __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); + +static ssize_t +min_sync_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_min); +} +static ssize_t +min_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long long min; + int err; + + if (kstrtoull(buf, 10, &min)) + return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; + if (min > mddev->resync_max) + goto out_unlock; + + err = -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + goto out_unlock; + + /* Round down to multiple of 4K for safety */ + mddev->resync_min = round_down(min, 8); + err = 0; + +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; +} + +static struct md_sysfs_entry md_min_sync = +__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); + +static ssize_t +max_sync_show(struct mddev *mddev, char *page) +{ + if (mddev->resync_max == MaxSector) + return sprintf(page, "max\n"); + else + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_max); +} +static ssize_t +max_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + spin_lock(&mddev->lock); + if (strncmp(buf, "max", 3) == 0) + mddev->resync_max = MaxSector; + else { + unsigned long long max; + int chunk; + + err = -EINVAL; + if (kstrtoull(buf, 10, &max)) + goto out_unlock; + if (max < mddev->resync_min) + goto out_unlock; + + err = -EBUSY; + if (max < mddev->resync_max && md_is_rdwr(mddev) && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + goto out_unlock; + + /* Must be a multiple of chunk_size */ + chunk = mddev->chunk_sectors; + if (chunk) { + sector_t temp = max; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; + } + mddev->resync_max = max; + } + wake_up(&mddev->recovery_wait); + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t +suspend_lo_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); +} + +static ssize_t +suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long long new; + int err; + + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + mddev_suspend(mddev); + mddev->suspend_lo = new; + mddev_resume(mddev); + + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_suspend_lo = +__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); + +static ssize_t +suspend_hi_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); +} + +static ssize_t +suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long long new; + int err; + + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL) + goto unlock; + + mddev_suspend(mddev); + mddev->suspend_hi = new; + mddev_resume(mddev); + + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; +} +static struct md_sysfs_entry md_suspend_hi = +__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); + +static ssize_t +reshape_position_show(struct mddev *mddev, char *page) +{ + if (mddev->reshape_position != MaxSector) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->reshape_position); + strcpy(page, "none\n"); + return 5; +} + +static ssize_t +reshape_position_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_rdev *rdev; + unsigned long long new; + int err; + + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) + return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; + mddev->reshape_position = new; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_reshape_position = +__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, + reshape_position_store); + +static ssize_t +reshape_direction_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", + mddev->reshape_backwards ? "backwards" : "forwards"); +} + +static ssize_t +reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) +{ + int backwards = 0; + int err; + + if (cmd_match(buf, "forwards")) + backwards = 0; + else if (cmd_match(buf, "backwards")) + backwards = 1; + else + return -EINVAL; + if (mddev->reshape_backwards == backwards) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + /* check if we are allowed to change */ + if (mddev->delta_disks) + err = -EBUSY; + else if (mddev->persistent && + mddev->major_version == 0) + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_reshape_direction = +__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, + reshape_direction_store); + +static ssize_t +array_size_show(struct mddev *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(struct mddev *mddev, const char *buf, size_t len) +{ + sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + /* cluster raid doesn't support change array_sectors */ + if (mddev_is_clustered(mddev)) { + mddev_unlock(mddev); + return -EINVAL; + } + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, §ors) < 0) + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; + } + + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); + +static ssize_t +consistency_policy_show(struct mddev *mddev, char *page) +{ + int ret; + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + ret = sprintf(page, "journal\n"); + } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { + ret = sprintf(page, "ppl\n"); + } else if (mddev->bitmap) { + ret = sprintf(page, "bitmap\n"); + } else if (mddev->pers) { + if (mddev->pers->sync_request) + ret = sprintf(page, "resync\n"); + else + ret = sprintf(page, "none\n"); + } else { + ret = sprintf(page, "unknown\n"); + } + + return ret; +} + +static ssize_t +consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err = 0; + + if (mddev->pers) { + if (mddev->pers->change_consistency_policy) + err = mddev->pers->change_consistency_policy(mddev, buf); + else + err = -EBUSY; + } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { + set_bit(MD_HAS_PPL, &mddev->flags); + } else { + err = -EINVAL; + } + + return err ? err : len; +} + +static struct md_sysfs_entry md_consistency_policy = +__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, + consistency_policy_store); + +static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->fail_last_dev); +} + +/* + * Setting fail_last_dev to true to allow last device to be forcibly removed + * from RAID1/RAID10. + */ +static ssize_t +fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + int ret; + bool value; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + if (value != mddev->fail_last_dev) + mddev->fail_last_dev = value; + + return len; +} +static struct md_sysfs_entry md_fail_last_dev = +__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, + fail_last_dev_store); + +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. + */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + +static struct attribute *md_default_attrs[] = { + &md_level.attr, + &md_layout.attr, + &md_raid_disks.attr, + &md_uuid.attr, + &md_chunk_size.attr, + &md_size.attr, + &md_resync_start.attr, + &md_metadata.attr, + &md_new_device.attr, + &md_safe_delay.attr, + &md_array_state.attr, + &md_reshape_position.attr, + &md_reshape_direction.attr, + &md_array_size.attr, + &max_corr_read_errors.attr, + &md_consistency_policy.attr, + &md_fail_last_dev.attr, + &md_serialize_policy.attr, + NULL, +}; + +static const struct attribute_group md_default_group = { + .attrs = md_default_attrs, +}; + +static struct attribute *md_redundancy_attrs[] = { + &md_scan_mode.attr, + &md_last_scan_mode.attr, + &md_mismatches.attr, + &md_sync_min.attr, + &md_sync_max.attr, + &md_sync_speed.attr, + &md_sync_force_parallel.attr, + &md_sync_completed.attr, + &md_min_sync.attr, + &md_max_sync.attr, + &md_suspend_lo.attr, + &md_suspend_hi.attr, + &md_bitmap.attr, + &md_degraded.attr, + NULL, +}; +static const struct attribute_group md_redundancy_group = { + .name = NULL, + .attrs = md_redundancy_attrs, +}; + +static const struct attribute_group *md_attr_groups[] = { + &md_default_group, + &md_bitmap_group, + NULL, +}; + +static ssize_t +md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); + ssize_t rv; + + if (!entry->show) + return -EIO; + spin_lock(&all_mddevs_lock); + if (!mddev_get(mddev)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + spin_unlock(&all_mddevs_lock); + + rv = entry->show(mddev, page); + mddev_put(mddev); + return rv; +} + +static ssize_t +md_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); + ssize_t rv; + + if (!entry->store) + return -EIO; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + spin_lock(&all_mddevs_lock); + if (!mddev_get(mddev)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + spin_unlock(&all_mddevs_lock); + rv = entry->store(mddev, page, length); + mddev_put(mddev); + return rv; +} + +static void md_kobj_release(struct kobject *ko) +{ + struct mddev *mddev = container_of(ko, struct mddev, kobj); + + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); +} + +static const struct sysfs_ops md_sysfs_ops = { + .show = md_attr_show, + .store = md_attr_store, +}; +static struct kobj_type md_ktype = { + .release = md_kobj_release, + .sysfs_ops = &md_sysfs_ops, + .default_groups = md_attr_groups, +}; + +int mdp_major = 0; + +static void mddev_delayed_delete(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + kobject_put(&mddev->kobj); +} + +static void no_op(struct percpu_ref *r) {} + +int mddev_init_writes_pending(struct mddev *mddev) +{ + if (mddev->writes_pending.percpu_count_ptr) + return 0; + if (percpu_ref_init(&mddev->writes_pending, no_op, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) + return -ENOMEM; + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + return 0; +} +EXPORT_SYMBOL_GPL(mddev_init_writes_pending); + +struct mddev *md_alloc(dev_t dev, char *name) +{ + /* + * If dev is zero, name is the name of a device to allocate with + * an arbitrary minor number. It will be "md_???" + * If dev is non-zero it must be a device number with a MAJOR of + * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then + * the device is being created by opening a node in /dev. + * If "name" is not NULL, the device is being created by + * writing to /sys/module/md_mod/parameters/new_array. + */ + static DEFINE_MUTEX(disks_mutex); + struct mddev *mddev; + struct gendisk *disk; + int partitioned; + int shift; + int unit; + int error ; + + /* + * Wait for any previous instance of this device to be completely + * removed (mddev_delayed_delete). + */ + flush_workqueue(md_misc_wq); + flush_workqueue(md_rdev_misc_wq); + + mutex_lock(&disks_mutex); + mddev = mddev_alloc(dev); + if (IS_ERR(mddev)) { + error = PTR_ERR(mddev); + goto out_unlock; + } + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; + + if (name && !dev) { + /* Need to ensure that 'name' is not a duplicate. + */ + struct mddev *mddev2; + spin_lock(&all_mddevs_lock); + + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) + if (mddev2->gendisk && + strcmp(mddev2->gendisk->disk_name, name) == 0) { + spin_unlock(&all_mddevs_lock); + error = -EEXIST; + goto out_free_mddev; + } + spin_unlock(&all_mddevs_lock); + } + if (name && dev) + /* + * Creating /dev/mdNNN via "newarray", so adjust hold_active. + */ + mddev->hold_active = UNTIL_STOP; + + error = -ENOMEM; + disk = blk_alloc_disk(NUMA_NO_NODE); + if (!disk) + goto out_free_mddev; + + disk->major = MAJOR(mddev->unit); + disk->first_minor = unit << shift; + disk->minors = 1 << shift; + if (name) + strcpy(disk->disk_name, name); + else if (partitioned) + sprintf(disk->disk_name, "md_d%d", unit); + else + sprintf(disk->disk_name, "md%d", unit); + disk->fops = &md_fops; + disk->private_data = mddev; + + mddev->queue = disk->queue; + blk_set_stacking_limits(&mddev->queue->limits); + blk_queue_write_cache(mddev->queue, true, true); + disk->events |= DISK_EVENT_MEDIA_CHANGE; + mddev->gendisk = disk; + error = add_disk(disk); + if (error) + goto out_put_disk; + + kobject_init(&mddev->kobj, &md_ktype); + error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); + if (error) { + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. + */ + mddev->hold_active = 0; + mutex_unlock(&disks_mutex); + mddev_put(mddev); + return ERR_PTR(error); + } + + kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); + mutex_unlock(&disks_mutex); + return mddev; + +out_put_disk: + put_disk(disk); +out_free_mddev: + mddev_free(mddev); +out_unlock: + mutex_unlock(&disks_mutex); + return ERR_PTR(error); +} + +static int md_alloc_and_put(dev_t dev, char *name) +{ + struct mddev *mddev = md_alloc(dev, name); + + if (IS_ERR(mddev)) + return PTR_ERR(mddev); + mddev_put(mddev); + return 0; +} + +static void md_probe(dev_t dev) +{ + if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) + return; + if (create_on_open) + md_alloc_and_put(dev, NULL); +} + +static int add_named_array(const char *val, const struct kernel_param *kp) +{ + /* + * val must be "md_*" or "mdNNN". + * For "md_*" we allocate an array with a large free minor number, and + * set the name to val. val must not already be an active name. + * For "mdNNN" we allocate an array with the minor number NNN + * which must not already be in use. + */ + int len = strlen(val); + char buf[DISK_NAME_LEN]; + unsigned long devnum; + + while (len && val[len-1] == '\n') + len--; + if (len >= DISK_NAME_LEN) + return -E2BIG; + strscpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) == 0) + return md_alloc_and_put(0, buf); + if (strncmp(buf, "md", 2) == 0 && + isdigit(buf[2]) && + kstrtoul(buf+2, 10, &devnum) == 0 && + devnum <= MINORMASK) + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); + + return -EINVAL; +} + +static void md_safemode_timeout(struct timer_list *t) +{ + struct mddev *mddev = from_timer(mddev, t, safemode_timer); + + mddev->safemode = 1; + if (mddev->external) + sysfs_notify_dirent_safe(mddev->sysfs_state); + + md_wakeup_thread(mddev->thread); +} + +static int start_dirty_degraded; +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} + +int md_run(struct mddev *mddev) +{ + int err; + struct md_rdev *rdev; + struct md_personality *pers; + bool nowait = true; + + if (list_empty(&mddev->disks)) + /* cannot run an array with no devices.. */ + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks) { + if (!mddev->persistent) + return -EINVAL; + err = analyze_sbs(mddev); + if (err) + return -EINVAL; + } + + if (mddev->level != LEVEL_NONE) + request_module("md-level-%d", mddev->level); + else if (mddev->clevel[0]) + request_module("md-%s", mddev->clevel); + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + */ + mddev->has_superblocks = false; + rdev_for_each(rdev, mddev) { + if (test_bit(Faulty, &rdev->flags)) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev); + if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { + mddev->ro = MD_RDONLY; + if (mddev->gendisk) + set_disk_ro(mddev->gendisk, 1); + } + + if (rdev->sb_page) + mddev->has_superblocks = true; + + /* perform some consistency tests on the device. + * We don't want the data to overlap the metadata, + * Internal Bitmap issues have been handled elsewhere. + */ + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors + > rdev->sb_start) { + pr_warn("md: %s: data overlaps metadata\n", + mdname(mddev)); + return -EINVAL; + } + } else { + if (rdev->sb_start + rdev->sb_size/512 + > rdev->data_offset) { + pr_warn("md: %s: metadata overlaps data\n", + mdname(mddev)); + return -EINVAL; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + nowait = nowait && bdev_nowait(rdev->bdev); + } + + err = percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (err) + return err; + + if (!bioset_initialized(&mddev->bio_set)) { + err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_active_io; + } + if (!bioset_initialized(&mddev->sync_set)) { + err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + goto exit_bio_set; + } + + spin_lock(&pers_lock); + pers = find_pers(mddev->level, mddev->clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + if (mddev->level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + mddev->level); + else + pr_warn("md: personality for level %s is not loaded!\n", + mddev->clevel); + err = -EINVAL; + goto abort; + } + spin_unlock(&pers_lock); + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + + if (mddev->reshape_position != MaxSector && + pers->start_reshape == NULL) { + /* This personality cannot handle reshaping... */ + module_put(pers->owner); + err = -EINVAL; + goto abort; + } + + if (pers->sync_request) { + /* Warn if this is a potentially silly + * configuration. + */ + struct md_rdev *rdev2; + int warned = 0; + + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { + if (rdev < rdev2 && + rdev->bdev->bd_disk == + rdev2->bdev->bd_disk) { + pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", + mdname(mddev), + rdev->bdev, + rdev2->bdev); + warned = 1; + } + } + + if (warned) + pr_warn("True protection against single-disk failure might be compromised.\n"); + } + + mddev->recovery = 0; + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + + mddev->ok_start_degraded = start_dirty_degraded; + + if (start_readonly && md_is_rdwr(mddev)) + mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ + + err = pers->run(mddev); + if (err) + pr_warn("md: pers->run() failed ...\n"); + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, + "%s: default size too small, but 'external_size' not in effect?\n", + __func__); + pr_warn("md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + } + if (err == 0 && pers->sync_request && + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { + struct bitmap *bitmap; + + bitmap = md_bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) { + err = PTR_ERR(bitmap); + pr_warn("%s: failed to create bitmap (%d)\n", + mdname(mddev), err); + } else + mddev->bitmap = bitmap; + + } + if (err) + goto bitmap_abort; + + if (mddev->bitmap_info.max_write_behind > 0) { + bool create_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_serial(rdev)) + create_pool = true; + } + if (create_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { + err = -ENOMEM; + goto bitmap_abort; + } + } + } + + if (mddev->queue) { + bool nonrot = true; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { + nonrot = false; + break; + } + } + if (mddev->degraded) + nonrot = false; + if (nonrot) + blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); + else + blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); + + /* Set the NOWAIT flags if all underlying devices support it */ + if (nowait) + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); + } + if (pers->sync_request) { + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + } else if (mddev->ro == MD_AUTO_READ) + mddev->ro = MD_RDWR; + + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); + mddev->safemode = 0; + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; + mddev->in_sync = 1; + smp_wmb(); + spin_lock(&mddev->lock); + mddev->pers = pers; + spin_unlock(&mddev->lock); + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_link_rdev(mddev, rdev); /* failure here is OK */ + + if (mddev->degraded && md_is_rdwr(mddev)) + /* This ensures that recovering status is reported immediately + * via sysfs - until a lack of spares is confirmed. + */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + + if (mddev->sb_flags) + md_update_sb(mddev, 0); + + md_new_event(); + return 0; + +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); +abort: + bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); +exit_active_io: + percpu_ref_exit(&mddev->active_io); + return err; +} +EXPORT_SYMBOL_GPL(md_run); + +int do_md_run(struct mddev *mddev) +{ + int err; + + set_bit(MD_NOT_READY, &mddev->flags); + err = md_run(mddev); + if (err) + goto out; + err = md_bitmap_load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } + + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + + /* run start up tasks that require md_thread */ + md_start(mddev); + + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + clear_bit(MD_NOT_READY, &mddev->flags); + mddev->changed = 1; + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); +out: + clear_bit(MD_NOT_READY, &mddev->flags); + return err; +} + +int md_start(struct mddev *mddev) +{ + int ret = 0; + + if (mddev->pers->start) { + set_bit(MD_RECOVERY_WAIT, &mddev->recovery); + md_wakeup_thread(mddev->thread); + ret = mddev->pers->start(mddev); + clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); + md_wakeup_thread(mddev->sync_thread); + } + return ret; +} +EXPORT_SYMBOL_GPL(md_start); + +static int restart_array(struct mddev *mddev) +{ + struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + bool has_journal = false; + bool has_readonly = false; + + /* Complain if it has no devices */ + if (list_empty(&mddev->disks)) + return -ENXIO; + if (!mddev->pers) + return -EINVAL; + if (md_is_rdwr(mddev)) + return -EBUSY; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + has_journal = true; + if (rdev_read_only(rdev)) + has_readonly = true; + } + rcu_read_unlock(); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) + /* Don't restart rw with journal missing/faulty */ + return -EINVAL; + if (has_readonly) + return -EROFS; + + mddev->safemode = 0; + mddev->ro = MD_RDWR; + set_disk_ro(disk, 0); + pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); + /* Kick recovery or resync if necessary */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; +} + +static void md_clean(struct mddev *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->sb_flags = 0; + mddev->ro = MD_RDWR; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + atomic64_set(&mddev->resync_mismatches, 0); + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 0; + mddev->safemode = 0; + mddev->private = NULL; + mddev->cluster_info = NULL; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.default_space = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; + mddev->bitmap_info.nodes = 0; +} + +static void __md_stop_writes(struct mddev *mddev) +{ + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + md_reap_sync_thread(mddev); + } + + del_timer_sync(&mddev->safemode_timer); + + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_bitmap_flush(mddev); + + if (md_is_rdwr(mddev) && + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + mddev->sb_flags)) { + /* mark array as shutdown cleanly */ + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); +} + +void md_stop_writes(struct mddev *mddev) +{ + mddev_lock_nointr(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} +EXPORT_SYMBOL_GPL(md_stop_writes); + +static void mddev_detach(struct mddev *mddev) +{ + md_bitmap_wait_behind_writes(mddev); + if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(&mddev->thread); + if (mddev->queue) + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +} + +static void __md_stop(struct mddev *mddev) +{ + struct md_personality *pers = mddev->pers; + md_bitmap_destroy(mddev); + mddev_detach(mddev); + /* Ensure ->event_work is done */ + if (mddev->event_work.func) + flush_workqueue(md_misc_wq); + spin_lock(&mddev->lock); + mddev->pers = NULL; + spin_unlock(&mddev->lock); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + if (pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(pers->owner); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + percpu_ref_exit(&mddev->active_io); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); +} + +void md_stop(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop_writes(mddev); + __md_stop(mddev); + percpu_ref_exit(&mddev->writes_pending); +} + +EXPORT_SYMBOL_GPL(md_stop); + +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) +{ + int err = 0; + int did_freeze = 0; + + if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + return -EBUSY; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + if (mddev->sync_thread) + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + + mddev_unlock(mddev); + wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, + &mddev->recovery)); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || + mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + pr_warn("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + __md_stop_writes(mddev); + + if (mddev->ro == MD_RDONLY) { + err = -ENXIO; + goto out; + } + + mddev->ro = MD_RDONLY; + set_disk_ro(mddev->gendisk, 1); + } + +out: + if ((mddev->pers && !err) || did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + + mutex_unlock(&mddev->open_mutex); + return err; +} + +/* mode: + * 0 - completely stop and dis-assemble array + * 2 - stop but do not disassemble array + */ +static int do_md_stop(struct mddev *mddev, int mode, + struct block_device *bdev) +{ + struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + if (mddev->sync_thread) + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + + mddev_unlock(mddev); + wait_event(resync_wait, (mddev->sync_thread == NULL && + !test_bit(MD_RECOVERY_RUNNING, + &mddev->recovery))); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || + mddev->sysfs_active || + mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + pr_warn("md: %s still in use.\n",mdname(mddev)); + mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + return -EBUSY; + } + if (mddev->pers) { + if (!md_is_rdwr(mddev)) + set_disk_ro(disk, 0); + + __md_stop_writes(mddev); + __md_stop(mddev); + + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent_safe(mddev->sysfs_state); + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); + + set_capacity_and_notify(disk, 0); + mutex_unlock(&mddev->open_mutex); + mddev->changed = 1; + + if (!md_is_rdwr(mddev)) + mddev->ro = MD_RDWR; + } else + mutex_unlock(&mddev->open_mutex); + /* + * Free resources if final stop + */ + if (mode == 0) { + pr_info("md: %s stopped.\n", mdname(mddev)); + + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } + mddev->bitmap_info.offset = 0; + + export_array(mddev); + + md_clean(mddev); + if (mddev->hold_active == UNTIL_STOP) + mddev->hold_active = 0; + } + md_new_event(); + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; +} + +#ifndef MODULE +static void autorun_array(struct mddev *mddev) +{ + struct md_rdev *rdev; + int err; + + if (list_empty(&mddev->disks)) + return; + + pr_info("md: running: "); + + rdev_for_each(rdev, mddev) { + pr_cont("<%pg>", rdev->bdev); + } + pr_cont("\n"); + + err = do_md_run(mddev); + if (err) { + pr_warn("md: do_md_run() returned %d\n", err); + do_md_stop(mddev, 0, NULL); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(int part) +{ + struct md_rdev *rdev0, *rdev, *tmp; + struct mddev *mddev; + + pr_info("md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + int unit; + dev_t dev; + LIST_HEAD(candidates); + rdev0 = list_entry(pending_raid_disks.next, + struct md_rdev, same_set); + + pr_debug("md: considering %pg ...\n", rdev0->bdev); + INIT_LIST_HEAD(&candidates); + rdev_for_each_list(rdev, tmp, &pending_raid_disks) + if (super_90_load(rdev, rdev0, 0) >= 0) { + pr_debug("md: adding %pg ...\n", + rdev->bdev); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { + pr_warn("md: unit number in %pg is bad: %d\n", + rdev0->bdev, rdev0->preferred_minor); + break; + } + + mddev = md_alloc(dev, NULL); + if (IS_ERR(mddev)) + break; + + if (mddev_lock(mddev)) + pr_warn("md: %s locked, cannot run\n", mdname(mddev)); + else if (mddev->raid_disks || mddev->major_version + || !list_empty(&mddev->disks)) { + pr_warn("md: %s already running, cannot run %pg\n", + mdname(mddev), rdev0->bdev); + mddev_unlock(mddev); + } else { + pr_debug("md: created %s\n", mdname(mddev)); + mddev->persistent = 1; + rdev_for_each_list(rdev, tmp, &candidates) { + list_del_init(&rdev->same_set); + if (bind_rdev_to_array(rdev, mddev)) + export_rdev(rdev); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it won't... + */ + rdev_for_each_list(rdev, tmp, &candidates) { + list_del_init(&rdev->same_set); + export_rdev(rdev); + } + mddev_put(mddev); + } + pr_info("md: ... autorun DONE.\n"); +} +#endif /* !MODULE */ + +static int get_version(void __user *arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +static int get_array_info(struct mddev *mddev, void __user *arg) +{ + mdu_array_info_t info; + int nr,working,insync,failed,spare; + struct md_rdev *rdev; + + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + nr++; + if (test_bit(Faulty, &rdev->flags)) + failed++; + else { + working++; + if (test_bit(In_sync, &rdev->flags)) + insync++; + else if (test_bit(Journal, &rdev->flags)) + /* TODO: add journal count to md_u.h */ + ; + else + spare++; + } + } + rcu_read_unlock(); + + info.major_version = mddev->major_version; + info.minor_version = mddev->minor_version; + info.patch_version = MD_PATCHLEVEL_VERSION; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); + info.level = mddev->level; + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ + info.size = -1; + info.nr_disks = nr; + info.raid_disks = mddev->raid_disks; + info.md_minor = mddev->md_minor; + info.not_persistent= !mddev->persistent; + + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); + info.state = 0; + if (mddev->in_sync) + info.state = (1<bitmap && mddev->bitmap_info.offset) + info.state |= (1<layout; + info.chunk_size = mddev->chunk_sectors << 9; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int get_bitmap_file(struct mddev *mddev, void __user * arg) +{ + mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ + char *ptr; + int err; + + file = kzalloc(sizeof(*file), GFP_NOIO); + if (!file) + return -ENOMEM; + + err = 0; + spin_lock(&mddev->lock); + /* bitmap enabled */ + if (mddev->bitmap_info.file) { + ptr = file_path(mddev->bitmap_info.file, file->pathname, + sizeof(file->pathname)); + if (IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + } + spin_unlock(&mddev->lock); + + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) + err = -EFAULT; + + kfree(file); + return err; +} + +static int get_disk_info(struct mddev *mddev, void __user * arg) +{ + mdu_disk_info_t info; + struct md_rdev *rdev; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, info.number); + if (rdev) { + info.major = MAJOR(rdev->bdev->bd_dev); + info.minor = MINOR(rdev->bdev->bd_dev); + info.raid_disk = rdev->raid_disk; + info.state = 0; + if (test_bit(Faulty, &rdev->flags)) + info.state |= (1<flags)) { + info.state |= (1<flags)) + info.state |= (1<flags)) + info.state |= (1<flags)) + info.state |= (1<major,info->minor); + + if (mddev_is_clustered(mddev) && + !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { + pr_warn("%s: Cannot add to clustered mddev.\n", + mdname(mddev)); + return -EINVAL; + } + + if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) + return -EOVERFLOW; + + if (!mddev->raid_disks) { + int err; + /* expecting a device which has a superblock */ + rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); + if (IS_ERR(rdev)) { + pr_warn("md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + if (!list_empty(&mddev->disks)) { + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) { + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, + rdev0->bdev); + export_rdev(rdev); + return -EINVAL; + } + } + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + return err; + } + + /* + * md_add_new_disk can be used once the array is assembled + * to add "hot spares". They must already have a superblock + * written + */ + if (mddev->pers) { + int err; + if (!mddev->pers->hot_add_disk) { + pr_warn("%s: personality does not support diskops!\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->persistent) + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + else + rdev = md_import_device(dev, -1, -1); + if (IS_ERR(rdev)) { + pr_warn("md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + /* set saved_raid_disk if appropriate */ + if (!mddev->persistent) { + if (info->state & (1<raid_disk < mddev->raid_disks) { + rdev->raid_disk = info->raid_disk; + set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + } else + rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; + } else + super_types[mddev->major_version]. + validate_super(mddev, rdev); + if ((info->state & (1<raid_disk != info->raid_disk) { + /* This was a hot-add request, but events doesn't + * match, so reject it. + */ + export_rdev(rdev); + return -EINVAL; + } + + clear_bit(In_sync, &rdev->flags); /* just to be sure */ + if (info->state & (1<flags); + else + clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<flags); + else + clear_bit(FailFast, &rdev->flags); + + if (info->state & (1<flags)) { + has_journal = true; + break; + } + } + if (has_journal || mddev->bitmap) { + export_rdev(rdev); + return -EBUSY; + } + set_bit(Journal, &rdev->flags); + } + /* + * check whether the device shows up in other nodes + */ + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + set_bit(Candidate, &rdev->flags); + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + /* --add initiated by this node */ + err = md_cluster_ops->add_new_disk(mddev, rdev); + if (err) { + export_rdev(rdev); + return err; + } + } + } + + rdev->raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + + if (err) + export_rdev(rdev); + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + if (!err) { + err = md_cluster_ops->new_disk_ack(mddev, + err == 0); + if (err) + md_kick_rdev_from_array(rdev); + } + } else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) + err = add_bound_rdev(rdev); + + return err; + } + + /* otherwise, md_add_new_disk is only allowed + * for major_version==0 superblocks + */ + if (mddev->major_version != 0) { + pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); + return -EINVAL; + } + + if (!(info->state & (1<desc_nr = info->number; + if (info->raid_disk < mddev->raid_disks) + rdev->raid_disk = info->raid_disk; + else + rdev->raid_disk = -1; + + if (rdev->raid_disk < mddev->raid_disks) + if (info->state & (1<flags); + + if (info->state & (1<flags); + if (info->state & (1<flags); + + if (!mddev->persistent) { + pr_debug("md: nonpersistent superblock ...\n"); + rdev->sb_start = bdev_nr_sectors(rdev->bdev); + } else + rdev->sb_start = calc_dev_sboffset(rdev); + rdev->sectors = rdev->sb_start; + + err = bind_rdev_to_array(rdev, mddev); + if (err) { + export_rdev(rdev); + return err; + } + } + + return 0; +} + +static int hot_remove_disk(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + if (!mddev->pers) + return -ENODEV; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk < 0) + goto kick_rdev; + + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + + if (rdev->raid_disk >= 0) + goto busy; + +kick_rdev: + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } + + md_kick_rdev_from_array(rdev); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (mddev->thread) + md_wakeup_thread(mddev->thread); + else + md_update_sb(mddev, 1); + md_new_event(); + + return 0; +busy: + pr_debug("md: cannot remove active disk %pg from %s ...\n", + rdev->bdev, mdname(mddev)); + return -EBUSY; +} + +static int hot_add_disk(struct mddev *mddev, dev_t dev) +{ + int err; + struct md_rdev *rdev; + + if (!mddev->pers) + return -ENODEV; + + if (mddev->major_version != 0) { + pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", + mdname(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + pr_warn("%s: personality does not support diskops!\n", + mdname(mddev)); + return -EINVAL; + } + + rdev = md_import_device(dev, -1, 0); + if (IS_ERR(rdev)) { + pr_warn("md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + if (mddev->persistent) + rdev->sb_start = calc_dev_sboffset(rdev); + else + rdev->sb_start = bdev_nr_sectors(rdev->bdev); + + rdev->sectors = rdev->sb_start; + + if (test_bit(Faulty, &rdev->flags)) { + pr_warn("md: can not hot-add faulty %pg disk to %s!\n", + rdev->bdev, mdname(mddev)); + err = -EINVAL; + goto abort_export; + } + + clear_bit(In_sync, &rdev->flags); + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + if (err) + goto abort_export; + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + rdev->raid_disk = -1; + + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->thread) + md_update_sb(mddev, 1); + /* + * If the new disk does not support REQ_NOWAIT, + * disable on the whole MD. + */ + if (!bdev_nowait(rdev->bdev)) { + pr_info("%s: Disabling nowait because %pg does not support nowait\n", + mdname(mddev), rdev->bdev); + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); + } + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_new_event(); + return 0; + +abort_export: + export_rdev(rdev); + return err; +} + +static int set_bitmap_file(struct mddev *mddev, int fd) +{ + int err = 0; + + if (mddev->pers) { + if (!mddev->pers->quiesce || !mddev->thread) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + /* we should be able to change the bitmap.. */ + } + + if (fd >= 0) { + struct inode *inode; + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) + return -EEXIST; /* cannot add when bitmap is present */ + f = fget(fd); + + if (f == NULL) { + pr_warn("%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + inode = f->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + pr_warn("%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(f->f_mode & FMODE_WRITE)) { + pr_warn("%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { + pr_warn("%s: error: bitmap file is already in use\n", + mdname(mddev)); + err = -EBUSY; + } + if (err) { + fput(f); + return err; + } + mddev->bitmap_info.file = f; + mddev->bitmap_info.offset = 0; /* file overrides offset */ + } else if (mddev->bitmap == NULL) + return -ENOENT; /* cannot remove what isn't there */ + err = 0; + if (mddev->pers) { + if (fd >= 0) { + struct bitmap *bitmap; + + bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; + err = md_bitmap_load(mddev); + } else + err = PTR_ERR(bitmap); + if (err) { + md_bitmap_destroy(mddev); + fd = -1; + } + mddev_resume(mddev); + } else if (fd < 0) { + mddev_suspend(mddev); + md_bitmap_destroy(mddev); + mddev_resume(mddev); + } + } + if (fd < 0) { + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } + } + + return err; +} + +/* + * md_set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > 0 and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * use to determine which style super-blocks are to be found on the devices. + * The minor and patch _version numbers are also kept incase the + * super_block handler wishes to interpret them. + */ +int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) +{ + if (info->raid_disks == 0) { + /* just setting version number for superblock loading */ + if (info->major_version < 0 || + info->major_version >= ARRAY_SIZE(super_types) || + super_types[info->major_version].name == NULL) { + /* maybe try to auto-load a module? */ + pr_warn("md: superblock version %d not known\n", + info->major_version); + return -EINVAL; + } + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; + mddev->persistent = !info->not_persistent; + /* ensure mddev_put doesn't delete this now that there + * is some minimal configuration. + */ + mddev->ctime = ktime_get_real_seconds(); + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; + mddev->minor_version = MD_MINOR_VERSION; + mddev->patch_version = MD_PATCHLEVEL_VERSION; + mddev->ctime = ktime_get_real_seconds(); + + mddev->level = info->level; + mddev->clevel[0] = 0; + mddev->dev_sectors = 2 * (sector_t)info->size; + mddev->raid_disks = info->raid_disks; + /* don't set md_minor, it is determined by which /dev/md* was + * openned + */ + if (info->state & (1<recovery_cp = MaxSector; + else + mddev->recovery_cp = 0; + mddev->persistent = ! info->not_persistent; + mddev->external = 0; + + mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; + mddev->chunk_sectors = info->chunk_size >> 9; + + if (mddev->persistent) { + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; + } + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->bitmap_info.offset = 0; + + mddev->reshape_position = MaxSector; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(mddev->uuid, 16); + + mddev->new_level = mddev->level; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + + return 0; +} + +void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + if (mddev->external_size) + return; + + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + +static int update_size(struct mddev *mddev, sector_t num_sectors) +{ + struct md_rdev *rdev; + int rv; + int fit = (num_sectors == 0); + sector_t old_dev_sectors = mddev->dev_sectors; + + if (mddev->pers->resize == NULL) + return -EINVAL; + /* The "num_sectors" is the number of sectors of each device that + * is used. This can only make sense for arrays with redundancy. + * linear and raid0 always use whatever space is available. We can only + * consider changing this number if no resync or reconstruction is + * happening, and if the new size is acceptable. It must fit before the + * sb_start or, if that is recovery) || + mddev->sync_thread) + return -EBUSY; + if (!md_is_rdwr(mddev)) + return -EROFS; + + rdev_for_each(rdev, mddev) { + sector_t avail = rdev->sectors; + + if (fit && (num_sectors == 0 || num_sectors > avail)) + num_sectors = avail; + if (avail < num_sectors) + return -ENOSPC; + } + rv = mddev->pers->resize(mddev, num_sectors); + if (!rv) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->update_size(mddev, old_dev_sectors); + else if (mddev->queue) { + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + } + return rv; +} + +static int update_raid_disks(struct mddev *mddev, int raid_disks) +{ + int rv; + struct md_rdev *rdev; + /* change the number of raid disks */ + if (mddev->pers->check_reshape == NULL) + return -EINVAL; + if (!md_is_rdwr(mddev)) + return -EROFS; + if (raid_disks <= 0 || + (mddev->max_disks && raid_disks >= mddev->max_disks)) + return -EINVAL; + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || + mddev->reshape_position != MaxSector) + return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + + mddev->delta_disks = raid_disks - mddev->raid_disks; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + else if (mddev->delta_disks > 0) + mddev->reshape_backwards = 0; + + rv = mddev->pers->check_reshape(mddev); + if (rv < 0) { + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } + return rv; +} + +/* + * update_array_info is used to change the configuration of an + * on-line array. + * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size + * fields in the info are checked against the array. + * Any differences that cannot be handled will cause an error. + * Normally, only one change can be managed at a time. + */ +static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) +{ + int rv = 0; + int cnt = 0; + int state = 0; + + /* calculate expected state,ignoring low bits */ + if (mddev->bitmap && mddev->bitmap_info.offset) + state |= (1 << MD_SB_BITMAP_PRESENT); + + if (mddev->major_version != info->major_version || + mddev->minor_version != info->minor_version || +/* mddev->patch_version != info->patch_version || */ + mddev->ctime != info->ctime || + mddev->level != info->level || +/* mddev->layout != info->layout || */ + mddev->persistent != !info->not_persistent || + mddev->chunk_sectors != info->chunk_size >> 9 || + /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ + ((state^info->state) & 0xfffffe00) + ) + return -EINVAL; + /* Check there is only one change */ + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1< 1) + return -EINVAL; + + if (mddev->layout != info->layout) { + /* Change layout + * we don't need to do anything at the md level, the + * personality will take care of it all. + */ + if (mddev->pers->check_reshape == NULL) + return -EINVAL; + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } + } + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + rv = update_size(mddev, (sector_t)info->size * 2); + + if (mddev->raid_disks != info->raid_disks) + rv = update_raid_disks(mddev, info->raid_disks); + + if ((state ^ info->state) & (1<pers->quiesce == NULL || mddev->thread == NULL) { + rv = -EINVAL; + goto err; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto err; + } + if (info->state & (1<bitmap) { + rv = -EEXIST; + goto err; + } + if (mddev->bitmap_info.default_offset == 0) { + rv = -EINVAL; + goto err; + } + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; + bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; + rv = md_bitmap_load(mddev); + } else + rv = PTR_ERR(bitmap); + if (rv) + md_bitmap_destroy(mddev); + mddev_resume(mddev); + } else { + /* remove the bitmap */ + if (!mddev->bitmap) { + rv = -ENOENT; + goto err; + } + if (mddev->bitmap->storage.file) { + rv = -EINVAL; + goto err; + } + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); + rv = -EPERM; + md_cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; + } + mddev_suspend(mddev); + md_bitmap_destroy(mddev); + mddev_resume(mddev); + mddev->bitmap_info.offset = 0; + } + } + md_update_sb(mddev, 1); + return rv; +err: + return rv; +} + +static int set_disk_faulty(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + int err = 0; + + if (mddev->pers == NULL) + return -ENODEV; + + rcu_read_lock(); + rdev = md_find_rdev_rcu(mddev, dev); + if (!rdev) + err = -ENODEV; + else { + md_error(mddev, rdev); + if (test_bit(MD_BROKEN, &mddev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; +} + +/* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ +static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mddev *mddev = bdev->bd_disk->private_data; + + geo->heads = 2; + geo->sectors = 4; + geo->cylinders = mddev->array_sectors / 8; + return 0; +} + +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + case CLUSTERED_DISK_NACK: + return true; + default: + return false; + } +} + +static int md_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + void __user *argp = (void __user *)arg; + struct mddev *mddev = NULL; + bool did_set_md_closing = false; + + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto out; + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = bdev->bd_disk->private_data; + + if (!mddev) { + BUG(); + goto out; + } + + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto out; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto out; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto out; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto out; + + } + + if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) + flush_rdev_wq(mddev); + + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->recovery), + msecs_to_jiffies(5000)); + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { + /* Need to flush page cache, and ensure no-one else opens + * and writes + */ + mutex_lock(&mddev->open_mutex); + if (mddev->pers && atomic_read(&mddev->openers) > 1) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } + did_set_md_closing = true; + mutex_unlock(&mddev->open_mutex); + sync_blockdev(bdev); + } + err = mddev_lock(mddev); + if (err) { + pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto out; + } + + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + pr_warn("md: couldn't update array info. %d\n", err); + goto unlock; + } + goto unlock; + } + if (!list_empty(&mddev->disks)) { + pr_warn("md: array %s already has disks!\n", mdname(mddev)); + err = -EBUSY; + goto unlock; + } + if (mddev->raid_disks) { + pr_warn("md: array %s already initialised!\n", mdname(mddev)); + err = -EBUSY; + goto unlock; + } + err = md_set_array_info(mddev, &info); + if (err) { + pr_warn("md: couldn't set array info. %d\n", err); + goto unlock; + } + goto unlock; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, + * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ + if ((!mddev->raid_disks && !mddev->external) + && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { + err = -ENODEV; + goto unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) { + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto unlock; + + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto unlock; + + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto unlock; + + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * only if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<pers) { + if (mddev->ro != MD_AUTO_READ) { + err = -EROFS; + goto unlock; + } + mddev->ro = MD_RDWR; + sysfs_notify_dirent_safe(mddev->sysfs_state); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); + } + } + + switch (cmd) { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = md_add_new_disk(mddev, &info); + goto unlock; + } + + case CLUSTERED_DISK_NACK: + if (mddev_is_clustered(mddev)) + md_cluster_ops->new_disk_ack(mddev, false); + else + err = -EINVAL; + goto unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto unlock; + + case RUN_ARRAY: + err = do_md_run(mddev); + goto unlock; + + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto unlock; + + default: + err = -EINVAL; + goto unlock; + } + +unlock: + if (mddev->hold_active == UNTIL_IOCTL && + err != -EINVAL) + mddev->hold_active = 0; + mddev_unlock(mddev); +out: + if(did_set_md_closing) + clear_bit(MD_CLOSING, &mddev->flags); + return err; +} +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static int md_set_read_only(struct block_device *bdev, bool ro) +{ + struct mddev *mddev = bdev->bd_disk->private_data; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + if (!mddev->raid_disks && !mddev->external) { + err = -ENODEV; + goto out_unlock; + } + + /* + * Transitioning to read-auto need only happen for arrays that call + * md_write_start and which are not ready for writes yet. + */ + if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { + err = restart_array(mddev); + if (err) + goto out_unlock; + mddev->ro = MD_AUTO_READ; + } + +out_unlock: + mddev_unlock(mddev); + return err; +} + +static int md_open(struct block_device *bdev, fmode_t mode) +{ + struct mddev *mddev; + int err; + + spin_lock(&all_mddevs_lock); + mddev = mddev_get(bdev->bd_disk->private_data); + spin_unlock(&all_mddevs_lock); + if (!mddev) + return -ENODEV; + + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) + goto out; + + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; + + atomic_inc(&mddev->openers); + mutex_unlock(&mddev->open_mutex); + + bdev_check_media_change(bdev); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); + return err; +} + +static void md_release(struct gendisk *disk, fmode_t mode) +{ + struct mddev *mddev = disk->private_data; + + BUG_ON(!mddev); + atomic_dec(&mddev->openers); + mddev_put(mddev); +} + +static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) +{ + struct mddev *mddev = disk->private_data; + unsigned int ret = 0; + + if (mddev->changed) + ret = DISK_EVENT_MEDIA_CHANGE; + mddev->changed = 0; + return ret; +} + +static void md_free_disk(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + percpu_ref_exit(&mddev->writes_pending); + mddev_free(mddev); +} + +const struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .submit_bio = md_submit_bio, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif + .getgeo = md_getgeo, + .check_events = md_check_events, + .set_read_only = md_set_read_only, + .free_disk = md_free_disk, +}; + +static int md_thread(void *arg) +{ + struct md_thread *thread = arg; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + + allow_signal(SIGKILL); + while (!kthread_should_stop()) { + + /* We need to wait INTERRUPTIBLE so that + * we don't add to the load-average. + * That means we need to be sure no signals are + * pending + */ + if (signal_pending(current)) + flush_signals(current); + + wait_event_interruptible_timeout + (thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags) + || kthread_should_stop() || kthread_should_park(), + thread->timeout); + + clear_bit(THREAD_WAKEUP, &thread->flags); + if (kthread_should_park()) + kthread_parkme(); + if (!kthread_should_stop()) + thread->run(thread); + } + + return 0; +} + +void md_wakeup_thread(struct md_thread *thread) +{ + if (thread) { + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} +EXPORT_SYMBOL(md_wakeup_thread); + +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) +{ + struct md_thread *thread; + + thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); + if (!thread) + return NULL; + + init_waitqueue_head(&thread->wqueue); + + thread->run = run; + thread->mddev = mddev; + thread->timeout = MAX_SCHEDULE_TIMEOUT; + thread->tsk = kthread_run(md_thread, thread, + "%s_%s", + mdname(thread->mddev), + name); + if (IS_ERR(thread->tsk)) { + kfree(thread); + return NULL; + } + return thread; +} +EXPORT_SYMBOL(md_register_thread); + +void md_unregister_thread(struct md_thread **threadp) +{ + struct md_thread *thread; + + /* + * Locking ensures that mddev_unlock does not wake_up a + * non-existent thread + */ + spin_lock(&pers_lock); + thread = *threadp; + if (!thread) { + spin_unlock(&pers_lock); + return; + } + *threadp = NULL; + spin_unlock(&pers_lock); + + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + kthread_stop(thread->tsk); + kfree(thread); +} +EXPORT_SYMBOL(md_unregister_thread); + +void md_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!rdev || test_bit(Faulty, &rdev->flags)) + return; + + if (!mddev->pers || !mddev->pers->error_handler) + return; + mddev->pers->error_handler(mddev, rdev); + + if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + return; + + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + if (!test_bit(MD_BROKEN, &mddev->flags)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); + md_new_event(); +} +EXPORT_SYMBOL(md_error); + +/* seq_file implementation /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + struct md_rdev *rdev; + + seq_printf(seq, "unused devices: "); + + list_for_each_entry(rdev, &pending_raid_disks, same_set) { + i++; + seq_printf(seq, "%pg ", rdev->bdev); + } + if (!i) + seq_printf(seq, ""); + + seq_printf(seq, "\n"); +} + +static int status_resync(struct seq_file *seq, struct mddev *mddev) +{ + sector_t max_sectors, resync, res; + unsigned long dt, db = 0; + sector_t rt, curr_mark_cnt, resync_mark_cnt; + int scale, recovery_active; + unsigned int per_milli; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; + else + max_sectors = mddev->dev_sectors; + + resync = mddev->curr_resync; + if (resync < MD_RESYNC_ACTIVE) { + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* Still cleaning up */ + resync = max_sectors; + } else if (resync > max_sectors) { + resync = max_sectors; + } else { + res = atomic_read(&mddev->recovery_active); + /* + * Resync has started, but the subtraction has overflowed or + * yielded one of the special values. Force it to active to + * ensure the status reports an active resync. + */ + if (resync < res || resync - res < MD_RESYNC_ACTIVE) + resync = MD_RESYNC_ACTIVE; + else + resync -= res; + } + + if (resync == MD_RESYNC_NONE) { + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset != MaxSector && + rdev->recovery_offset) { + seq_printf(seq, "\trecover=REMOTE"); + return 1; + } + if (mddev->reshape_position != MaxSector) + seq_printf(seq, "\treshape=REMOTE"); + else + seq_printf(seq, "\tresync=REMOTE"); + return 1; + } + if (mddev->recovery_cp < MaxSector) { + seq_printf(seq, "\tresync=PENDING"); + return 1; + } + return 0; + } + if (resync < MD_RESYNC_ACTIVE) { + seq_printf(seq, "\tresync=DELAYED"); + return 1; + } + + WARN_ON(max_sectors == 0); + /* Pick 'scale' such that (resync>>scale)*1000 will fit + * in a sector_t, and (max_sectors>>scale) will fit in a + * u32, as those are the requirements for sector_div. + * Thus 'scale' must be at least 10 + */ + scale = 10; + if (sizeof(sector_t) > sizeof(unsigned long)) { + while ( max_sectors/2 > (1ULL<<(scale+32))) + scale++; + } + res = (resync>>scale)*1000; + sector_div(res, (u32)((max_sectors>>scale)+1)); + + per_milli = res; + { + int i, x = per_milli/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"))), + per_milli/10, per_milli % 10, + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); + + /* + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + * + * rt is a sector_t, which is always 64bit now. We are keeping + * the original algorithm, but it is not really necessary. + * + * Original algorithm: + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + + curr_mark_cnt = mddev->curr_mark_cnt; + recovery_active = atomic_read(&mddev->recovery_active); + resync_mark_cnt = mddev->resync_mark_cnt; + + if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) + db = curr_mark_cnt - (recovery_active + resync_mark_cnt); + + rt = max_sectors - resync; /* number of remaining sectors */ + rt = div64_u64(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/2/dt); + return 1; +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + struct mddev *mddev; + + if (l == 0x10000) { + ++*pos; + return (void *)2; + } + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + spin_lock(&all_mddevs_lock); + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, struct mddev, all_mddevs); + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); + return mddev; + } + spin_unlock(&all_mddevs_lock); + if (!l--) + return (void*)2;/* tail */ + return NULL; +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + struct mddev *next_mddev, *mddev = v; + struct mddev *to_put = NULL; + + ++*pos; + if (v == (void*)2) + return NULL; + + spin_lock(&all_mddevs_lock); + if (v == (void*)1) { + tmp = all_mddevs.next; + } else { + to_put = mddev; + tmp = mddev->all_mddevs.next; + } + + for (;;) { + if (tmp == &all_mddevs) { + next_mddev = (void*)2; + *pos = 0x10000; + break; + } + next_mddev = list_entry(tmp, struct mddev, all_mddevs); + if (mddev_get(next_mddev)) + break; + mddev = next_mddev; + tmp = mddev->all_mddevs.next; + } + spin_unlock(&all_mddevs_lock); + + if (to_put) + mddev_put(to_put); + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + struct mddev *mddev = v; + + if (mddev && v != (void*)1 && v != (void*)2) + mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + struct mddev *mddev = v; + sector_t sectors; + struct md_rdev *rdev; + + if (v == (void*)1) { + struct md_personality *pers; + seq_printf(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_printf(seq, "\n"); + seq->poll_event = atomic_read(&md_event_count); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + spin_lock(&mddev->lock); + if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "%s : %sactive", mdname(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro == MD_RDONLY) + seq_printf(seq, " (read-only)"); + if (mddev->ro == MD_AUTO_READ) + seq_printf(seq, " (auto-read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + sectors = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); + + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); + if (test_bit(Faulty, &rdev->flags)) { + seq_printf(seq, "(F)"); + continue; + } + if (rdev->raid_disk < 0) + seq_printf(seq, "(S)"); /* spare */ + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); + sectors += rdev->sectors; + } + rcu_read_unlock(); + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long) + mddev->array_sectors / 2); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)sectors / 2); + } + if (mddev->persistent) { + if (mddev->major_version != 0 || + mddev->minor_version != 90) { + seq_printf(seq," super %d.%d", + mddev->major_version, + mddev->minor_version); + } + } else if (mddev->external) + seq_printf(seq, " super external:%s", + mddev->metadata_type); + else + seq_printf(seq, " super non-persistent"); + + if (mddev->pers) { + mddev->pers->status(seq, mddev); + seq_printf(seq, "\n "); + if (mddev->pers->sync_request) { + if (status_resync(seq, mddev)) + seq_printf(seq, "\n "); + } + } else + seq_printf(seq, "\n "); + + md_bitmap_status(seq, mddev->bitmap); + + seq_printf(seq, "\n"); + } + spin_unlock(&mddev->lock); + + return 0; +} + +static const struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int error; + + error = seq_open(file, &md_seq_ops); + if (error) + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); + return error; +} + +static int md_unloading; +static __poll_t mdstat_poll(struct file *filp, poll_table *wait) +{ + struct seq_file *seq = filp->private_data; + __poll_t mask; + + if (md_unloading) + return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; + poll_wait(filp, &md_event_waiters, wait); + + /* always allow read */ + mask = EPOLLIN | EPOLLRDNORM; + + if (seq->poll_event != atomic_read(&md_event_count)) + mask |= EPOLLERR | EPOLLPRI; + return mask; +} + +static const struct proc_ops mdstat_proc_ops = { + .proc_open = md_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, + .proc_poll = mdstat_poll, +}; + +int register_md_personality(struct md_personality *p) +{ + pr_debug("md: %s personality registered for level %d\n", + p->name, p->level); + spin_lock(&pers_lock); + list_add_tail(&p->list, &pers_list); + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(register_md_personality); + +int unregister_md_personality(struct md_personality *p) +{ + pr_debug("md: %s personality unregistered\n", p->name); + spin_lock(&pers_lock); + list_del_init(&p->list); + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(unregister_md_personality); + +int register_md_cluster_operations(struct md_cluster_operations *ops, + struct module *module) +{ + int ret = 0; + spin_lock(&pers_lock); + if (md_cluster_ops != NULL) + ret = -EALREADY; + else { + md_cluster_ops = ops; + md_cluster_mod = module; + } + spin_unlock(&pers_lock); + return ret; +} +EXPORT_SYMBOL(register_md_cluster_operations); + +int unregister_md_cluster_operations(void) +{ + spin_lock(&pers_lock); + md_cluster_ops = NULL; + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(unregister_md_cluster_operations); + +int md_setup_cluster(struct mddev *mddev, int nodes) +{ + int ret; + if (!md_cluster_ops) + request_module("md-cluster"); + spin_lock(&pers_lock); + /* ensure module won't be unloaded */ + if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + pr_warn("can't find md-cluster module or get its reference.\n"); + spin_unlock(&pers_lock); + return -ENOENT; + } + spin_unlock(&pers_lock); + + ret = md_cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; +} + +void md_cluster_stop(struct mddev *mddev) +{ + if (!md_cluster_ops) + return; + md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); +} + +static int is_mddev_idle(struct mddev *mddev, int init) +{ + struct md_rdev *rdev; + int idle; + int curr_events; + + idle = 1; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + struct gendisk *disk = rdev->bdev->bd_disk; + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - + atomic_read(&disk->sync_io); + /* sync IO will cause sync_io to increase before the disk_stats + * as sync_io is counted when a request starts, and + * disk_stats is counted when it completes. + * So resync activity will cause curr_events to be smaller than + * when there was no such activity. + * non-sync IO will cause disk_stat to increase without + * increasing sync_io so curr_events will (eventually) + * be larger than it was before. Once it becomes + * substantially larger, the test below will cause + * the array to appear non-idle, and resync will slow + * down. + * If there is a lot of outstanding resync activity when + * we set last_event to curr_events, then all that activity + * completing might cause the array to appear non-idle + * and resync will be slowed down even though there might + * not have been non-resync activity. This will only + * happen once though. 'last_events' will soon reflect + * the state where there is little or no outstanding + * resync requests, and further resync activity will + * always make curr_events less than last_events. + * + */ + if (init || curr_events - rdev->last_events > 64) { + rdev->last_events = curr_events; + idle = 0; + } + } + rcu_read_unlock(); + return idle; +} + +void md_done_sync(struct mddev *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... + } +} +EXPORT_SYMBOL(md_done_sync); + +/* md_write_start(mddev, bi) + * If we need to update some array metadata (e.g. 'active' flag + * in superblock) before writing, schedule a superblock update + * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. + */ +bool md_write_start(struct mddev *mddev, struct bio *bi) +{ + int did_change = 0; + + if (bio_data_dir(bi) != WRITE) + return true; + + BUG_ON(mddev->ro == MD_RDONLY); + if (mddev->ro == MD_AUTO_READ) { + /* need to switch to read/write */ + mddev->ro = MD_RDWR; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); + did_change = 1; + } + rcu_read_lock(); + percpu_ref_get(&mddev->writes_pending); + smp_mb(); /* Match smp_mb in set_in_sync() */ + if (mddev->safemode == 1) + mddev->safemode = 0; + /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ + if (mddev->in_sync || mddev->sync_checkers) { + spin_lock(&mddev->lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + did_change = 1; + } + spin_unlock(&mddev->lock); + } + rcu_read_unlock(); + if (did_change) + sysfs_notify_dirent_safe(mddev->sysfs_state); + if (!mddev->has_superblocks) + return true; + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + is_md_suspended(mddev)); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + percpu_ref_put(&mddev->writes_pending); + return false; + } + return true; +} +EXPORT_SYMBOL(md_write_start); + +/* md_write_inc can only be called when md_write_start() has + * already been called at least once of the current request. + * It increments the counter and is useful when a single request + * is split into several parts. Each part causes an increment and + * so needs a matching md_write_end(). + * Unlike md_write_start(), it is safe to call md_write_inc() inside + * a spinlocked region. + */ +void md_write_inc(struct mddev *mddev, struct bio *bi) +{ + if (bio_data_dir(bi) != WRITE) + return; + WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); + percpu_ref_get(&mddev->writes_pending); +} +EXPORT_SYMBOL(md_write_inc); + +void md_write_end(struct mddev *mddev) +{ + percpu_ref_put(&mddev->writes_pending); + + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else if (mddev->safemode_delay) + /* The roundup() ensures this only performs locking once + * every ->safemode_delay jiffies + */ + mod_timer(&mddev->safemode_timer, + roundup(jiffies, mddev->safemode_delay) + + mddev->safemode_delay); +} + +EXPORT_SYMBOL(md_write_end); + +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, + &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(discard_bio, + disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL_GPL(md_submit_discard_bio); + +int acct_bioset_init(struct mddev *mddev) +{ + int err = 0; + + if (!bioset_initialized(&mddev->io_acct_set)) + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + return err; +} +EXPORT_SYMBOL_GPL(acct_bioset_init); + +void acct_bioset_exit(struct mddev *mddev) +{ + bioset_exit(&mddev->io_acct_set); +} +EXPORT_SYMBOL_GPL(acct_bioset_exit); + +static void md_end_io_acct(struct bio *bio) +{ + struct md_io_acct *md_io_acct = bio->bi_private; + struct bio *orig_bio = md_io_acct->orig_bio; + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + bio_end_io_acct(orig_bio, md_io_acct->start_time); + bio_put(bio); + bio_endio(orig_bio); +} + +/* + * Used by personalities that don't already clone the bio and thus can't + * easily add the timestamp to their extended bio structure. + */ +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ + struct block_device *bdev = (*bio)->bi_bdev; + struct md_io_acct *md_io_acct; + struct bio *clone; + + if (!blk_queue_io_stat(bdev->bd_disk->queue)) + return; + + clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(clone, struct md_io_acct, bio_clone); + md_io_acct->orig_bio = *bio; + md_io_acct->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_io_acct; + clone->bi_private = md_io_acct; + *bio = clone; +} +EXPORT_SYMBOL_GPL(md_account_bio); + +/* md_allow_write(mddev) + * Calling this ensures that the array is marked 'active' so that writes + * may proceed without blocking. It is important to call this before + * attempting a GFP_KERNEL allocation while holding the mddev lock. + * Must be called with mddev_lock held. + */ +void md_allow_write(struct mddev *mddev) +{ + if (!mddev->pers) + return; + if (!md_is_rdwr(mddev)) + return; + if (!mddev->pers->sync_request) + return; + + spin_lock(&mddev->lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + if (mddev->safemode_delay && + mddev->safemode == 0) + mddev->safemode = 1; + spin_unlock(&mddev->lock); + md_update_sb(mddev, 0); + sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + } else + spin_unlock(&mddev->lock); +} +EXPORT_SYMBOL_GPL(md_allow_write); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +#define UPDATE_FREQUENCY (5*60*HZ) +void md_do_sync(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct mddev *mddev2; + unsigned int currspeed = 0, window; + sector_t max_sectors,j, io_sectors, recovery_done; + unsigned long mark[SYNC_MARKS]; + unsigned long update_time; + sector_t mark_cnt[SYNC_MARKS]; + int last_mark,m; + sector_t last_check; + int skipped = 0; + struct md_rdev *rdev; + char *desc, *action = NULL; + struct blk_plug plug; + int ret; + + /* just incase thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) + return; + if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return; + } + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) + goto skip; + + set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + && ((unsigned long long)mddev->curr_resync_completed + < (unsigned long long)mddev->resync_max_sectors)) + goto skip; + } + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + desc = "data-check"; + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + desc = "requested-resync"; + action = "repair"; + } else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; + + mddev->last_sync_action = action ?: desc; + + /* + * Before starting a resync we must have set curr_resync to + * 2, and then checked that every "conflicting" array has curr_resync + * less than ours. When we find one that is the same or higher + * we wait on resync_wait. To avoid deadlock, we reduce curr_resync + * to 1 if we choose to yield (based arbitrarily on address of mddev structure). + * This will mean we have to start checking from the beginning again. + * + */ + + do { + int mddev2_minor = -1; + mddev->curr_resync = MD_RESYNC_DELAYED; + + try_again: + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; + if (mddev2 == mddev) + continue; + if (!mddev->parallel_resync + && mddev2->curr_resync + && match_mddev_units(mddev, mddev2)) { + DEFINE_WAIT(wq); + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { + /* arbitrarily yield */ + mddev->curr_resync = MD_RESYNC_YIELDED; + wake_up(&resync_wait); + } + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) + /* no need to wait here, we can wait the next + * time 'round when curr_resync == 2 + */ + continue; + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' + */ + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev2->curr_resync >= mddev->curr_resync) { + if (mddev2_minor != mddev2->md_minor) { + mddev2_minor = mddev2->md_minor; + pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", + desc, mdname(mddev), + mdname(mddev2)); + } + spin_unlock(&all_mddevs_lock); + + if (signal_pending(current)) + flush_signals(current); + schedule(); + finish_wait(&resync_wait, &wq); + goto try_again; + } + finish_wait(&resync_wait, &wq); + } + } + spin_unlock(&all_mddevs_lock); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); + + j = 0; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* resync follows the size requested by the personality, + * which defaults to physical size, but can be virtual size + */ + max_sectors = mddev->resync_max_sectors; + atomic64_set(&mddev->resync_mismatches, 0); + /* we don't use the checkpoint if there's a bitmap */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->resync_min; + else if (!mddev->bitmap) + j = mddev->recovery_cp; + + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + max_sectors = mddev->resync_max_sectors; + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set j again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + j = mddev->reshape_position; + } else { + /* recovery follows the physical size of devices */ + max_sectors = mddev->dev_sectors; + j = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < j) + j = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + } + + pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); + pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); + pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); + + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ + + io_sectors = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = io_sectors; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32 * (PAGE_SIZE / 512); + pr_debug("md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + last_check = 0; + + if (j>2) { + pr_debug("md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); + mddev->curr_resync = j; + } else + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ + mddev->curr_resync_completed = j; + sysfs_notify_dirent_safe(mddev->sysfs_completed); + md_new_event(); + update_time = jiffies; + + blk_start_plug(&plug); + while (j < max_sectors) { + sector_t sectors; + + skipped = 0; + + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed || + mddev->curr_resync_completed > mddev->resync_max + )) { + /* time to update curr_resync_completed */ + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; + update_time = jiffies; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_completed); + } + + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. + */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); + } + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + + sectors = mddev->pers->sync_request(mddev, j, &skipped); + if (sectors == 0) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + break; + } + + if (!skipped) { /* actual IO requested */ + io_sectors += sectors; + atomic_add(sectors, &mddev->recovery_active); + } + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + + j += sectors; + if (j > max_sectors) + /* when skipping, extra large numbers can be returned. */ + j = max_sectors; + if (j > 2) + mddev->curr_resync = j; + mddev->curr_mark_cnt = io_sectors; + if (last_check == 0) + /* this is the earliest that rebuild will be + * visible in /proc/mdstat + */ + md_new_event(); + + if (last_check + window > io_sectors || j == max_sectors) + continue; + + last_check = io_sectors; + repeat: + if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + recovery_done = io_sectors - atomic_read(&mddev->recovery_active); + currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 + /((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > speed_min(mddev)) { + if (currspeed > speed_max(mddev)) { + msleep(500); + goto repeat; + } + if (!is_mddev_idle(mddev, 0)) { + /* + * Give other IO more of a chance. + * The faster the devices, the less we wait. + */ + wait_event(mddev->recovery_wait, + !atomic_read(&mddev->recovery_active)); + } + } + } + pr_info("md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); + /* + * this also signals 'finished resyncing' to md_stop + */ + blk_finish_plug(&plug); + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->curr_resync >= MD_RESYNC_ACTIVE) { + mddev->curr_resync_completed = mddev->curr_resync; + sysfs_notify_dirent_safe(mddev->sysfs_completed); + } + mddev->pers->sync_request(mddev, max_sectors, &skipped); + + if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && + mddev->curr_resync > MD_RESYNC_ACTIVE) { + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (mddev->curr_resync >= mddev->recovery_cp) { + pr_debug("md: checkpointing %s of %s.\n", + desc, mdname(mddev)); + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; + } + } else + mddev->recovery_cp = MaxSector; + } else { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } + } + } + skip: + /* set CHANGE_PENDING here since maybe another update is needed, + * so other nodes are informed. It should be harmless for normal + * raid */ + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->delta_disks > 0 && + mddev->pers->finish_reshape && + mddev->pers->size && + mddev->queue) { + mddev_lock_nointr(mddev); + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); + mddev_unlock(mddev); + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + + spin_lock(&mddev->lock); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + mddev->curr_resync = MD_RESYNC_NONE; + spin_unlock(&mddev->lock); + + wake_up(&resync_wait); + md_wakeup_thread(mddev->thread); + return; +} +EXPORT_SYMBOL_GPL(md_do_sync); + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + bool remove_some = false; + + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + ((test_bit(RemoveSynchronized, &rdev->flags) || + (!test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags))) && + atomic_read(&rdev->nr_pending)==0)) { + if (mddev->pers->hot_remove_disk( + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + removed++; + } + } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + + if (removed && mddev->kobj.sd) + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + + if (this && removed) + goto no_add; + + rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; + if (test_bit(Candidate, &rdev->flags)) + continue; + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(Journal, &rdev->flags)) { + if (!md_is_rdwr(mddev) && + !(rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; + + rdev->recovery_offset = 0; + } + if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); + if (!test_bit(Journal, &rdev->flags)) + spares++; + md_new_event(); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + } + } +no_add: + if (removed) + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return spares; +} + +static void md_start_sync(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "resync"); + if (!mddev->sync_thread) { + pr_warn("%s: could not start resync thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, + &mddev->recovery)) + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); + } else + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(); +} + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE + * and wakeups up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). + * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spares devices + * 6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(struct mddev *mddev) +{ + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); + } + + if (is_md_suspended(mddev)) + return; + + if (mddev->bitmap) + md_bitmap_daemon_work(mddev); + + if (signal_pending(current)) { + if (mddev->pers->sync_request && !mddev->external) { + pr_debug("md: %s in immediate safe mode\n", + mdname(mddev)); + mddev->safemode = 2; + } + flush_signals(current); + } + + if (!md_is_rdwr(mddev) && + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return; + if ( ! ( + (mddev->sb_flags & ~ (1<recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + (mddev->external == 0 && mddev->safemode == 1) || + (mddev->safemode == 2 + && !mddev->in_sync && mddev->recovery_cp == MaxSector) + )) + return; + + if (mddev_trylock(mddev)) { + int spares = 0; + bool try_set_sync = mddev->safemode != 0; + + if (!mddev->external && mddev->safemode == 1) + mddev->safemode = 0; + + if (!md_is_rdwr(mddev)) { + struct md_rdev *rdev; + if (!mddev->external && mddev->in_sync) + /* 'Blocked' flag not needed as failed devices + * will be recorded if array switched to read/write. + * Leaving it set will prevent the device + * from being removed. + */ + rdev_for_each(rdev, mddev) + clear_bit(Blocked, &rdev->flags); + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + goto unlock; + } + + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev, *tmp; + /* kick the device if another node issued a + * remove disk. + */ + rdev_for_each_safe(rdev, tmp, mddev) { + if (test_and_clear_bit(ClusterRemove, &rdev->flags) && + rdev->raid_disk < 0) + md_kick_rdev_from_array(rdev); + } + } + + if (try_set_sync && !mddev->external && !mddev->in_sync) { + spin_lock(&mddev->lock); + set_in_sync(mddev); + spin_unlock(&mddev->lock); + } + + if (mddev->sb_flags) + md_update_sb(mddev, 0); + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + if (mddev->sync_thread) { + md_unregister_thread(&mddev->sync_thread); + md_reap_sync_thread(mddev); + goto unlock; + } + /* Set RUNNING before clearing NEEDED to avoid + * any transients in the value of "sync_action". + */ + mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); + /* Clear some bits that don't mean anything, but + * might be left set + */ + clear_bit(MD_RECOVERY_INTR, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + + if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + goto not_running; + /* no recovery is running. + * remove any failed drives, then + * add spares if possible. + * Spares are also removed and re-added, to allow + * the personality to fail the re-add. + */ + + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + /* Cannot proceed */ + goto not_running; + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if ((spares = remove_and_add_spares(mddev, NULL))) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* nothing to be done ... */ + goto not_running; + + if (mddev->pers->sync_request) { + if (spares) { + /* We are adding a device or devices to an array + * which has the bitmap stored on all devices. + * So make sure all bitmap pages get written + */ + md_bitmap_write_all(mddev->bitmap); + } + INIT_WORK(&mddev->del_work, md_start_sync); + queue_work(md_misc_wq, &mddev->del_work); + goto unlock; + } + not_running: + if (!mddev->sync_thread) { + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, + &mddev->recovery)) + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); + } + unlock: + wake_up(&mddev->sb_wait); + mddev_unlock(mddev); + } +} +EXPORT_SYMBOL(md_check_recovery); + +void md_reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + sector_t old_dev_sectors = mddev->dev_sectors; + bool is_reshaped = false; + + /* sync_thread should be unregistered, collect result */ + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + mddev->degraded != mddev->raid_disks) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) { + mddev->pers->finish_reshape(mddev); + if (mddev_is_clustered(mddev)) + is_reshaped = true; + } + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. + */ + if (!mddev->degraded) + rdev_for_each(rdev, mddev) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can + * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by + * clustered raid */ + if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) + md_cluster_ops->resync_finish(mddev); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* + * We call md_cluster_ops->update_size here because sync_size could + * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, + * so it is time to update size across cluster. + */ + if (mddev_is_clustered(mddev) && is_reshaped + && !test_bit(MD_CLOSING, &mddev->flags)) + md_cluster_ops->update_size(mddev, old_dev_sectors); + wake_up(&resync_wait); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} +EXPORT_SYMBOL(md_reap_sync_thread); + +void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + sysfs_notify_dirent_safe(rdev->sysfs_state); + wait_event_timeout(rdev->blocked_wait, + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), + msecs_to_jiffies(5000)); + rdev_dec_pending(rdev, mddev); +} +EXPORT_SYMBOL(md_wait_for_blocked_rdev); + +void md_finish_reshape(struct mddev *mddev) +{ + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); + +/* Bad block management */ + +/* Returns 1 on success, 0 on failure */ +int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + struct mddev *mddev = rdev->mddev; + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return 1; + } else + return 0; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = badblocks_clear(&rdev->badblocks, s, sectors); + if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); + return rv; +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +static int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct mddev *mddev, *n; + int need_delay = 0; + + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); + if (mddev_trylock(mddev)) { + if (mddev->pers) + __md_stop_writes(mddev); + if (mddev->persistent) + mddev->safemode = 2; + mddev_unlock(mddev); + } + need_delay = 1; + mddev_put(mddev); + spin_lock(&all_mddevs_lock); + } + spin_unlock(&all_mddevs_lock); + + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + msleep(1000); + + return NOTIFY_DONE; +} + +static struct notifier_block md_notifier = { + .notifier_call = md_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + + proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); +} + +static int __init md_init(void) +{ + int ret = -ENOMEM; + + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); + if (!md_wq) + goto err_wq; + + md_misc_wq = alloc_workqueue("md_misc", 0, 0); + if (!md_misc_wq) + goto err_misc_wq; + + md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); + if (!md_rdev_misc_wq) + goto err_rdev_misc_wq; + + ret = __register_blkdev(MD_MAJOR, "md", md_probe); + if (ret < 0) + goto err_md; + + ret = __register_blkdev(0, "mdp", md_probe); + if (ret < 0) + goto err_mdp; + mdp_major = ret; + + register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table); + + md_geninit(); + return 0; + +err_mdp: + unregister_blkdev(MD_MAJOR, "md"); +err_md: + destroy_workqueue(md_rdev_misc_wq); +err_rdev_misc_wq: + destroy_workqueue(md_misc_wq); +err_misc_wq: + destroy_workqueue(md_wq); +err_wq: + return ret; +} + +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2, *tmp; + int role, ret; + + /* + * If size is changed in another node then we need to + * do resize as well. + */ + if (mddev->dev_sectors != le64_to_cpu(sb->size)) { + ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); + if (ret) + pr_info("md-cluster: resize failed\n"); + else + md_bitmap_update_sb(mddev->bitmap); + } + + /* Check for change of roles in the active devices */ + rdev_for_each_safe(rdev2, tmp, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == MD_DISK_ROLE_FAULTY) { + pr_info("md: Removing Candidate device %pg because add failed\n", + rdev2->bdev); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* + * got activated except reshape is happening. + */ + if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && + !(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %pg\n", + rdev2->bdev); + /* wakeup mddev->thread here, so array could + * perform resync with the new activated disk */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if (role == MD_DISK_ROLE_FAULTY || + role == MD_DISK_ROLE_JOURNAL) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } + } + + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { + ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (ret) + pr_warn("md: updating array disks failed. %d\n", ret); + } + + /* + * Since mddev->delta_disks has already updated in update_raid_disks, + * so it is time to check reshape. + */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* + * reshape is happening in the remote node, we need to + * update reshape_position and call start_reshape. + */ + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + if (mddev->pers->start_reshape) + mddev->pers->start_reshape(mddev); + } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + mddev->reshape_position != MaxSector && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* reshape is just done in another node. */ + mddev->reshape_position = MaxSector; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + } + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + err = alloc_disk_sb(rdev); + if (err == 0) { + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version]. + load_super(rdev, NULL, mddev->minor_version); + } + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + if (rdev->sb_page) + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev = NULL, *iter; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(iter, mddev) { + if (iter->desc_nr == nr) { + rdev = iter; + break; + } + } + + if (!rdev) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) { + if (!test_bit(Faulty, &rdev->flags)) + read_rdev(mddev, rdev); + } +} +EXPORT_SYMBOL(md_reload_sb); + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ + +static DEFINE_MUTEX(detected_devices_mutex); +static LIST_HEAD(all_detected_devices); +struct detected_devices_node { + struct list_head list; + dev_t dev; +}; + +void md_autodetect_dev(dev_t dev) +{ + struct detected_devices_node *node_detected_dev; + + node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); + if (node_detected_dev) { + node_detected_dev->dev = dev; + mutex_lock(&detected_devices_mutex); + list_add_tail(&node_detected_dev->list, &all_detected_devices); + mutex_unlock(&detected_devices_mutex); + } +} + +void md_autostart_arrays(int part) +{ + struct md_rdev *rdev; + struct detected_devices_node *node_detected_dev; + dev_t dev; + int i_scanned, i_passed; + + i_scanned = 0; + i_passed = 0; + + pr_info("md: Autodetecting RAID arrays.\n"); + + mutex_lock(&detected_devices_mutex); + while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { + i_scanned++; + node_detected_dev = list_entry(all_detected_devices.next, + struct detected_devices_node, list); + list_del(&node_detected_dev->list); + dev = node_detected_dev->dev; + kfree(node_detected_dev); + mutex_unlock(&detected_devices_mutex); + rdev = md_import_device(dev,0, 90); + mutex_lock(&detected_devices_mutex); + if (IS_ERR(rdev)) + continue; + + if (test_bit(Faulty, &rdev->flags)) + continue; + + set_bit(AutoDetected, &rdev->flags); + list_add(&rdev->same_set, &pending_raid_disks); + i_passed++; + } + mutex_unlock(&detected_devices_mutex); + + pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); + + autorun_devices(part); +} + +#endif /* !MODULE */ + +static __exit void md_exit(void) +{ + struct mddev *mddev, *n; + int delay = 1; + + unregister_blkdev(MD_MAJOR,"md"); + unregister_blkdev(mdp_major, "mdp"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } + remove_proc_entry("mdstat", NULL); + + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); + export_array(mddev); + mddev->ctime = 0; + mddev->hold_active = 0; + /* + * As the mddev is now fully clear, mddev_put will schedule + * the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ + mddev_put(mddev); + spin_lock(&all_mddevs_lock); + } + spin_unlock(&all_mddevs_lock); + + destroy_workqueue(md_rdev_misc_wq); + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); +} + +subsys_initcall(md_init); +module_exit(md_exit) + +static int get_ro(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", start_readonly); +} +static int set_ro(const char *val, const struct kernel_param *kp) +{ + return kstrtouint(val, 10, (unsigned int *)&start_readonly); +} + +module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); +module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); +module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); +module_param(create_on_open, bool, S_IRUSR|S_IWUSR); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); +MODULE_ALIAS("md"); +MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 000000000..4f0b48097 --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + md.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + +*/ + +#ifndef _MD_MD_H +#define _MD_MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md-cluster.h" + +#define MaxSector (~(sector_t)0) + +/* + * These flags should really be called "NO_RETRY" rather than + * "FAILFAST" because they don't make any promise about time lapse, + * only about the number of retries, which will be zero. + * REQ_FAILFAST_DRIVER is not included because + * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") + * seems to suggest that the errors it avoids retrying should usually + * be retried. + */ +#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + +/* + * MD's 'extended' device + */ +struct md_rdev { + struct list_head same_set; /* RAID devices within the same set */ + + sector_t sectors; /* Device size (in 512bytes sectors) */ + struct mddev *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; + struct block_device *bdev; /* block device handle */ + + struct page *sb_page, *bb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t new_data_offset;/* only relevant while reshaping */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; /* bit set of 'enum flag_bits' bits. */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. + */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + union { + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + sector_t journal_tail; /* If this device is a journal device, + * this is the journal tail (journal + * recovery start point) + */ + }; + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + time64_t last_read_error; /* monotonic time since our + * last read error + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + + struct serial_in_rdev *serial; /* used for raid1 io serialization */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct kernfs_node *sysfs_state; /* handle for 'state' + * sysfs entry */ + /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_unack_badblocks; + /* handle for 'bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_badblocks; + struct badblocks badblocks; + + struct { + short offset; /* Offset from superblock to start of PPL. + * Not used by external metadata. */ + unsigned int size; /* Size in sectors of the PPL space */ + sector_t sector; /* First sector of the PPL space */ + } ppl; +}; +enum flag_bits { + Faulty, /* device is known to have a fault */ + In_sync, /* device is in_sync with rest of array */ + Bitmap_sync, /* ..actually, not quite In_sync. Need a + * bitmap-based recovery to get fully in sync. + * The bit is only meaningful before device + * has been passed to pers->hot_add_disk. + */ + WriteMostly, /* Avoid reading if at all possible */ + AutoDetected, /* added by auto-detect */ + Blocked, /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes + * until it is cleared */ + WriteErrorSeen, /* A write error has been seen on this + * device + */ + FaultRecorded, /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ + BlockedBadBlocks, /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ + WantReplacement, /* This device is a candidate to be + * hot-replaced, either because it has + * reported some faults, or because + * of explicit request. + */ + Replacement, /* This device is a replacement for + * a want_replacement device with same + * raid_disk number. + */ + Candidate, /* For clustered environments only: + * This device is seen locally but not + * by the whole cluster + */ + Journal, /* This device is used as journal for + * raid-5/6. + * Usually, this device should be faster + * than other devices in the array + */ + ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ + ExternalBbl, /* External metadata provides bad + * block management for a disk + */ + FailFast, /* Minimal retries should be attempted on + * this device, so use REQ_FAILFAST_DEV. + * Also don't try to repair failed reads. + * It is expects that no bad block log + * is present. + */ + LastDev, /* Seems to be the last working dev as + * it didn't fail, so don't use FailFast + * any more for metadata + */ + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. + */ +}; + +static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (unlikely(rdev->badblocks.count)) { + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, + sectors, + first_bad, bad_sectors); + if (rv) + *first_bad -= rdev->data_offset; + return rv; + } + return 0; +} +extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +struct md_cluster_info; + +/** + * enum mddev_flags - md device flags. + * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. + * @MD_CLOSING: If set, we are closing the array, do not open it then. + * @MD_JOURNAL_CLEAN: A raid with journal is already clean. + * @MD_HAS_JOURNAL: The raid array has journal feature set. + * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took + * resync lock, need to release the lock. + * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as + * calls to md_error() will never cause the array to + * become failed. + * @MD_HAS_PPL: The raid array has PPL feature set. + * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. + * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata + * without taking reconfig_mutex. + * @MD_UPDATING_SB: md_check_recovery is updating the metadata without + * explicitly holding reconfig_mutex. + * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that + * array is ready yet. + * @MD_BROKEN: This is used to stop writes and mark array as failed. + * @MD_DELETED: This device is being deleted + * + * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added + */ +enum mddev_flags { + MD_ARRAY_FIRST_USE, + MD_CLOSING, + MD_JOURNAL_CLEAN, + MD_HAS_JOURNAL, + MD_CLUSTER_RESYNC_LOCKED, + MD_FAILFAST_SUPPORTED, + MD_HAS_PPL, + MD_HAS_MULTIPLE_PPLS, + MD_ALLOW_SB_UPDATE, + MD_UPDATING_SB, + MD_NOT_READY, + MD_BROKEN, + MD_DELETED, +}; + +enum mddev_sb_flags { + MD_SB_CHANGE_DEVS, /* Some device status has changed */ + MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ + MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ + MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ +}; + +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ +}; + +/* + * mddev->curr_resync stores the current sector of the resync but + * also has some overloaded values. + */ +enum { + /* No resync in progress */ + MD_RESYNC_NONE = 0, + /* Yielded to allow another conflicting resync to commence */ + MD_RESYNC_YIELDED = 1, + /* Delayed to check that there is no conflict with another sync */ + MD_RESYNC_DELAYED = 2, + /* Any value greater than or equal to this is in an active resync */ + MD_RESYNC_ACTIVE = 3, +}; + +struct mddev { + void *private; + struct md_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; + unsigned long sb_flags; + + int suspended; + struct percpu_ref active_io; + int ro; + int sysfs_active; /* set when sysfs deletes + * are happening, so run/ + * takeover/stop are not safe + */ + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_sectors; + time64_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t dev_sectors; /* used size of + * component devices */ + sector_t array_sectors; /* exported array size */ + int external_size; /* size managed + * externally */ + __u64 events; + /* If the last 'event' was simply a clean->dirty transition, and + * we didn't write it to the spares, then it is safe and simple + * to just decrement the event count on a dirty->clean transition. + * So we record that possibility here. + */ + int can_decrease_events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout; + int new_chunk_sectors; + int reshape_backwards; + + struct md_thread *thread; /* management thread */ + struct md_thread *sync_thread; /* doing resync or reconstruct */ + + /* 'last_sync_action' is initialized to "none". It is set when a + * sync operation (i.e "data-check", "requested-resync", "resync", + * "recovery", or "reshape") is started. It holds this value even + * when the sync thread is "frozen" (interrupted) or "idle" (stopped + * or finished). It is overwritten when a new sync operation is begun. + */ + char *last_sync_action; + sector_t curr_resync; /* last block scheduled */ + /* As resync requests can complete out of order, we cannot easily track + * how much resync has been completed. So we occasionally pause until + * everything completes, then set curr_resync_completed to curr_resync. + * As such it may be well behind the real resync mark, but it is a value + * we are certain of. + */ + sector_t curr_resync_completed; + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + atomic64_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + + unsigned long recovery; + /* If a RAID personality determines that recovery (of a particular + * device) will fail due to a read error on the source device, it + * takes a copy of this number and does not attempt recovery again + * until this number changes. + */ + int recovery_disabled; + + int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with disk->open_mutex. + * Lock ordering is: + * reconfig_mutex -> disk->open_mutex + * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open + */ + struct mutex open_mutex; + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* True if we might need to + * reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct kernfs_node *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ + struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ + struct kernfs_node *sysfs_level; /*handle for 'level' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + /* "lock" protects: + * flush_bio transition from NULL to !NULL + * rdev superblocks, events + * clearing MD_CHANGE_* + * in_sync - and related safemode and MD_CHANGE changes + * pers (also protected by reconfig_mutex and pending IO). + * clearing ->bitmap + * clearing ->bitmap_info.file + * changing ->resync_{min,max} + * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) + */ + spinlock_t lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + struct percpu_ref writes_pending; + int sync_checkers; /* # of threads checking writes_pending */ + struct request_queue *queue; /* for plugging ... */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct { + struct file *file; /* the bitmap file */ + loff_t offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + * For external metadata, offset + * from start of device. + */ + unsigned long space; /* space available at this offset */ + loff_t default_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + unsigned long default_space; /* space available at + * default offset */ + struct mutex mutex; + unsigned long chunksize; + unsigned long daemon_sleep; /* how many jiffies between updates? */ + unsigned long max_write_behind; /* write-behind mode */ + int external; + int nodes; /* Maximum number of nodes in the cluster */ + char cluster_name[64]; /* Name of the cluster */ + } bitmap_info; + + atomic_t max_corr_read_errors; /* max read retries */ + struct list_head all_mddevs; + + const struct attribute_group *to_remove; + + struct bio_set bio_set; + struct bio_set sync_set; /* for sync operations like + * metadata and bitmap writes + */ + struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ + + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_PREFLUSH flag). + */ + struct bio *flush_bio; + atomic_t flush_pending; + ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed + * flush was started. + */ + struct work_struct flush_work; + struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *serial_info_pool; + void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); + struct md_cluster_info *cluster_info; + unsigned int good_device_nr; /* good device num within cluster raid */ + unsigned int noio_flag; /* for memalloc scope API */ + + bool has_superblocks:1; + bool fail_last_dev:1; + bool serialize_policy:1; +}; + +enum recovery_flags { + /* + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ + MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ + MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ + MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ + MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ + MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ + MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ + MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ + MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ + MD_RECOVERY_RESHAPE, /* A reshape is happening */ + MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ + MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ + MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ + MD_RESYNCING_REMOTE, /* remote node is running resync thread */ +}; + +static inline int __must_check mddev_lock(struct mddev *mddev) +{ + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev *mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_trylock(struct mddev *mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_disk->sync_io); +} + +static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) +{ + md_sync_acct(bio->bi_bdev, nr_sectors); +} + +struct md_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); + /* + * start up works that do NOT require md_thread. tasks that + * requires md_thread should go into start() + */ + int (*run)(struct mddev *mddev); + /* start up works that require md threads */ + int (*start)(struct mddev *mddev); + void (*free)(struct mddev *mddev, void *priv); + void (*status)(struct seq_file *seq, struct mddev *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); + int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*spare_active) (struct mddev *mddev); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); + int (*resize) (struct mddev *mddev, sector_t sectors); + sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); + int (*check_reshape) (struct mddev *mddev); + int (*start_reshape) (struct mddev *mddev); + void (*finish_reshape) (struct mddev *mddev); + void (*update_reshape_pos) (struct mddev *mddev); + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour + */ + void (*quiesce) (struct mddev *mddev, int quiesce); + /* takeover is used to transition an array from one + * personality to another. The new personality must be able + * to handle the data in the current layout. + * e.g. 2drive raid1 -> 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. + */ + void *(*takeover) (struct mddev *mddev); + /* Changes the consistency policy of an active array. */ + int (*change_consistency_policy)(struct mddev *mddev, const char *buf); +}; + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct mddev *, char *); + ssize_t (*store)(struct mddev *, const char *, size_t); +}; +extern const struct attribute_group md_bitmap_group; + +static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) +{ + if (sd) + return sysfs_get_dirent(sd, name); + return sd; +} +static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) +{ + if (sd) + sysfs_notify_dirent(sd); +} + +static inline char * mdname (struct mddev * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + mddev->kobj.sd) { + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + return 0; +} + +static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + mddev->kobj.sd) { + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +struct md_thread { + void (*run) (struct md_thread *thread); + struct mddev *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; + void *private; +}; + +struct md_io_acct { + struct bio *orig_bio; + unsigned long start_time; + struct bio bio_clone; +}; + +#define THREAD_WAKEUP 0 + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +extern int register_md_personality(struct md_personality *p); +extern int unregister_md_personality(struct md_personality *p); +extern int register_md_cluster_operations(struct md_cluster_operations *ops, + struct module *module); +extern int unregister_md_cluster_operations(void); +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev); +extern struct md_thread *md_register_thread( + void (*run)(struct md_thread *thread), + struct mddev *mddev, + const char *name); +extern void md_unregister_thread(struct md_thread **threadp); +extern void md_wakeup_thread(struct md_thread *thread); +extern void md_check_recovery(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev); +extern int mddev_init_writes_pending(struct mddev *mddev); +extern bool md_write_start(struct mddev *mddev, struct bio *bi); +extern void md_write_inc(struct mddev *mddev, struct bio *bi); +extern void md_write_end(struct mddev *mddev); +extern void md_done_sync(struct mddev *mddev, int blocks, int ok); +extern void md_error(struct mddev *mddev, struct md_rdev *rdev); +extern void md_finish_reshape(struct mddev *mddev); +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size); +int acct_bioset_init(struct mddev *mddev); +void acct_bioset_exit(struct mddev *mddev); +void md_account_bio(struct mddev *mddev, struct bio **bio); + +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); +extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page); +extern int md_super_wait(struct mddev *mddev); +extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, blk_opf_t opf, bool metadata_op); +extern void md_do_sync(struct md_thread *thread); +extern void md_new_event(void); +extern void md_allow_write(struct mddev *mddev); +extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); +extern int md_check_no_bitmap(struct mddev *mddev); +extern int md_integrity_register(struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); + +extern void mddev_init(struct mddev *mddev); +struct mddev *md_alloc(dev_t dev, char *name); +void mddev_put(struct mddev *mddev); +extern int md_run(struct mddev *mddev); +extern int md_start(struct mddev *mddev); +extern void md_stop(struct mddev *mddev); +extern void md_stop_writes(struct mddev *mddev); +extern int md_rdev_init(struct md_rdev *rdev); +extern void md_rdev_clear(struct md_rdev *rdev); + +extern void md_handle_request(struct mddev *mddev, struct bio *bio); +extern void mddev_suspend(struct mddev *mddev); +extern void mddev_resume(struct mddev *mddev); + +extern void md_reload_sb(struct mddev *mddev, int raid_disk); +extern void md_update_sb(struct mddev *mddev, int force); +extern void md_kick_rdev_from_array(struct md_rdev * rdev); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); +struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); + +static inline bool is_rdev_broken(struct md_rdev *rdev) +{ + return !disk_live(rdev->bdev->bd_disk); +} + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } +} + +extern struct md_cluster_operations *md_cluster_ops; +static inline int mddev_is_clustered(struct mddev *mddev) +{ + return mddev->cluster_info && mddev->bitmap_info.nodes > 1; +} + +/* clear unsupported mddev_flags */ +static inline void mddev_clear_unsupported_flags(struct mddev *mddev, + unsigned long unsupported_flags) +{ + mddev->flags &= ~unsupported_flags; +} + +static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) + mddev->queue->limits.max_write_zeroes_sectors = 0; +} + +struct mdu_array_info_s; +struct mdu_disk_info_s; + +extern int mdp_major; +void md_autostart_arrays(int part); +int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); +int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); +int do_md_run(struct mddev *mddev); + +extern const struct block_device_operations md_fops; + +#endif /* _MD_MD_H */ diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig new file mode 100644 index 000000000..f4f948b0e --- /dev/null +++ b/drivers/md/persistent-data/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DM_PERSISTENT_DATA + tristate + depends on BLK_DEV_DM + select LIBCRC32C + select DM_BUFIO + help + Library providing immutable on-disk data structure support for + device-mapper targets such as the thin provisioning target. + diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile new file mode 100644 index 000000000..66be7c664 --- /dev/null +++ b/drivers/md/persistent-data/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o +dm-persistent-data-objs := \ + dm-array.o \ + dm-bitset.o \ + dm-block-manager.o \ + dm-space-map-common.o \ + dm-space-map-disk.o \ + dm-space-map-metadata.o \ + dm-transaction-manager.o \ + dm-btree.o \ + dm-btree-remove.o \ + dm-btree-spine.o diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 000000000..eff9b4186 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c @@ -0,0 +1,1011 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-array.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include +#include + +#define DM_MSG_PREFIX "array" + +/*----------------------------------------------------------------*/ + +/* + * The array is implemented as a fully populated btree, which points to + * blocks that contain the packed values. This is more space efficient + * than just using a btree since we don't store 1 key per value. + */ +struct array_block { + __le32 csum; + __le32 max_entries; + __le32 nr_entries; + __le32 value_size; + __le64 blocknr; /* Block this node is supposed to live in. */ +} __packed; + +/*----------------------------------------------------------------*/ + +/* + * Validator methods. As usual we calculate a checksum, and also write the + * block location into the header (paranoia about ssds remapping areas by + * mistake). + */ +#define CSUM_XOR 595846735 + +static void array_block_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + + bh_le->blocknr = cpu_to_le64(dm_block_location(b)); + bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); +} + +static int array_block_check(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { + DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", + (unsigned long long) le64_to_cpu(bh_le->blocknr), + (unsigned long long) dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); + if (csum_disk != bh_le->csum) { + DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", + (unsigned int) le32_to_cpu(csum_disk), + (unsigned int) le32_to_cpu(bh_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator array_validator = { + .name = "array", + .prepare_for_write = array_block_prepare_for_write, + .check = array_block_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Functions for manipulating the array blocks. + */ + +/* + * Returns a pointer to a value within an array block. + * + * index - The index into _this_ specific block. + */ +static void *element_at(struct dm_array_info *info, struct array_block *ab, + unsigned int index) +{ + unsigned char *entry = (unsigned char *) (ab + 1); + + entry += index * info->value_type.size; + + return entry; +} + +/* + * Utility function that calls one of the value_type methods on every value + * in an array block. + */ +static void on_entries(struct dm_array_info *info, struct array_block *ab, + void (*fn)(void *, const void *, unsigned int)) +{ + unsigned int nr_entries = le32_to_cpu(ab->nr_entries); + fn(info->value_type.context, element_at(info, ab, 0), nr_entries); +} + +/* + * Increment every value in an array block. + */ +static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->inc) + on_entries(info, ab, vt->inc); +} + +/* + * Decrement every value in an array block. + */ +static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->dec) + on_entries(info, ab, vt->dec); +} + +/* + * Each array block can hold this many values. + */ +static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) +{ + return (size_of_block - sizeof(struct array_block)) / value_size; +} + +/* + * Allocate a new array block. The caller will need to unlock block. + */ +static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); + if (r) + return r; + + (*ab) = dm_block_data(*block); + (*ab)->max_entries = cpu_to_le32(max_entries); + (*ab)->nr_entries = cpu_to_le32(0); + (*ab)->value_size = cpu_to_le32(info->value_type.size); + + return 0; +} + +/* + * Pad an array block out with a particular value. Every instance will + * cause an increment of the value_type. new_nr must always be more than + * the current number of entries. + */ +static void fill_ablock(struct dm_array_info *info, struct array_block *ab, + const void *value, unsigned int new_nr) +{ + uint32_t nr_entries, delta, i; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + delta = new_nr - nr_entries; + if (vt->inc) + vt->inc(vt->context, value, delta); + for (i = nr_entries; i < new_nr; i++) + memcpy(element_at(info, ab, i), value, vt->size); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Remove some entries from the back of an array block. Every value + * removed will be decremented. new_nr must be <= the current number of + * entries. + */ +static void trim_ablock(struct dm_array_info *info, struct array_block *ab, + unsigned int new_nr) +{ + uint32_t nr_entries, delta; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + delta = nr_entries - new_nr; + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Read locks a block, and coerces it to an array block. The caller must + * unlock 'block' when finished. + */ +static int get_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); + if (r) + return r; + + *ab = dm_block_data(*block); + return 0; +} + +/* + * Unlocks an array block. + */ +static void unlock_ablock(struct dm_array_info *info, struct dm_block *block) +{ + dm_tm_unlock(info->btree_info.tm, block); +} + +/*----------------------------------------------------------------*/ + +/* + * Btree manipulation. + */ + +/* + * Looks up an array block in the btree, and then read locks it. + * + * index is the index of the index of the array_block, (ie. the array index + * / max_entries). + */ +static int lookup_ablock(struct dm_array_info *info, dm_block_t root, + unsigned int index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); + if (r) + return r; + + return get_ablock(info, le64_to_cpu(block_le), block, ab); +} + +/* + * Insert an array block into the btree. The block is _not_ unlocked. + */ +static int insert_ablock(struct dm_array_info *info, uint64_t index, + struct dm_block *block, dm_block_t *root) +{ + __le64 block_le = cpu_to_le64(dm_block_location(block)); + + __dm_bless_for_disk(block_le); + return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); +} + +/*----------------------------------------------------------------*/ + +static int __shadow_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int inc; + int r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + return 0; +} + +/* + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ +static int __reinsert_ablock(struct dm_array_info *info, unsigned int index, + struct dm_block *block, dm_block_t b, + dm_block_t *root) +{ + int r = 0; + + if (dm_block_location(block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); + r = insert_ablock(info, index, block, root); + } + + return r; +} + +/* + * Looks up an array block in the btree. Then shadows it, and updates the + * btree to point to this new shadow. 'root' is an input/output parameter + * for both the current root block, and the new one. + */ +static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, + unsigned int index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + dm_block_t b; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); + if (r) + return r; + b = le64_to_cpu(block_le); + + r = __shadow_ablock(info, b, block, ab); + if (r) + return r; + + return __reinsert_ablock(info, index, *block, b, root); +} + +/* + * Allocate an new array block, and fill it with some values. + */ +static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + unsigned int block_index, uint32_t nr, + const void *value, dm_block_t *root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + return r; + + fill_ablock(info, ab, value, nr); + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + + return r; +} + +static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, + unsigned int begin_block, unsigned int end_block, + unsigned int max_entries, const void *value, + dm_block_t *root) +{ + int r = 0; + + for (; !r && begin_block != end_block; begin_block++) + r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); + + return r; +} + +/* + * There are a bunch of functions involved with resizing an array. This + * structure holds information that commonly needed by them. Purely here + * to reduce parameter count. + */ +struct resize { + /* + * Describes the array. + */ + struct dm_array_info *info; + + /* + * The current root of the array. This gets updated. + */ + dm_block_t root; + + /* + * Metadata block size. Used to calculate the nr entries in an + * array block. + */ + size_t size_of_block; + + /* + * Maximum nr entries in an array block. + */ + unsigned int max_entries; + + /* + * nr of completely full blocks in the array. + * + * 'old' refers to before the resize, 'new' after. + */ + unsigned int old_nr_full_blocks, new_nr_full_blocks; + + /* + * Number of entries in the final block. 0 iff only full blocks in + * the array. + */ + unsigned int old_nr_entries_in_last_block, new_nr_entries_in_last_block; + + /* + * The default value used when growing the array. + */ + const void *value; +}; + +/* + * Removes a consecutive set of array blocks from the btree. The values + * in block are decremented as a side effect of the btree remove. + * + * begin_index - the index of the first array block to remove. + * end_index - the one-past-the-end value. ie. this block is not removed. + */ +static int drop_blocks(struct resize *resize, unsigned int begin_index, + unsigned int end_index) +{ + int r; + + while (begin_index != end_index) { + uint64_t key = begin_index++; + r = dm_btree_remove(&resize->info->btree_info, resize->root, + &key, &resize->root); + if (r) + return r; + } + + return 0; +} + +/* + * Calculates how many blocks are needed for the array. + */ +static unsigned int total_nr_blocks_needed(unsigned int nr_full_blocks, + unsigned int nr_entries_in_last_block) +{ + return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); +} + +/* + * Shrink an array. + */ +static int shrink(struct resize *resize) +{ + int r; + unsigned int begin, end; + struct dm_block *block; + struct array_block *ab; + + /* + * Lose some blocks from the back? + */ + if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { + begin = total_nr_blocks_needed(resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block); + end = total_nr_blocks_needed(resize->old_nr_full_blocks, + resize->old_nr_entries_in_last_block); + + r = drop_blocks(resize, begin, end); + if (r) + return r; + } + + /* + * Trim the new tail block + */ + if (resize->new_nr_entries_in_last_block) { + r = shadow_ablock(resize->info, &resize->root, + resize->new_nr_full_blocks, &block, &ab); + if (r) + return r; + + trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); + unlock_ablock(resize->info, block); + } + + return 0; +} + +/* + * Grow an array. + */ +static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = shadow_ablock(resize->info, &resize->root, + resize->old_nr_full_blocks, &block, &ab); + if (r) + return r; + + fill_ablock(resize->info, ab, resize->value, new_nr_entries); + unlock_ablock(resize->info, block); + + return r; +} + +static int grow_add_tail_block(struct resize *resize) +{ + return insert_new_ablock(resize->info, resize->size_of_block, + resize->max_entries, + resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block, + resize->value, &resize->root); +} + +static int grow_needs_more_blocks(struct resize *resize) +{ + int r; + unsigned int old_nr_blocks = resize->old_nr_full_blocks; + + if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + + r = grow_extend_tail_block(resize, resize->max_entries); + if (r) + return r; + } + + r = insert_full_ablocks(resize->info, resize->size_of_block, + old_nr_blocks, + resize->new_nr_full_blocks, + resize->max_entries, resize->value, + &resize->root); + if (r) + return r; + + if (resize->new_nr_entries_in_last_block) + r = grow_add_tail_block(resize); + + return r; +} + +static int grow(struct resize *resize) +{ + if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) + return grow_needs_more_blocks(resize); + + else if (resize->old_nr_entries_in_last_block) + return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); + + else + return grow_add_tail_block(resize); +} + +/*----------------------------------------------------------------*/ + +/* + * These are the value_type functions for the btree elements, which point + * to array blocks. + */ +static void block_inc(void *context, const void *value, unsigned int count) +{ + const __le64 *block_le = value; + struct dm_array_info *info = context; + unsigned int i; + + for (i = 0; i < count; i++, block_le++) + dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le)); +} + +static void __block_dec(void *context, const void *value) +{ + int r; + uint64_t b; + __le64 block_le; + uint32_t ref_count; + struct dm_block *block; + struct array_block *ab; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + b = le64_to_cpu(block_le); + + r = dm_tm_ref(info->btree_info.tm, b, &ref_count); + if (r) { + DMERR_LIMIT("couldn't get reference count for block %llu", + (unsigned long long) b); + return; + } + + if (ref_count == 1) { + /* + * We're about to drop the last reference to this ablock. + * So we need to decrement the ref count of the contents. + */ + r = get_ablock(info, b, &block, &ab); + if (r) { + DMERR_LIMIT("couldn't get array block %llu", + (unsigned long long) b); + return; + } + + dec_ablock_entries(info, ab); + unlock_ablock(info, block); + } + + dm_tm_dec(info->btree_info.tm, b); +} + +static void block_dec(void *context, const void *value, unsigned int count) +{ + unsigned int i; + for (i = 0; i < count; i++, value += sizeof(__le64)) + __block_dec(context, value); +} + +static int block_equal(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(__le64)); +} + +/*----------------------------------------------------------------*/ + +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + struct dm_btree_value_type *bvt = &info->btree_info.value_type; + + memcpy(&info->value_type, vt, sizeof(info->value_type)); + info->btree_info.tm = tm; + info->btree_info.levels = 1; + + bvt->context = info; + bvt->size = sizeof(__le64); + bvt->inc = block_inc; + bvt->dec = block_dec; + bvt->equal = block_equal; +} +EXPORT_SYMBOL_GPL(dm_array_info_init); + +int dm_array_empty(struct dm_array_info *info, dm_block_t *root) +{ + return dm_btree_empty(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_empty); + +static int array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) +{ + int r; + struct resize resize; + + if (old_size == new_size) { + *new_root = root; + return 0; + } + + resize.info = info; + resize.root = root; + resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + resize.max_entries = calc_max_entries(info->value_type.size, + resize.size_of_block); + + resize.old_nr_full_blocks = old_size / resize.max_entries; + resize.old_nr_entries_in_last_block = old_size % resize.max_entries; + resize.new_nr_full_blocks = new_size / resize.max_entries; + resize.new_nr_entries_in_last_block = new_size % resize.max_entries; + resize.value = value; + + r = ((new_size > old_size) ? grow : shrink)(&resize); + if (r) + return r; + + *new_root = resize.root; + return 0; +} + +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r = array_resize(info, root, old_size, new_size, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_resize); + +static int populate_ablock_with_values(struct dm_array_info *info, struct array_block *ab, + value_fn fn, void *context, + unsigned int base, unsigned int new_nr) +{ + int r; + unsigned int i; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(le32_to_cpu(ab->nr_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + + for (i = 0; i < new_nr; i++) { + r = fn(base + i, element_at(info, ab, i), context); + if (r) + return r; + + if (vt->inc) + vt->inc(vt->context, element_at(info, ab, i), 1); + } + + ab->nr_entries = cpu_to_le32(new_nr); + return 0; +} + +int dm_array_new(struct dm_array_info *info, dm_block_t *root, + uint32_t size, value_fn fn, void *context) +{ + int r; + struct dm_block *block; + struct array_block *ab; + unsigned int block_index, end_block, size_of_block, max_entries; + + r = dm_array_empty(info, root); + if (r) + return r; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + end_block = dm_div_up(size, max_entries); + + for (block_index = 0; block_index != end_block; block_index++) { + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + break; + + r = populate_ablock_with_values(info, ab, fn, context, + block_index * max_entries, + min(max_entries, size)); + if (r) { + unlock_ablock(info, block); + break; + } + + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + if (r) + break; + + size -= max_entries; + } + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_new); + +int dm_array_del(struct dm_array_info *info, dm_block_t root) +{ + return dm_btree_del(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_del); + +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value_le) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned int entry, max_entries; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = lookup_ablock(info, root, index / max_entries, &block, &ab); + if (r) + return r; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) + r = -ENODATA; + else + memcpy(value_le, element_at(info, ab, entry), + info->value_type.size); + + unlock_ablock(info, block); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_get_value); + +static int array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned int max_entries; + unsigned int entry; + void *old_value; + struct dm_btree_value_type *vt = &info->value_type; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = shadow_ablock(info, &root, index / max_entries, &block, &ab); + if (r) + return r; + *new_root = root; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) { + r = -ENODATA; + goto out; + } + + old_value = element_at(info, ab, entry); + if (vt->dec && + (!vt->equal || !vt->equal(vt->context, old_value, value))) { + vt->dec(vt->context, old_value, 1); + if (vt->inc) + vt->inc(vt->context, value, 1); + } + + memcpy(old_value, value, info->value_type.size); + +out: + unlock_ablock(info, block); + return r; +} + +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r; + + r = array_set_value(info, root, index, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_set_value); + +struct walk_info { + struct dm_array_info *info; + int (*fn)(void *context, uint64_t key, void *leaf); + void *context; +}; + +static int walk_ablock(void *context, uint64_t *keys, void *leaf) +{ + struct walk_info *wi = context; + + int r; + unsigned int i; + __le64 block_le; + unsigned int nr_entries, max_entries; + struct dm_block *block; + struct array_block *ab; + + memcpy(&block_le, leaf, sizeof(block_le)); + r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); + if (r) + return r; + + max_entries = le32_to_cpu(ab->max_entries); + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < nr_entries; i++) { + r = wi->fn(wi->context, keys[0] * max_entries + i, + element_at(wi->info, ab, i)); + + if (r) + break; + } + + unlock_ablock(wi->info, block); + return r; +} + +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *, uint64_t key, void *leaf), + void *context) +{ + struct walk_info wi; + + wi.info = info; + wi.fn = fn; + wi.context = context; + + return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); +} +EXPORT_SYMBOL_GPL(dm_array_walk); + +/*----------------------------------------------------------------*/ + +static int load_ablock(struct dm_array_cursor *c) +{ + int r; + __le64 value_le; + uint64_t key; + + if (c->block) + unlock_ablock(c->info, c->block); + + c->block = NULL; + c->ab = NULL; + c->index = 0; + + r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); + if (r) { + DMERR("dm_btree_cursor_get_value failed"); + dm_btree_cursor_end(&c->cursor); + + } else { + r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); + if (r) { + DMERR("get_ablock failed"); + dm_btree_cursor_end(&c->cursor); + } + } + + return r; +} + +int dm_array_cursor_begin(struct dm_array_info *info, dm_block_t root, + struct dm_array_cursor *c) +{ + int r; + + memset(c, 0, sizeof(*c)); + c->info = info; + r = dm_btree_cursor_begin(&info->btree_info, root, true, &c->cursor); + if (r) { + DMERR("couldn't create btree cursor"); + return r; + } + + return load_ablock(c); +} +EXPORT_SYMBOL_GPL(dm_array_cursor_begin); + +void dm_array_cursor_end(struct dm_array_cursor *c) +{ + if (c->block) { + unlock_ablock(c->info, c->block); + dm_btree_cursor_end(&c->cursor); + } +} +EXPORT_SYMBOL_GPL(dm_array_cursor_end); + +int dm_array_cursor_next(struct dm_array_cursor *c) +{ + int r; + + if (!c->block) + return -ENODATA; + + c->index++; + + if (c->index >= le32_to_cpu(c->ab->nr_entries)) { + r = dm_btree_cursor_next(&c->cursor); + if (r) + return r; + + r = load_ablock(c); + if (r) + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_next); + +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) +{ + int r; + + do { + uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index; + + if (count < remaining) { + c->index += count; + return 0; + } + + count -= remaining; + r = dm_array_cursor_next(c); + + } while (!r); + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_skip); + +void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) +{ + *value_le = element_at(c->info, c->ab, c->index); +} +EXPORT_SYMBOL_GPL(dm_array_cursor_get_value); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 000000000..b6c7077c7 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_ARRAY_H +#define _LINUX_DM_ARRAY_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * The dm-array is a persistent version of an array. It packs the data + * more efficiently than a btree which will result in less disk space use, + * and a performance boost. The element get and set operations are still + * O(ln(n)), but with a much smaller constant. + * + * The value type structure is reused from the btree type to support proper + * reference counting of values. + * + * The arrays implicitly know their length, and bounds are checked for + * lookups and updated. It doesn't store this in an accessible place + * because it would waste a whole metadata block. Make sure you store the + * size along with the array root in your encompassing data. + * + * Array entries are indexed via an unsigned integer starting from zero. + * Arrays are not sparse; if you resize an array to have 'n' entries then + * 'n - 1' will be the last valid index. + * + * Typical use: + * + * a) initialise a dm_array_info structure. This describes the array + * values and ties it into a specific transaction manager. It holds no + * instance data; the same info can be used for many similar arrays if + * you wish. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an array. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty array with dm_array_empty(). + * + * Like the other data structures in this library, dm_array objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * c) resize an array with dm_array_resize(). + * + * d) Get a value from the array with dm_array_get_value(). + * + * e) Set a value in the array with dm_array_set_value(). + * + * f) Walk an array of values in index order with dm_array_walk(). More + * efficient than making many calls to dm_array_get_value(). + * + * g) Destroy the array with dm_array_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_array_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Describes an array. Don't initialise this structure yourself, use the + * init function below. + */ +struct dm_array_info { + struct dm_transaction_manager *tm; + struct dm_btree_value_type value_type; + struct dm_btree_info btree_info; +}; + +/* + * Sets up a dm_array_info structure. You don't need to do anything with + * this structure when you finish using it. + * + * info - the structure being filled in. + * tm - the transaction manager that should supervise this structure. + * vt - describes the leaf values. + */ +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * Create an empty, zero length array. + * + * info - describes the array + * root - on success this will be filled out with the root block + */ +int dm_array_empty(struct dm_array_info *info, dm_block_t *root); + +/* + * Resizes the array. + * + * info - describes the array + * root - the root block of the array on disk + * old_size - the caller is responsible for remembering the size of + * the array + * new_size - can be bigger or smaller than old_size + * value - if we're growing the array the new entries will have this value + * new_root - on success, points to the new root block + * + * If growing the inc function for 'value' will be called the appropriate + * number of times. So if the caller is holding a reference they may want + * to drop it. + */ +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Creates a new array populated with values provided by a callback + * function. This is more efficient than creating an empty array, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * Assumes 32bit values for now since it's only used by the cache hint + * array. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*value_fn)(uint32_t index, void *value_le, void *context); +int dm_array_new(struct dm_array_info *info, dm_block_t *root, + uint32_t size, value_fn fn, void *context); + +/* + * Frees a whole array. The value_type's decrement operation will be called + * for all values in the array + */ +int dm_array_del(struct dm_array_info *info, dm_block_t root); + +/* + * Lookup a value in the array + * + * info - describes the array + * root - root block of the array + * index - array index + * value - the value to be read. Will be in on-disk format of course. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value); + +/* + * Set an entry in the array. + * + * info - describes the array + * root - root block of the array + * index - array index + * value - value to be written to disk. Make sure you confirm the value is + * in on-disk format with__dm_bless_for_disk() before calling. + * new_root - the new root block + * + * The old value being overwritten will be decremented, the new value + * incremented. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Walk through all the entries in an array. + * + * info - describes the array + * root - root block of the array + * fn - called back for every element + * context - passed to the callback + */ +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t key, void *leaf), + void *context); + +/*----------------------------------------------------------------*/ + +/* + * Cursor api. + * + * This lets you iterate through all the entries in an array efficiently + * (it will preload metadata). + * + * I'm using a cursor, rather than a walk function with a callback because + * the cache target needs to iterate both the mapping and hint arrays in + * unison. + */ +struct dm_array_cursor { + struct dm_array_info *info; + struct dm_btree_cursor cursor; + + struct dm_block *block; + struct array_block *ab; + unsigned int index; +}; + +int dm_array_cursor_begin(struct dm_array_info *info, + dm_block_t root, struct dm_array_cursor *c); +void dm_array_cursor_end(struct dm_array_cursor *c); + +uint32_t dm_array_cursor_index(struct dm_array_cursor *c); +int dm_array_cursor_next(struct dm_array_cursor *c); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count); + +/* + * value_le is only valid while the cursor points at the current value. + */ +void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 000000000..625d93498 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-bitset.h" +#include "dm-transaction-manager.h" + +#include +#include + +#define DM_MSG_PREFIX "bitset" +#define BITS_PER_ARRAY_ENTRY 64 + +/*----------------------------------------------------------------*/ + +static struct dm_btree_value_type bitset_bvt = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL, +}; + +/*----------------------------------------------------------------*/ + +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info) +{ + dm_array_info_init(&info->array_info, tm, &bitset_bvt); + info->current_index_set = false; +} +EXPORT_SYMBOL_GPL(dm_disk_bitset_init); + +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) +{ + return dm_array_empty(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_empty); + +struct packer_context { + bit_value_fn fn; + unsigned int nr_bits; + void *context; +}; + +static int pack_bits(uint32_t index, void *value, void *context) +{ + int r; + struct packer_context *p = context; + unsigned int bit, nr = min(64u, p->nr_bits - (index * 64)); + uint64_t word = 0; + bool bv; + + for (bit = 0; bit < nr; bit++) { + r = p->fn(index * 64 + bit, &bv, p->context); + if (r) + return r; + + if (bv) + set_bit(bit, (unsigned long *) &word); + else + clear_bit(bit, (unsigned long *) &word); + } + + *((__le64 *) value) = cpu_to_le64(word); + + return 0; +} + +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context) +{ + struct packer_context p; + p.fn = fn; + p.nr_bits = size; + p.context = context; + + return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p); +} +EXPORT_SYMBOL_GPL(dm_bitset_new); + +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root) +{ + uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); + uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); + __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); + + __dm_bless_for_disk(&value); + return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, + &value, new_root); +} +EXPORT_SYMBOL_GPL(dm_bitset_resize); + +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) +{ + return dm_array_del(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_del); + +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root) +{ + int r; + __le64 value; + + if (!info->current_index_set || !info->dirty) + return 0; + + value = cpu_to_le64(info->current_bits); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&info->array_info, root, info->current_index, + &value, new_root); + if (r) + return r; + + info->current_index_set = false; + info->dirty = false; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_flush); + +static int read_bits(struct dm_disk_bitset *info, dm_block_t root, + uint32_t array_index) +{ + int r; + __le64 value; + + r = dm_array_get_value(&info->array_info, root, array_index, &value); + if (r) + return r; + + info->current_bits = le64_to_cpu(value); + info->current_index_set = true; + info->current_index = array_index; + info->dirty = false; + + return 0; +} + +static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned int array_index = index / BITS_PER_ARRAY_ENTRY; + + if (info->current_index_set) { + if (info->current_index == array_index) + return 0; + + r = dm_bitset_flush(info, root, new_root); + if (r) + return r; + } + + return read_bits(info, root, array_index); +} + +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned int b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + set_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_set_bit); + +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned int b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + clear_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); + +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result) +{ + int r; + unsigned int b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + *result = test_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_test_bit); + +static int cursor_next_array_entry(struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + r = dm_array_cursor_next(&c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index++; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + return 0; +} + +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + if (!nr_entries) + return -ENODATA; + + c->info = info; + c->entries_remaining = nr_entries; + + r = dm_array_cursor_begin(&info->array_info, root, &c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index = 0; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin); + +void dm_bitset_cursor_end(struct dm_bitset_cursor *c) +{ + return dm_array_cursor_end(&c->cursor); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_end); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c) +{ + int r = 0; + + if (!c->entries_remaining) + return -ENODATA; + + c->entries_remaining--; + if (++c->bit_index > 63) + r = cursor_next_array_entry(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_next); + +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count) +{ + int r; + __le64 *value; + uint32_t nr_array_skip; + uint32_t remaining_in_word = 64 - c->bit_index; + + if (c->entries_remaining < count) + return -ENODATA; + + if (count < remaining_in_word) { + c->bit_index += count; + c->entries_remaining -= count; + return 0; + + } else { + c->entries_remaining -= remaining_in_word; + count -= remaining_in_word; + } + + nr_array_skip = (count / 64) + 1; + r = dm_array_cursor_skip(&c->cursor, nr_array_skip); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->entries_remaining -= count; + c->array_index += nr_array_skip; + c->bit_index = count & 63; + c->current_bits = le64_to_cpu(*value); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip); + +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c) +{ + return test_bit(c->bit_index, (unsigned long *) &c->current_bits); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 000000000..df888da04 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BITSET_H +#define _LINUX_DM_BITSET_H + +#include "dm-array.h" + +/*----------------------------------------------------------------*/ + +/* + * This bitset type is a thin wrapper round a dm_array of 64bit words. It + * uses a tiny, one word cache to reduce the number of array lookups and so + * increase performance. + * + * Like the dm-array that it's based on, the caller needs to keep track of + * the size of the bitset separately. The underlying dm-array implicitly + * knows how many words it's storing and will return -ENODATA if you try + * and access an out of bounds word. However, an out of bounds bit in the + * final word will _not_ be detected, you have been warned. + * + * Bits are indexed from zero. + + * Typical use: + * + * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). + * This describes the bitset and includes the cache. It's not called it + * dm_bitset_info in line with other data structures because it does + * include instance data. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an bitset. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty bitset with dm_bitset_empty(). + * + * Like the other data structures in this library, dm_bitset objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * Even read operations may trigger the cache to be flushed and as such + * return a root for a new, updated bitset. + * + * c) resize a bitset with dm_bitset_resize(). + * + * d) Set a bit with dm_bitset_set_bit(). + * + * e) Clear a bit with dm_bitset_clear_bit(). + * + * f) Test a bit with dm_bitset_test_bit(). + * + * g) Flush all updates from the cache with dm_bitset_flush(). + * + * h) Destroy the bitset with dm_bitset_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_bitset_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Opaque object. Unlike dm_array_info, you should have one of these per + * bitset. Initialise with dm_disk_bitset_init(). + */ +struct dm_disk_bitset { + struct dm_array_info array_info; + + uint32_t current_index; + uint64_t current_bits; + + bool current_index_set:1; + bool dirty:1; +}; + +/* + * Sets up a dm_disk_bitset structure. You don't need to do anything with + * this structure when you finish using it. + * + * tm - the transaction manager that should supervise this structure + * info - the structure being initialised + */ +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info); + +/* + * Create an empty, zero length bitset. + * + * info - describes the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); + +/* + * Creates a new bitset populated with values provided by a callback + * function. This is more efficient than creating an empty bitset, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context); +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context); + +/* + * Resize the bitset. + * + * info - describes the bitset + * old_root - the root block of the array on disk + * old_nr_entries - the number of bits in the old bitset + * new_nr_entries - the number of bits you want in the new bitset + * default_value - the value for any new bits + * new_root - on success, points to the new root block + */ +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root); + +/* + * Frees the bitset. + */ +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); + +/* + * Set a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Clears a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Tests a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block (cached values may have been written) + * result - the bit value you're after + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result); + +/* + * Flush any cached changes to disk. + * + * info - describes the bitset + * root - the root block of the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root); + +struct dm_bitset_cursor { + struct dm_disk_bitset *info; + struct dm_array_cursor cursor; + + uint32_t entries_remaining; + uint32_t array_index; + uint32_t bit_index; + uint64_t current_bits; +}; + +/* + * Make sure you've flush any dm_disk_bitset and updated the root before + * using this. + */ +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c); +void dm_bitset_cursor_end(struct dm_bitset_cursor *c); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c); +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count); +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c new file mode 100644 index 000000000..2bbfbb704 --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -0,0 +1,656 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-block-manager.h" +#include "dm-persistent-data-internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "block manager" + +/*----------------------------------------------------------------*/ + +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + +/* + * This is a read/write semaphore with a couple of differences. + * + * i) There is a restriction on the number of concurrent read locks that + * may be held at once. This is just an implementation detail. + * + * ii) Recursive locking attempts are detected and return EINVAL. A stack + * trace is also emitted for the previous lock acquisition. + * + * iii) Priority is given to write locks. + */ +#define MAX_HOLDERS 4 +#define MAX_STACK 10 + +struct stack_store { + unsigned int nr_entries; + unsigned long entries[MAX_STACK]; +}; + +struct block_lock { + spinlock_t lock; + __s32 count; + struct list_head waiters; + struct task_struct *holders[MAX_HOLDERS]; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_store traces[MAX_HOLDERS]; +#endif +}; + +struct waiter { + struct list_head list; + struct task_struct *task; + int wants_write; +}; + +static unsigned int __find_holder(struct block_lock *lock, + struct task_struct *task) +{ + unsigned int i; + + for (i = 0; i < MAX_HOLDERS; i++) + if (lock->holders[i] == task) + break; + + BUG_ON(i == MAX_HOLDERS); + return i; +} + +/* call this *after* you increment lock->count */ +static void __add_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned int h = __find_holder(lock, NULL); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_store *t; +#endif + + get_task_struct(task); + lock->holders[h] = task; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + t = lock->traces + h; + t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2); +#endif +} + +/* call this *before* you decrement lock->count */ +static void __del_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned int h = __find_holder(lock, task); + lock->holders[h] = NULL; + put_task_struct(task); +} + +static int __check_holder(struct block_lock *lock) +{ + unsigned int i; + + for (i = 0; i < MAX_HOLDERS; i++) { + if (lock->holders[i] == current) { + DMERR("recursive lock detected in metadata"); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + DMERR("previously held here:"); + stack_trace_print(lock->traces[i].entries, + lock->traces[i].nr_entries, 4); + + DMERR("subsequent acquisition attempted here:"); + dump_stack(); +#endif + return -EINVAL; + } + } + + return 0; +} + +static void __wait(struct waiter *w) +{ + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!w->task) + break; + + schedule(); + } + + set_current_state(TASK_RUNNING); +} + +static void __wake_waiter(struct waiter *w) +{ + struct task_struct *task; + + list_del(&w->list); + task = w->task; + smp_mb(); + w->task = NULL; + wake_up_process(task); +} + +/* + * We either wake a few readers or a single writer. + */ +static void __wake_many(struct block_lock *lock) +{ + struct waiter *w, *tmp; + + BUG_ON(lock->count < 0); + list_for_each_entry_safe(w, tmp, &lock->waiters, list) { + if (lock->count >= MAX_HOLDERS) + return; + + if (w->wants_write) { + if (lock->count > 0) + return; /* still read locked */ + + lock->count = -1; + __add_holder(lock, w->task); + __wake_waiter(w); + return; + } + + lock->count++; + __add_holder(lock, w->task); + __wake_waiter(w); + } +} + +static void bl_init(struct block_lock *lock) +{ + int i; + + spin_lock_init(&lock->lock); + lock->count = 0; + INIT_LIST_HEAD(&lock->waiters); + for (i = 0; i < MAX_HOLDERS; i++) + lock->holders[i] = NULL; +} + +static int __available_for_read(struct block_lock *lock) +{ + return lock->count >= 0 && + lock->count < MAX_HOLDERS && + list_empty(&lock->waiters); +} + +static int bl_down_read(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + + w.task = current; + w.wants_write = 0; + list_add_tail(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + return 0; +} + +static int bl_down_read_nonblock(struct block_lock *lock) +{ + int r; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) + goto out; + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + r = 0; + } else + r = -EWOULDBLOCK; + +out: + spin_unlock(&lock->lock); + return r; +} + +static void bl_up_read(struct block_lock *lock) +{ + spin_lock(&lock->lock); + BUG_ON(lock->count <= 0); + __del_holder(lock, current); + --lock->count; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static int bl_down_write(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (lock->count == 0 && list_empty(&lock->waiters)) { + lock->count = -1; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + w.task = current; + w.wants_write = 1; + + /* + * Writers given priority. We know there's only one mutator in the + * system, so ignoring the ordering reversal. + */ + list_add(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + + return 0; +} + +static void bl_up_write(struct block_lock *lock) +{ + spin_lock(&lock->lock); + __del_holder(lock, current); + lock->count = 0; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static void report_recursive_bug(dm_block_t b, int r) +{ + if (r == -EINVAL) + DMERR("recursive acquisition of block %llu requested.", + (unsigned long long) b); +} + +#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + +#define bl_init(x) do { } while (0) +#define bl_down_read(x) 0 +#define bl_down_read_nonblock(x) 0 +#define bl_up_read(x) do { } while (0) +#define bl_down_write(x) 0 +#define bl_up_write(x) do { } while (0) +#define report_recursive_bug(x, y) do { } while (0) + +#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + +/*----------------------------------------------------------------*/ + +/* + * Block manager is currently implemented using dm-bufio. struct + * dm_block_manager and struct dm_block map directly onto a couple of + * structs in the bufio interface. I want to retain the freedom to move + * away from bufio in the future. So these structs are just cast within + * this .c file, rather than making it through to the public interface. + */ +static struct dm_buffer *to_buffer(struct dm_block *b) +{ + return (struct dm_buffer *) b; +} + +dm_block_t dm_block_location(struct dm_block *b) +{ + return dm_bufio_get_block_number(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_location); + +void *dm_block_data(struct dm_block *b) +{ + return dm_bufio_get_block_data(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_data); + +struct buffer_aux { + struct dm_block_validator *validator; + int write_locked; + +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + struct block_lock lock; +#endif +}; + +static void dm_block_manager_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + aux->validator = NULL; + bl_init(&aux->lock); +} + +static void dm_block_manager_write_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + if (aux->validator) { + aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf, + dm_bufio_get_block_size(dm_bufio_get_client(buf))); + } +} + +/*---------------------------------------------------------------- + * Public interface + *--------------------------------------------------------------*/ +struct dm_block_manager { + struct dm_bufio_client *bufio; + bool read_only:1; +}; + +struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, + unsigned int block_size, + unsigned int max_held_per_thread) +{ + int r; + struct dm_block_manager *bm; + + bm = kmalloc(sizeof(*bm), GFP_KERNEL); + if (!bm) { + r = -ENOMEM; + goto bad; + } + + bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, + sizeof(struct buffer_aux), + dm_block_manager_alloc_callback, + dm_block_manager_write_callback, + 0); + if (IS_ERR(bm->bufio)) { + r = PTR_ERR(bm->bufio); + kfree(bm); + goto bad; + } + + bm->read_only = false; + + return bm; + +bad: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_block_manager_create); + +void dm_block_manager_destroy(struct dm_block_manager *bm) +{ + dm_bufio_client_destroy(bm->bufio); + kfree(bm); +} +EXPORT_SYMBOL_GPL(dm_block_manager_destroy); + +void dm_block_manager_reset(struct dm_block_manager *bm) +{ + dm_bufio_client_reset(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_block_manager_reset); + +unsigned int dm_bm_block_size(struct dm_block_manager *bm) +{ + return dm_bufio_get_block_size(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_bm_block_size); + +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) +{ + return dm_bufio_get_device_size(bm->bufio); +} + +static int dm_bm_validate_buffer(struct dm_block_manager *bm, + struct dm_buffer *buf, + struct buffer_aux *aux, + struct dm_block_validator *v) +{ + if (unlikely(!aux->validator)) { + int r; + if (!v) + return 0; + r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio)); + if (unlikely(r)) { + DMERR_LIMIT("%s validator check failed for block %llu", v->name, + (unsigned long long) dm_bufio_get_block_number(buf)); + return r; + } + aux->validator = v; + } else { + if (unlikely(aux->validator != v)) { + DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? v->name : "NULL", + (unsigned long long) dm_bufio_get_block_number(buf)); + return -EINVAL; + } + } + + return 0; +} +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); + if (IS_ERR(p)) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read(&aux->lock); + if (unlikely(r)) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_read_lock); + +int dm_bm_write_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + if (dm_bm_is_read_only(bm)) + return -EPERM; + + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); + if (IS_ERR(p)) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 1; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_write(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_write_lock); + +int dm_bm_read_try_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); + if (IS_ERR(p)) + return PTR_ERR(p); + if (unlikely(!p)) + return -EWOULDBLOCK; + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read_nonblock(&aux->lock); + if (r < 0) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} + +int dm_bm_write_lock_zero(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + struct buffer_aux *aux; + void *p; + + if (dm_bm_is_read_only(bm)) + return -EPERM; + + p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); + if (IS_ERR(p)) + return PTR_ERR(p); + + memset(p, 0, dm_bm_block_size(bm)); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + return r; + } + + aux->write_locked = 1; + aux->validator = v; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); + +void dm_bm_unlock(struct dm_block *b) +{ + struct buffer_aux *aux; + aux = dm_bufio_get_aux_data(to_buffer(b)); + + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); + + dm_bufio_release(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_bm_unlock); + +int dm_bm_flush(struct dm_block_manager *bm) +{ + if (dm_bm_is_read_only(bm)) + return -EPERM; + + return dm_bufio_write_dirty_buffers(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_bm_flush); + +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) +{ + dm_bufio_prefetch(bm->bufio, b, 1); +} + +bool dm_bm_is_read_only(struct dm_block_manager *bm) +{ + return (bm ? bm->read_only : true); +} +EXPORT_SYMBOL_GPL(dm_bm_is_read_only); + +void dm_bm_set_read_only(struct dm_block_manager *bm) +{ + if (bm) + bm->read_only = true; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_only); + +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + if (bm) + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) +{ + return crc32c(~(u32) 0, data, len) ^ init_xor; +} +EXPORT_SYMBOL_GPL(dm_bm_checksum); + +/*----------------------------------------------------------------*/ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_DESCRIPTION("Immutable metadata library for dm"); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h new file mode 100644 index 000000000..4371d85d3 --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_BLOCK_MANAGER_H +#define _LINUX_DM_BLOCK_MANAGER_H + +#include +#include + +/*----------------------------------------------------------------*/ + +/* + * Block number. + */ +typedef uint64_t dm_block_t; +struct dm_block; + +dm_block_t dm_block_location(struct dm_block *b); +void *dm_block_data(struct dm_block *b); + +/*----------------------------------------------------------------*/ + +/* + * @name should be a unique identifier for the block manager, no longer + * than 32 chars. + * + * @max_held_per_thread should be the maximum number of locks, read or + * write, that an individual thread holds at any one time. + */ +struct dm_block_manager; +struct dm_block_manager *dm_block_manager_create( + struct block_device *bdev, unsigned int block_size, + unsigned int max_held_per_thread); +void dm_block_manager_destroy(struct dm_block_manager *bm); +void dm_block_manager_reset(struct dm_block_manager *bm); + +unsigned int dm_bm_block_size(struct dm_block_manager *bm); +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); + +/*----------------------------------------------------------------*/ + +/* + * The validator allows the caller to verify newly-read data and modify + * the data just before writing, e.g. to calculate checksums. It's + * important to be consistent with your use of validators. The only time + * you can change validators is if you call dm_bm_write_lock_zero. + */ +struct dm_block_validator { + const char *name; + void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + + /* + * Return 0 if the checksum is valid or < 0 on error. + */ + int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); +}; + +/*----------------------------------------------------------------*/ + +/* + * You can have multiple concurrent readers or a single writer holding a + * block lock. + */ + +/* + * dm_bm_lock() locks a block and returns through @result a pointer to + * memory that holds a copy of that block. If you have write-locked the + * block then any changes you make to memory pointed to by @result will be + * written back to the disk sometime after dm_bm_unlock is called. + */ +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * The *_try_lock variants return -EWOULDBLOCK if the block isn't + * available immediately. + */ +int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * Use dm_bm_write_lock_zero() when you know you're going to + * overwrite the block completely. It saves a disk read. + */ +int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +void dm_bm_unlock(struct dm_block *b); + +/* + * It's a common idiom to have a superblock that should be committed last. + * + * @superblock should be write-locked on entry. It will be unlocked during + * this function. All dirty blocks are guaranteed to be written and flushed + * before the superblock. + * + * This method always blocks. + */ +int dm_bm_flush(struct dm_block_manager *bm); + +/* + * Request data is prefetched into the cache. + */ +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); + +/* + * Switches the bm to a read only mode. Once read-only mode + * has been entered the following functions will return -EPERM. + * + * dm_bm_write_lock + * dm_bm_write_lock_zero + * dm_bm_flush_and_unlock + * + * Additionally you should not use dm_bm_unlock_move, however no error will + * be returned if you do. + */ +bool dm_bm_is_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BLOCK_MANAGER_H */ diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h new file mode 100644 index 000000000..893edb426 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BTREE_INTERNAL_H +#define DM_BTREE_INTERNAL_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * We'll need 2 accessor functions for n->csum and n->blocknr + * to support dm-btree-spine.c in that case. + */ + +enum node_flags { + INTERNAL_NODE = 1, + LEAF_NODE = 1 << 1 +}; + +/* + * Every btree node begins with this structure. Make sure it's a multiple + * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned. + */ +struct node_header { + __le32 csum; + __le32 flags; + __le64 blocknr; /* Block this node is supposed to live in. */ + + __le32 nr_entries; + __le32 max_entries; + __le32 value_size; + __le32 padding; +} __attribute__((packed, aligned(8))); + +struct btree_node { + struct node_header header; + __le64 keys[]; +} __attribute__((packed, aligned(8))); + + +/* + * Locks a block using the btree node validator. + */ +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result); + +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, + struct dm_btree_value_type *vt); + +int new_block(struct dm_btree_info *info, struct dm_block **result); +void unlock_block(struct dm_btree_info *info, struct dm_block *b); + +/* + * Spines keep track of the rolling locks. There are 2 variants, read-only + * and one that uses shadowing. These are separate structs to allow the + * type checker to spot misuse, for example accidentally calling read_lock + * on a shadow spine. + */ +struct ro_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; +}; + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); +void exit_ro_spine(struct ro_spine *s); +int ro_step(struct ro_spine *s, dm_block_t new_child); +void ro_pop(struct ro_spine *s); +struct btree_node *ro_node(struct ro_spine *s); + +struct shadow_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; + + dm_block_t root; +}; + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info); +void exit_shadow_spine(struct shadow_spine *s); + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt); + +/* + * The spine must have at least one entry before calling this. + */ +struct dm_block *shadow_current(struct shadow_spine *s); + +/* + * The spine must have at least two entries before calling this. + */ +struct dm_block *shadow_parent(struct shadow_spine *s); + +int shadow_has_parent(struct shadow_spine *s); + +dm_block_t shadow_root(struct shadow_spine *s); + +/* + * Some inlines. + */ +static inline __le64 *key_ptr(struct btree_node *n, uint32_t index) +{ + return n->keys + index; +} + +static inline void *value_base(struct btree_node *n) +{ + return &n->keys[le32_to_cpu(n->header.max_entries)]; +} + +static inline void *value_ptr(struct btree_node *n, uint32_t index) +{ + uint32_t value_size = le32_to_cpu(n->header.value_size); + return value_base(n) + (value_size * index); +} + +/* + * Assumes the values are suitably-aligned and converts to core format. + */ +static inline uint64_t value64(struct btree_node *n, uint32_t index) +{ + __le64 *values_le = value_base(n); + + return le64_to_cpu(values_le[index]); +} + +/* + * Searching for a key within a single node. + */ +int lower_bound(struct btree_node *n, uint64_t key); + +extern struct dm_block_validator btree_node_validator; + +/* + * Value type for upper levels of multi-level btrees. + */ +extern void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * This returns a shadowed btree leaf that you may modify. In practise + * this means overwrites only, since an insert could cause a node to + * be split. Useful if you need access to the old value to calculate the + * new one. + * + * This only works with single level btrees. The given key must be present in + * the tree, otherwise -EINVAL will be returned. + */ +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf); + +#endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c new file mode 100644 index 000000000..ac213138b --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -0,0 +1,759 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree.h" +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include +#include + +#define DM_MSG_PREFIX "btree" + +/* + * Removing an entry from a btree + * ============================== + * + * A very important constraint for our btree is that no node, except the + * root, may have fewer than a certain number of entries. + * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). + * + * Ensuring this is complicated by the way we want to only ever hold the + * locks on 2 nodes concurrently, and only change nodes in a top to bottom + * fashion. + * + * Each node may have a left or right sibling. When decending the spine, + * if a node contains only MIN_ENTRIES then we try and increase this to at + * least MIN_ENTRIES + 1. We do this in the following ways: + * + * [A] No siblings => this can only happen if the node is the root, in which + * case we copy the childs contents over the root. + * + * [B] No left sibling + * ==> rebalance(node, right sibling) + * + * [C] No right sibling + * ==> rebalance(left sibling, node) + * + * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD + * ==> delete node adding it's contents to left and right + * + * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD + * ==> rebalance(left, node, right) + * + * After these operations it's possible that the our original node no + * longer contains the desired sub tree. For this reason this rebalancing + * is performed on the children of the current node. This also avoids + * having a special case for the root. + * + * Once this rebalancing has occurred we can then step into the child node + * for internal nodes. Or delete the entry for leaf nodes. + */ + +/* + * Some little utilities for moving node data around. + */ +static void node_shift(struct btree_node *n, int shift) +{ + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + uint32_t value_size = le32_to_cpu(n->header.value_size); + + if (shift < 0) { + shift = -shift; + BUG_ON(shift > nr_entries); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); + memmove(key_ptr(n, 0), + key_ptr(n, shift), + (nr_entries - shift) * sizeof(__le64)); + memmove(value_ptr(n, 0), + value_ptr(n, shift), + (nr_entries - shift) * value_size); + } else { + BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); + memmove(key_ptr(n, shift), + key_ptr(n, 0), + nr_entries * sizeof(__le64)); + memmove(value_ptr(n, shift), + value_ptr(n, 0), + nr_entries * value_size); + } +} + +static int node_copy(struct btree_node *left, struct btree_node *right, int shift) +{ + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t value_size = le32_to_cpu(left->header.value_size); + if (value_size != le32_to_cpu(right->header.value_size)) { + DMERR("mismatched value size"); + return -EILSEQ; + } + + if (shift < 0) { + shift = -shift; + + if (nr_left + shift > le32_to_cpu(left->header.max_entries)) { + DMERR("bad shift"); + return -EINVAL; + } + + memcpy(key_ptr(left, nr_left), + key_ptr(right, 0), + shift * sizeof(__le64)); + memcpy(value_ptr(left, nr_left), + value_ptr(right, 0), + shift * value_size); + } else { + if (shift > le32_to_cpu(right->header.max_entries)) { + DMERR("bad shift"); + return -EINVAL; + } + + memcpy(key_ptr(right, 0), + key_ptr(left, nr_left - shift), + shift * sizeof(__le64)); + memcpy(value_ptr(right, 0), + value_ptr(left, nr_left - shift), + shift * value_size); + } + return 0; +} + +/* + * Delete a specific entry from a leaf node. + */ +static void delete_at(struct btree_node *n, unsigned int index) +{ + unsigned int nr_entries = le32_to_cpu(n->header.nr_entries); + unsigned int nr_to_copy = nr_entries - (index + 1); + uint32_t value_size = le32_to_cpu(n->header.value_size); + BUG_ON(index >= nr_entries); + + if (nr_to_copy) { + memmove(key_ptr(n, index), + key_ptr(n, index + 1), + nr_to_copy * sizeof(__le64)); + + memmove(value_ptr(n, index), + value_ptr(n, index + 1), + nr_to_copy * value_size); + } + + n->header.nr_entries = cpu_to_le32(nr_entries - 1); +} + +static unsigned int merge_threshold(struct btree_node *n) +{ + return le32_to_cpu(n->header.max_entries) / 3; +} + +struct child { + unsigned int index; + struct dm_block *block; + struct btree_node *n; +}; + +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, + unsigned int index, struct child *result) +{ + int r, inc; + dm_block_t root; + + result->index = index; + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + &result->block, &inc); + if (r) + return r; + + result->n = dm_block_data(result->block); + + if (inc) + inc_children(info->tm, result->n, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(result->block)); + + return 0; +} + +static void exit_child(struct dm_btree_info *info, struct child *c) +{ + dm_tm_unlock(info->tm, c->block); +} + +static int shift(struct btree_node *left, struct btree_node *right, int count) +{ + int r; + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); + + if (max_entries != r_max_entries) { + DMERR("node max_entries mismatch"); + return -EILSEQ; + } + + if (nr_left - count > max_entries) { + DMERR("node shift out of bounds"); + return -EINVAL; + } + + if (nr_right + count > max_entries) { + DMERR("node shift out of bounds"); + return -EINVAL; + } + + if (!count) + return 0; + + if (count > 0) { + node_shift(right, count); + r = node_copy(left, right, count); + if (r) + return r; + } else { + r = node_copy(left, right, count); + if (r) + return r; + node_shift(right, count); + } + + left->header.nr_entries = cpu_to_le32(nr_left - count); + right->header.nr_entries = cpu_to_le32(nr_right + count); + + return 0; +} + +static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *r) +{ + int ret; + struct btree_node *left = l->n; + struct btree_node *right = r->n; + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + /* + * Ensure the number of entries in each child will be greater + * than or equal to (max_entries / 3 + 1), so no matter which + * child is used for removal, the number will still be not + * less than (max_entries / 3). + */ + unsigned int threshold = 2 * (merge_threshold(left) + 1); + + if (nr_left + nr_right < threshold) { + /* + * Merge + */ + node_copy(left, right, -nr_right); + left->header.nr_entries = cpu_to_le32(nr_left + nr_right); + delete_at(parent, r->index); + + /* + * We need to decrement the right block, but not it's + * children, since they're still referenced by left. + */ + dm_tm_dec(info->tm, dm_block_location(r->block)); + } else { + /* + * Rebalance. + */ + unsigned int target_left = (nr_left + nr_right) / 2; + ret = shift(left, right, nr_left - target_left); + if (ret) + return ret; + *key_ptr(parent, r->index) = right->keys[0]; + } + return 0; +} + +static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, unsigned int left_index) +{ + int r; + struct btree_node *parent; + struct child left, right; + + parent = dm_block_data(shadow_current(s)); + + r = init_child(info, vt, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, vt, parent, left_index + 1, &right); + if (r) { + exit_child(info, &left); + return r; + } + + r = __rebalance2(info, parent, &left, &right); + + exit_child(info, &left); + exit_child(info, &right); + + return r; +} + +/* + * We dump as many entries from center as possible into left, then the rest + * in right, then rebalance2. This wastes some cpu, but I want something + * simple atm. + */ +static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned int shift = min(max_entries - nr_left, nr_center); + + if (nr_left + shift > max_entries) { + DMERR("node shift out of bounds"); + return -EINVAL; + } + + node_copy(left, center, -shift); + left->header.nr_entries = cpu_to_le32(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + + if ((nr_right + shift) > max_entries) { + DMERR("node shift out of bounds"); + return -EINVAL; + } + + node_shift(right, shift); + node_copy(center, right, shift); + right->header.nr_entries = cpu_to_le32(nr_right + shift); + } + *key_ptr(parent, r->index) = right->keys[0]; + + delete_at(parent, c->index); + r->index--; + + dm_tm_dec(info->tm, dm_block_location(c->block)); + return __rebalance2(info, parent, l, r); +} + +/* + * Redistributes entries among 3 sibling nodes. + */ +static int redistribute3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + int s, ret; + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned int total = nr_left + nr_center + nr_right; + unsigned int target_right = total / 3; + unsigned int remainder = (target_right * 3) != total; + unsigned int target_left = target_right + remainder; + + BUG_ON(target_left > max_entries); + BUG_ON(target_right > max_entries); + + if (nr_left < nr_right) { + s = nr_left - target_left; + + if (s < 0 && nr_center < -s) { + /* not enough in central node */ + ret = shift(left, center, -nr_center); + if (ret) + return ret; + + s += nr_center; + ret = shift(left, right, s); + if (ret) + return ret; + + nr_right += s; + } else { + ret = shift(left, center, s); + if (ret) + return ret; + } + + ret = shift(center, right, target_right - nr_right); + if (ret) + return ret; + } else { + s = target_right - nr_right; + if (s > 0 && nr_center < s) { + /* not enough in central node */ + ret = shift(center, right, nr_center); + if (ret) + return ret; + s -= nr_center; + ret = shift(left, right, s); + if (ret) + return ret; + nr_left -= s; + } else { + ret = shift(center, right, s); + if (ret) + return ret; + } + + ret = shift(left, center, nr_left - target_left); + if (ret) + return ret; + } + + *key_ptr(parent, c->index) = center->keys[0]; + *key_ptr(parent, r->index) = right->keys[0]; + return 0; +} + +static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r) +{ + struct btree_node *left = l->n; + struct btree_node *center = c->n; + struct btree_node *right = r->n; + + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_center = le32_to_cpu(center->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + + unsigned int threshold = merge_threshold(left) * 4 + 1; + + if ((left->header.max_entries != center->header.max_entries) || + (center->header.max_entries != right->header.max_entries)) { + DMERR("bad btree metadata, max_entries differ"); + return -EILSEQ; + } + + if ((nr_left + nr_center + nr_right) < threshold) { + return delete_center_node(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); + } + + return redistribute3(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); +} + +static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, unsigned int left_index) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_current(s)); + struct child left, center, right; + + /* + * FIXME: fill out an array? + */ + r = init_child(info, vt, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, vt, parent, left_index + 1, ¢er); + if (r) { + exit_child(info, &left); + return r; + } + + r = init_child(info, vt, parent, left_index + 2, &right); + if (r) { + exit_child(info, &left); + exit_child(info, ¢er); + return r; + } + + r = __rebalance3(info, parent, &left, ¢er, &right); + + exit_child(info, &left); + exit_child(info, ¢er); + exit_child(info, &right); + + return r; +} + +static int rebalance_children(struct shadow_spine *s, + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) +{ + int i, r, has_left_sibling, has_right_sibling; + struct btree_node *n; + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.nr_entries) == 1) { + struct dm_block *child; + dm_block_t b = value64(n, 0); + + r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); + if (r) + return r; + + memcpy(n, dm_block_data(child), + dm_bm_block_size(dm_tm_get_bm(info->tm))); + + dm_tm_dec(info->tm, dm_block_location(child)); + dm_tm_unlock(info->tm, child); + return 0; + } + + i = lower_bound(n, key); + if (i < 0) + return -ENODATA; + + has_left_sibling = i > 0; + has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); + + if (!has_left_sibling) + r = rebalance2(s, info, vt, i); + + else if (!has_right_sibling) + r = rebalance2(s, info, vt, i - 1); + + else + r = rebalance3(s, info, vt, i - 1); + + return r; +} + +static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index) +{ + int i = lower_bound(n, key); + + if ((i < 0) || + (i >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[i]) != key)) + return -ENODATA; + + *index = i; + + return 0; +} + +/* + * Prepares for removal from one level of the hierarchy. The caller must + * call delete_at() to remove the entry at index. + */ +static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, dm_block_t root, + uint64_t key, unsigned int *index) +{ + int i = *index, r; + struct btree_node *n; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + break; + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s)) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + r = rebalance_children(s, info, vt, key); + if (r) + break; + + n = dm_block_data(shadow_current(s)); + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + i = lower_bound(n, key); + + /* + * We know the key is present, or else + * rebalance_children would have returned + * -ENODATA + */ + root = value64(n, i); + } + + return r; +} + +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root) +{ + unsigned int level, last_level = info->levels - 1; + int index = 0, r = 0; + struct shadow_spine spine; + struct btree_node *n; + struct dm_btree_value_type le64_vt; + + init_le64_type(info->tm, &le64_vt); + init_shadow_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = remove_raw(&spine, info, + (level == last_level ? + &info->value_type : &le64_vt), + root, keys[level], (unsigned int *)&index); + if (r < 0) + break; + + n = dm_block_data(shadow_current(&spine)); + if (level != last_level) { + root = value64(n, index); + continue; + } + + BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); + + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(n, index), 1); + + delete_at(n, index); + } + + if (!r) + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_remove); + +/*----------------------------------------------------------------*/ + +static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, dm_block_t root, + uint64_t key, int *index) +{ + int i = *index, r; + struct btree_node *n; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + break; + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s)) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.flags) & LEAF_NODE) { + *index = lower_bound(n, key); + return 0; + } + + r = rebalance_children(s, info, vt, key); + if (r) + break; + + n = dm_block_data(shadow_current(s)); + if (le32_to_cpu(n->header.flags) & LEAF_NODE) { + *index = lower_bound(n, key); + return 0; + } + + i = lower_bound(n, key); + + /* + * We know the key is present, or else + * rebalance_children would have returned + * -ENODATA + */ + root = value64(n, i); + } + + return r; +} + +static int remove_one(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t end_key, + dm_block_t *new_root, unsigned int *nr_removed) +{ + unsigned int level, last_level = info->levels - 1; + int index = 0, r = 0; + struct shadow_spine spine; + struct btree_node *n; + struct dm_btree_value_type le64_vt; + uint64_t k; + + init_le64_type(info->tm, &le64_vt); + init_shadow_spine(&spine, info); + for (level = 0; level < last_level; level++) { + r = remove_raw(&spine, info, &le64_vt, + root, keys[level], (unsigned int *) &index); + if (r < 0) + goto out; + + n = dm_block_data(shadow_current(&spine)); + root = value64(n, index); + } + + r = remove_nearest(&spine, info, &info->value_type, + root, keys[last_level], &index); + if (r < 0) + goto out; + + n = dm_block_data(shadow_current(&spine)); + + if (index < 0) + index = 0; + + if (index >= le32_to_cpu(n->header.nr_entries)) { + r = -ENODATA; + goto out; + } + + k = le64_to_cpu(n->keys[index]); + if (k >= keys[last_level] && k < end_key) { + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(n, index), 1); + + delete_at(n, index); + keys[last_level] = k + 1ull; + + } else + r = -ENODATA; + +out: + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return r; +} + +int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, + uint64_t *first_key, uint64_t end_key, + dm_block_t *new_root, unsigned int *nr_removed) +{ + int r; + + *nr_removed = 0; + do { + r = remove_one(info, root, first_key, end_key, &root, nr_removed); + if (!r) + (*nr_removed)++; + } while (!r); + + *new_root = root; + return r == -ENODATA ? 0 : r; +} +EXPORT_SYMBOL_GPL(dm_btree_remove_leaves); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c new file mode 100644 index 000000000..45a39d4f1 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include + +#define DM_MSG_PREFIX "btree spine" + +/*----------------------------------------------------------------*/ + +#define BTREE_CSUM_XOR 121107 + +static void node_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct btree_node *n = dm_block_data(b); + struct node_header *h = &n->header; + + h->blocknr = cpu_to_le64(dm_block_location(b)); + h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); +} + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct btree_node *n = dm_block_data(b); + struct node_header *h = &n->header; + size_t value_size; + __le32 csum_disk; + uint32_t flags, nr_entries, max_entries; + + if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { + DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(h->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + if (csum_disk != h->csum) { + DMERR_LIMIT("node_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); + return -EILSEQ; + } + + nr_entries = le32_to_cpu(h->nr_entries); + max_entries = le32_to_cpu(h->max_entries); + value_size = le32_to_cpu(h->value_size); + + if (sizeof(struct node_header) + + (sizeof(__le64) + value_size) * max_entries > block_size) { + DMERR_LIMIT("node_check failed: max_entries too large"); + return -EILSEQ; + } + + if (nr_entries > max_entries) { + DMERR_LIMIT("node_check failed: too many entries"); + return -EILSEQ; + } + + /* + * The node must be either INTERNAL or LEAF. + */ + flags = le32_to_cpu(h->flags); + if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { + DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF"); + return -EILSEQ; + } + + return 0; +} + +struct dm_block_validator btree_node_validator = { + .name = "btree_node", + .prepare_for_write = node_prepare_for_write, + .check = node_check +}; + +/*----------------------------------------------------------------*/ + +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result) +{ + return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); +} + +static int bn_shadow(struct dm_btree_info *info, dm_block_t orig, + struct dm_btree_value_type *vt, + struct dm_block **result) +{ + int r, inc; + + r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator, + result, &inc); + if (!r && inc) + inc_children(info->tm, dm_block_data(*result), vt); + + return r; +} + +int new_block(struct dm_btree_info *info, struct dm_block **result) +{ + return dm_tm_new_block(info->tm, &btree_node_validator, result); +} + +void unlock_block(struct dm_btree_info *info, struct dm_block *b) +{ + dm_tm_unlock(info->tm, b); +} + +/*----------------------------------------------------------------*/ + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; + s->nodes[0] = NULL; + s->nodes[1] = NULL; +} + +void exit_ro_spine(struct ro_spine *s) +{ + int i; + + for (i = 0; i < s->count; i++) { + unlock_block(s->info, s->nodes[i]); + } +} + +int ro_step(struct ro_spine *s, dm_block_t new_child) +{ + int r; + + if (s->count == 2) { + unlock_block(s->info, s->nodes[0]); + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_read_lock(s->info, new_child, s->nodes + s->count); + if (!r) + s->count++; + + return r; +} + +void ro_pop(struct ro_spine *s) +{ + BUG_ON(!s->count); + --s->count; + unlock_block(s->info, s->nodes[s->count]); +} + +struct btree_node *ro_node(struct ro_spine *s) +{ + struct dm_block *block; + + BUG_ON(!s->count); + block = s->nodes[s->count - 1]; + + return dm_block_data(block); +} + +/*----------------------------------------------------------------*/ + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; +} + +void exit_shadow_spine(struct shadow_spine *s) +{ + int i; + + for (i = 0; i < s->count; i++) { + unlock_block(s->info, s->nodes[i]); + } +} + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt) +{ + int r; + + if (s->count == 2) { + unlock_block(s->info, s->nodes[0]); + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_shadow(s->info, b, vt, s->nodes + s->count); + if (!r) { + if (!s->count) + s->root = dm_block_location(s->nodes[0]); + + s->count++; + } + + return r; +} + +struct dm_block *shadow_current(struct shadow_spine *s) +{ + BUG_ON(!s->count); + + return s->nodes[s->count - 1]; +} + +struct dm_block *shadow_parent(struct shadow_spine *s) +{ + BUG_ON(s->count != 2); + + return s->count == 2 ? s->nodes[0] : NULL; +} + +int shadow_has_parent(struct shadow_spine *s) +{ + return s->count >= 2; +} + +dm_block_t shadow_root(struct shadow_spine *s) +{ + return s->root; +} + +static void le64_inc(void *context, const void *value_le, unsigned int count) +{ + dm_tm_with_runs(context, value_le, count, dm_tm_inc_range); +} + +static void le64_dec(void *context, const void *value_le, unsigned int count) +{ + dm_tm_with_runs(context, value_le, count, dm_tm_dec_range); +} + +static int le64_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + return v1_le == v2_le; +} + +void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + vt->context = tm; + vt->size = sizeof(__le64); + vt->inc = le64_inc; + vt->dec = le64_dec; + vt->equal = le64_equal; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c new file mode 100644 index 000000000..1cc783d70 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.c @@ -0,0 +1,1625 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree-internal.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include +#include + +#define DM_MSG_PREFIX "btree" + +/*---------------------------------------------------------------- + * Array manipulation + *--------------------------------------------------------------*/ +static void memcpy_disk(void *dest, const void *src, size_t len) + __dm_written_to_disk(src) +{ + memcpy(dest, src, len); + __dm_unbless_for_disk(src); +} + +static void array_insert(void *base, size_t elt_size, unsigned int nr_elts, + unsigned int index, void *elt) + __dm_written_to_disk(elt) +{ + if (index < nr_elts) + memmove(base + (elt_size * (index + 1)), + base + (elt_size * index), + (nr_elts - index) * elt_size); + + memcpy_disk(base + (elt_size * index), elt, elt_size); +} + +/*----------------------------------------------------------------*/ + +/* makes the assumption that no two keys are the same. */ +static int bsearch(struct btree_node *n, uint64_t key, int want_hi) +{ + int lo = -1, hi = le32_to_cpu(n->header.nr_entries); + + while (hi - lo > 1) { + int mid = lo + ((hi - lo) / 2); + uint64_t mid_key = le64_to_cpu(n->keys[mid]); + + if (mid_key == key) + return mid; + + if (mid_key < key) + lo = mid; + else + hi = mid; + } + + return want_hi ? hi : lo; +} + +int lower_bound(struct btree_node *n, uint64_t key) +{ + return bsearch(n, key, 0); +} + +static int upper_bound(struct btree_node *n, uint64_t key) +{ + return bsearch(n, key, 1); +} + +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, + struct dm_btree_value_type *vt) +{ + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) + dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range); + + else if (vt->inc) + vt->inc(vt->context, value_ptr(n, 0), nr_entries); +} + +static int insert_at(size_t value_size, struct btree_node *node, unsigned int index, + uint64_t key, void *value) + __dm_written_to_disk(value) +{ + uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + uint32_t max_entries = le32_to_cpu(node->header.max_entries); + __le64 key_le = cpu_to_le64(key); + + if (index > nr_entries || + index >= max_entries || + nr_entries >= max_entries) { + DMERR("too many entries in btree node for insert"); + __dm_unbless_for_disk(value); + return -ENOMEM; + } + + __dm_bless_for_disk(&key_le); + + array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le); + array_insert(value_base(node), value_size, nr_entries, index, value); + node->header.nr_entries = cpu_to_le32(nr_entries + 1); + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * We want 3n entries (for some n). This works more nicely for repeated + * insert remove loops than (2n + 1). + */ +static uint32_t calc_max_entries(size_t value_size, size_t block_size) +{ + uint32_t total, n; + size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */ + + block_size -= sizeof(struct node_header); + total = block_size / elt_size; + n = total / 3; /* rounds down */ + + return 3 * n; +} + +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) +{ + int r; + struct dm_block *b; + struct btree_node *n; + size_t block_size; + uint32_t max_entries; + + r = new_block(info, &b); + if (r < 0) + return r; + + block_size = dm_bm_block_size(dm_tm_get_bm(info->tm)); + max_entries = calc_max_entries(info->value_type.size, block_size); + + n = dm_block_data(b); + memset(n, 0, block_size); + n->header.flags = cpu_to_le32(LEAF_NODE); + n->header.nr_entries = cpu_to_le32(0); + n->header.max_entries = cpu_to_le32(max_entries); + n->header.value_size = cpu_to_le32(info->value_type.size); + + *root = dm_block_location(b); + unlock_block(info, b); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_btree_empty); + +/*----------------------------------------------------------------*/ + +/* + * Deletion uses a recursive algorithm, since we have limited stack space + * we explicitly manage our own stack on the heap. + */ +#define MAX_SPINE_DEPTH 64 +struct frame { + struct dm_block *b; + struct btree_node *n; + unsigned int level; + unsigned int nr_children; + unsigned int current_child; +}; + +struct del_stack { + struct dm_btree_info *info; + struct dm_transaction_manager *tm; + int top; + struct frame spine[MAX_SPINE_DEPTH]; +}; + +static int top_frame(struct del_stack *s, struct frame **f) +{ + if (s->top < 0) { + DMERR("btree deletion stack empty"); + return -EINVAL; + } + + *f = s->spine + s->top; + + return 0; +} + +static int unprocessed_frames(struct del_stack *s) +{ + return s->top >= 0; +} + +static void prefetch_children(struct del_stack *s, struct frame *f) +{ + unsigned int i; + struct dm_block_manager *bm = dm_tm_get_bm(s->tm); + + for (i = 0; i < f->nr_children; i++) + dm_bm_prefetch(bm, value64(f->n, i)); +} + +static bool is_internal_level(struct dm_btree_info *info, struct frame *f) +{ + return f->level < (info->levels - 1); +} + +static int push_frame(struct del_stack *s, dm_block_t b, unsigned int level) +{ + int r; + uint32_t ref_count; + + if (s->top >= MAX_SPINE_DEPTH - 1) { + DMERR("btree deletion stack out of memory"); + return -ENOMEM; + } + + r = dm_tm_ref(s->tm, b, &ref_count); + if (r) + return r; + + if (ref_count > 1) + /* + * This is a shared node, so we can just decrement it's + * reference counter and leave the children. + */ + dm_tm_dec(s->tm, b); + + else { + uint32_t flags; + struct frame *f = s->spine + ++s->top; + + r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); + if (r) { + s->top--; + return r; + } + + f->n = dm_block_data(f->b); + f->level = level; + f->nr_children = le32_to_cpu(f->n->header.nr_entries); + f->current_child = 0; + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) + prefetch_children(s, f); + } + + return 0; +} + +static void pop_frame(struct del_stack *s) +{ + struct frame *f = s->spine + s->top--; + + dm_tm_dec(s->tm, dm_block_location(f->b)); + dm_tm_unlock(s->tm, f->b); +} + +static void unlock_all_frames(struct del_stack *s) +{ + struct frame *f; + + while (unprocessed_frames(s)) { + f = s->spine + s->top--; + dm_tm_unlock(s->tm, f->b); + } +} + +int dm_btree_del(struct dm_btree_info *info, dm_block_t root) +{ + int r; + struct del_stack *s; + + /* + * dm_btree_del() is called via an ioctl, as such should be + * considered an FS op. We can't recurse back into the FS, so we + * allocate GFP_NOFS. + */ + s = kmalloc(sizeof(*s), GFP_NOFS); + if (!s) + return -ENOMEM; + s->info = info; + s->tm = info->tm; + s->top = -1; + + r = push_frame(s, root, 0); + if (r) + goto out; + + while (unprocessed_frames(s)) { + uint32_t flags; + struct frame *f; + dm_block_t b; + + r = top_frame(s, &f); + if (r) + goto out; + + if (f->current_child >= f->nr_children) { + pop_frame(s); + continue; + } + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level); + if (r) + goto out; + + } else if (is_internal_level(info, f)) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level + 1); + if (r) + goto out; + + } else { + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, 0), f->nr_children); + pop_frame(s); + } + } +out: + if (r) { + /* cleanup all frames of del_stack */ + unlock_all_frames(s); + } + kfree(s); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_del); + +/*----------------------------------------------------------------*/ + +static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, + int (*search_fn)(struct btree_node *, uint64_t), + uint64_t *result_key, void *v, size_t value_size) +{ + int i, r; + uint32_t flags, nr_entries; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + i = search_fn(ro_node(s), key); + + flags = le32_to_cpu(ro_node(s)->header.flags); + nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries); + if (i < 0 || i >= nr_entries) + return -ENODATA; + + if (flags & INTERNAL_NODE) + block = value64(ro_node(s), i); + + } while (!(flags & LEAF_NODE)); + + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + if (v) + memcpy(v, value_ptr(ro_node(s), i), value_size); + + return 0; +} + +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le) +{ + unsigned int level, last_level = info->levels - 1; + int r = -ENODATA; + uint64_t rkey; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + size_t size; + void *value_p; + + if (level == last_level) { + value_p = value_le; + size = info->value_type.size; + + } else { + value_p = &internal_value_le; + size = sizeof(uint64_t); + } + + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, &rkey, + value_p, size); + + if (!r) { + if (rkey != keys[level]) { + exit_ro_spine(&spine); + return -ENODATA; + } + } else { + exit_ro_spine(&spine); + return r; + } + + root = le64_to_cpu(internal_value_le); + } + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_lookup); + +static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root, + uint64_t key, uint64_t *rkey, void *value_le) +{ + int r, i; + uint32_t flags, nr_entries; + struct dm_block *node; + struct btree_node *n; + + r = bn_read_lock(info, root, &node); + if (r) + return r; + + n = dm_block_data(node); + flags = le32_to_cpu(n->header.flags); + nr_entries = le32_to_cpu(n->header.nr_entries); + + if (flags & INTERNAL_NODE) { + i = lower_bound(n, key); + if (i < 0) { + /* + * avoid early -ENODATA return when all entries are + * higher than the search @key. + */ + i = 0; + } + if (i >= nr_entries) { + r = -ENODATA; + goto out; + } + + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + if (r == -ENODATA && i < (nr_entries - 1)) { + i++; + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + } + + } else { + i = upper_bound(n, key); + if (i < 0 || i >= nr_entries) { + r = -ENODATA; + goto out; + } + + *rkey = le64_to_cpu(n->keys[i]); + memcpy(value_le, value_ptr(n, i), info->value_type.size); + } +out: + dm_tm_unlock(info->tm, node); + return r; +} + +int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le) +{ + unsigned int level; + int r = -ENODATA; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels - 1u; level++) { + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, rkey, + &internal_value_le, sizeof(uint64_t)); + if (r) + goto out; + + if (*rkey != keys[level]) { + r = -ENODATA; + goto out; + } + + root = le64_to_cpu(internal_value_le); + } + + r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le); +out: + exit_ro_spine(&spine); + return r; +} + +EXPORT_SYMBOL_GPL(dm_btree_lookup_next); + +/*----------------------------------------------------------------*/ + +/* + * Copies entries from one region of a btree node to another. The regions + * must not overlap. + */ +static void copy_entries(struct btree_node *dest, unsigned int dest_offset, + struct btree_node *src, unsigned int src_offset, + unsigned int count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Moves entries from one region fo a btree node to another. The regions + * may overlap. + */ +static void move_entries(struct btree_node *dest, unsigned int dest_offset, + struct btree_node *src, unsigned int src_offset, + unsigned int count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Erases the first 'count' entries of a btree node, shifting following + * entries down into their place. + */ +static void shift_down(struct btree_node *n, unsigned int count) +{ + move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count); +} + +/* + * Moves entries in a btree node up 'count' places, making space for + * new entries at the start of the node. + */ +static void shift_up(struct btree_node *n, unsigned int count) +{ + move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries)); +} + +/* + * Redistributes entries between two btree nodes to make them + * have similar numbers of entries. + */ +static void redistribute2(struct btree_node *left, struct btree_node *right) +{ + unsigned int nr_left = le32_to_cpu(left->header.nr_entries); + unsigned int nr_right = le32_to_cpu(right->header.nr_entries); + unsigned int total = nr_left + nr_right; + unsigned int target_left = total / 2; + unsigned int target_right = total - target_left; + + if (nr_left < target_left) { + unsigned int delta = target_left - nr_left; + copy_entries(left, nr_left, right, 0, delta); + shift_down(right, delta); + } else if (nr_left > target_left) { + unsigned int delta = nr_left - target_left; + if (nr_right) + shift_up(right, delta); + copy_entries(right, 0, left, target_left, delta); + } + + left->header.nr_entries = cpu_to_le32(target_left); + right->header.nr_entries = cpu_to_le32(target_right); +} + +/* + * Redistribute entries between three nodes. Assumes the central + * node is empty. + */ +static void redistribute3(struct btree_node *left, struct btree_node *center, + struct btree_node *right) +{ + unsigned int nr_left = le32_to_cpu(left->header.nr_entries); + unsigned int nr_center = le32_to_cpu(center->header.nr_entries); + unsigned int nr_right = le32_to_cpu(right->header.nr_entries); + unsigned int total, target_left, target_center, target_right; + + BUG_ON(nr_center); + + total = nr_left + nr_right; + target_left = total / 3; + target_center = (total - target_left) / 2; + target_right = (total - target_left - target_center); + + if (nr_left < target_left) { + unsigned int left_short = target_left - nr_left; + copy_entries(left, nr_left, right, 0, left_short); + copy_entries(center, 0, right, left_short, target_center); + shift_down(right, nr_right - target_right); + + } else if (nr_left < (target_left + target_center)) { + unsigned int left_to_center = nr_left - target_left; + copy_entries(center, 0, left, target_left, left_to_center); + copy_entries(center, left_to_center, right, 0, target_center - left_to_center); + shift_down(right, nr_right - target_right); + + } else { + unsigned int right_short = target_right - nr_right; + shift_up(right, right_short); + copy_entries(right, 0, left, nr_left - right_short, right_short); + copy_entries(center, 0, left, target_left, nr_left - target_left); + } + + left->header.nr_entries = cpu_to_le32(target_left); + center->header.nr_entries = cpu_to_le32(target_center); + right->header.nr_entries = cpu_to_le32(target_right); +} + +/* + * Splits a node by creating a sibling node and shifting half the nodes + * contents across. Assumes there is a parent node, and it has room for + * another child. + * + * Before: + * +--------+ + * | Parent | + * +--------+ + * | + * v + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +--------+ + * | Parent | + * +--------+ + * | | + * v +------+ + * +---------+ | + * | A* +++ | v + * +---------+ +-------+ + * | B +++ | + * +-------+ + * + * Where A* is a shadow of A. + */ +static int split_one_into_two(struct shadow_spine *s, unsigned int parent_index, + struct dm_btree_value_type *vt, uint64_t key) +{ + int r; + struct dm_block *left, *right, *parent; + struct btree_node *ln, *rn, *pn; + __le64 location; + + left = shadow_current(s); + + r = new_block(s->info, &right); + if (r < 0) + return r; + + ln = dm_block_data(left); + rn = dm_block_data(right); + + rn->header.flags = ln->header.flags; + rn->header.nr_entries = cpu_to_le32(0); + rn->header.max_entries = ln->header.max_entries; + rn->header.value_size = ln->header.value_size; + redistribute2(ln, rn); + + /* patch up the parent */ + parent = shadow_parent(s); + pn = dm_block_data(parent); + + location = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&location); + r = insert_at(sizeof(__le64), pn, parent_index + 1, + le64_to_cpu(rn->keys[0]), &location); + if (r) { + unlock_block(s->info, right); + return r; + } + + /* patch up the spine */ + if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, right); + s->nodes[1] = left; + } else { + unlock_block(s->info, left); + s->nodes[1] = right; + } + + return 0; +} + +/* + * We often need to modify a sibling node. This function shadows a particular + * child of the given parent node. Making sure to update the parent to point + * to the new shadow. + */ +static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned int index, + struct dm_block **result) +{ + int r, inc; + dm_block_t root; + struct btree_node *node; + + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + result, &inc); + if (r) + return r; + + node = dm_block_data(*result); + + if (inc) + inc_children(info->tm, node, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(*result)); + + return 0; +} + +/* + * Splits two nodes into three. This is more work, but results in fuller + * nodes, so saves metadata space. + */ +static int split_two_into_three(struct shadow_spine *s, unsigned int parent_index, + struct dm_btree_value_type *vt, uint64_t key) +{ + int r; + unsigned int middle_index; + struct dm_block *left, *middle, *right, *parent; + struct btree_node *ln, *rn, *mn, *pn; + __le64 location; + + parent = shadow_parent(s); + pn = dm_block_data(parent); + + if (parent_index == 0) { + middle_index = 1; + left = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index + 1, &right); + if (r) + return r; + } else { + middle_index = parent_index; + right = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index - 1, &left); + if (r) + return r; + } + + r = new_block(s->info, &middle); + if (r < 0) + return r; + + ln = dm_block_data(left); + mn = dm_block_data(middle); + rn = dm_block_data(right); + + mn->header.nr_entries = cpu_to_le32(0); + mn->header.flags = ln->header.flags; + mn->header.max_entries = ln->header.max_entries; + mn->header.value_size = ln->header.value_size; + + redistribute3(ln, mn, rn); + + /* patch up the parent */ + pn->keys[middle_index] = rn->keys[0]; + location = cpu_to_le64(dm_block_location(middle)); + __dm_bless_for_disk(&location); + r = insert_at(sizeof(__le64), pn, middle_index, + le64_to_cpu(mn->keys[0]), &location); + if (r) { + if (shadow_current(s) != left) + unlock_block(s->info, left); + + unlock_block(s->info, middle); + + if (shadow_current(s) != right) + unlock_block(s->info, right); + + return r; + } + + + /* patch up the spine */ + if (key < le64_to_cpu(mn->keys[0])) { + unlock_block(s->info, middle); + unlock_block(s->info, right); + s->nodes[1] = left; + } else if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, left); + unlock_block(s->info, right); + s->nodes[1] = middle; + } else { + unlock_block(s->info, left); + unlock_block(s->info, middle); + s->nodes[1] = right; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * Splits a node by creating two new children beneath the given node. + * + * Before: + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +------------+ + * | A (shadow) | + * +------------+ + * | | + * +------+ +----+ + * | | + * v v + * +-------+ +-------+ + * | B +++ | | C +++ | + * +-------+ +-------+ + */ +static int btree_split_beneath(struct shadow_spine *s, uint64_t key) +{ + int r; + size_t size; + unsigned int nr_left, nr_right; + struct dm_block *left, *right, *new_parent; + struct btree_node *pn, *ln, *rn; + __le64 val; + + new_parent = shadow_current(s); + + pn = dm_block_data(new_parent); + size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? + sizeof(__le64) : s->info->value_type.size; + + /* create & init the left block */ + r = new_block(s->info, &left); + if (r < 0) + return r; + + ln = dm_block_data(left); + nr_left = le32_to_cpu(pn->header.nr_entries) / 2; + + ln->header.flags = pn->header.flags; + ln->header.nr_entries = cpu_to_le32(nr_left); + ln->header.max_entries = pn->header.max_entries; + ln->header.value_size = pn->header.value_size; + memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + + /* create & init the right block */ + r = new_block(s->info, &right); + if (r < 0) { + unlock_block(s->info, left); + return r; + } + + rn = dm_block_data(right); + nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; + + rn->header.flags = pn->header.flags; + rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.max_entries = pn->header.max_entries; + rn->header.value_size = pn->header.value_size; + memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); + memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), + nr_right * size); + + /* new_parent should just point to l and r now */ + pn->header.flags = cpu_to_le32(INTERNAL_NODE); + pn->header.nr_entries = cpu_to_le32(2); + pn->header.max_entries = cpu_to_le32( + calc_max_entries(sizeof(__le64), + dm_bm_block_size( + dm_tm_get_bm(s->info->tm)))); + pn->header.value_size = cpu_to_le32(sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(left)); + __dm_bless_for_disk(&val); + pn->keys[0] = ln->keys[0]; + memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&val); + pn->keys[1] = rn->keys[0]; + memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); + + unlock_block(s->info, left); + unlock_block(s->info, right); + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * Redistributes a node's entries with its left sibling. + */ +static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned int parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index - 1, &sib); + if (r) + return r; + + left = dm_block_data(sib); + right = dm_block_data(shadow_current(s)); + redistribute2(left, right); + *key_ptr(parent, parent_index) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } else { + unlock_block(s->info, sib); + } + + return 0; +} + +/* + * Redistributes a nodes entries with its right sibling. + */ +static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned int parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index + 1, &sib); + if (r) + return r; + + left = dm_block_data(shadow_current(s)); + right = dm_block_data(sib); + redistribute2(left, right); + *key_ptr(parent, parent_index + 1) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, sib); + } else { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } + + return 0; +} + +/* + * Returns the number of spare entries in a node. + */ +static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned int *space) +{ + int r; + unsigned int nr_entries; + struct dm_block *block; + struct btree_node *node; + + r = bn_read_lock(info, b, &block); + if (r) + return r; + + node = dm_block_data(block); + nr_entries = le32_to_cpu(node->header.nr_entries); + *space = le32_to_cpu(node->header.max_entries) - nr_entries; + + unlock_block(info, block); + return 0; +} + +/* + * Make space in a node, either by moving some entries to a sibling, + * or creating a new sibling node. SPACE_THRESHOLD defines the minimum + * number of free entries that must be in the sibling to make the move + * worth while. If the siblings are shared (eg, part of a snapshot), + * then they are not touched, since this break sharing and so consume + * more space than we save. + */ +#define SPACE_THRESHOLD 8 +static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned int parent_index, uint64_t key) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_parent(s)); + unsigned int nr_parent = le32_to_cpu(parent->header.nr_entries); + unsigned int free_space; + int left_shared = 0, right_shared = 0; + + /* Should we move entries to the left sibling? */ + if (parent_index > 0) { + dm_block_t left_b = value64(parent, parent_index - 1); + r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared); + if (r) + return r; + + if (!left_shared) { + r = get_node_free_space(s->info, left_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_left(s, vt, parent_index, key); + } + } + + /* Should we move entries to the right sibling? */ + if (parent_index < (nr_parent - 1)) { + dm_block_t right_b = value64(parent, parent_index + 1); + r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared); + if (r) + return r; + + if (!right_shared) { + r = get_node_free_space(s->info, right_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_right(s, vt, parent_index, key); + } + } + + /* + * We need to split the node, normally we split two nodes + * into three. But when inserting a sequence that is either + * monotonically increasing or decreasing it's better to split + * a single node into two. + */ + if (left_shared || right_shared || (nr_parent <= 2) || + (parent_index == 0) || (parent_index + 1 == nr_parent)) { + return split_one_into_two(s, parent_index, vt, key); + } else { + return split_two_into_three(s, parent_index, vt, key); + } +} + +/* + * Does the node contain a particular key? + */ +static bool contains_key(struct btree_node *node, uint64_t key) +{ + int i = lower_bound(node, key); + + if (i >= 0 && le64_to_cpu(node->keys[i]) == key) + return true; + + return false; +} + +/* + * In general we preemptively make sure there's a free entry in every + * node on the spine when doing an insert. But we can avoid that with + * leaf nodes if we know it's an overwrite. + */ +static bool has_space_for_insert(struct btree_node *node, uint64_t key) +{ + if (node->header.nr_entries == node->header.max_entries) { + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + /* we don't need space if it's an overwrite */ + return contains_key(node, key); + } + + return false; + } + + return true; +} + +static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, + struct dm_btree_value_type *vt, + uint64_t key, unsigned int *index) +{ + int r, i = *index, top = 1; + struct btree_node *node; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */ + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + + if (!has_space_for_insert(node, key)) { + if (top) + r = btree_split_beneath(s, key); + else + r = rebalance_or_split(s, vt, i, key); + + if (r < 0) + return r; + + /* making space can cause the current node to change */ + node = dm_block_data(shadow_current(s)); + } + + i = lower_bound(node, key); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) + break; + + if (i < 0) { + /* change the bounds on the lowest key */ + node->keys[0] = cpu_to_le64(key); + i = 0; + } + + root = value64(node, i); + top = 0; + } + + if (i < 0 || le64_to_cpu(node->keys[i]) != key) + i++; + + *index = i; + return 0; +} + +static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root, + uint64_t key, int *index) +{ + int r, i = -1; + struct btree_node *node; + + *index = 0; + for (;;) { + r = shadow_step(s, root, &s->info->value_type); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + i = lower_bound(node, key); + + BUG_ON(i < 0); + BUG_ON(i >= le32_to_cpu(node->header.nr_entries)); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + if (key != le64_to_cpu(node->keys[i])) + return -EINVAL; + break; + } + + root = value64(node, i); + } + + *index = i; + return 0; +} + +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf) +{ + int r; + struct shadow_spine spine; + + BUG_ON(info->levels > 1); + init_shadow_spine(&spine, info); + r = __btree_get_overwrite_leaf(&spine, root, key, index); + if (!r) { + *new_root = shadow_root(&spine); + *leaf = shadow_current(&spine); + + /* + * Decrement the count so exit_shadow_spine() doesn't + * unlock the leaf. + */ + spine.count--; + } + exit_shadow_spine(&spine); + + return r; +} + +static bool need_insert(struct btree_node *node, uint64_t *keys, + unsigned int level, unsigned int index) +{ + return ((index >= le32_to_cpu(node->header.nr_entries)) || + (le64_to_cpu(node->keys[index]) != keys[level])); +} + +static int insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + int r; + unsigned int level, index = -1, last_level = info->levels - 1; + dm_block_t block = root; + struct shadow_spine spine; + struct btree_node *n; + struct dm_btree_value_type le64_type; + + init_le64_type(info->tm, &le64_type); + init_shadow_spine(&spine, info); + + for (level = 0; level < (info->levels - 1); level++) { + r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + + if (need_insert(n, keys, level, index)) { + dm_block_t new_tree; + __le64 new_le; + + r = dm_btree_empty(info, &new_tree); + if (r < 0) + goto bad; + + new_le = cpu_to_le64(new_tree); + __dm_bless_for_disk(&new_le); + + r = insert_at(sizeof(uint64_t), n, index, + keys[level], &new_le); + if (r) + goto bad; + } + + if (level < last_level) + block = value64(n, index); + } + + r = btree_insert_raw(&spine, block, &info->value_type, + keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + + if (need_insert(n, keys, level, index)) { + if (inserted) + *inserted = 1; + + r = insert_at(info->value_type.size, n, index, + keys[level], value); + if (r) + goto bad_unblessed; + } else { + if (inserted) + *inserted = 0; + + if (info->value_type.dec && + (!info->value_type.equal || + !info->value_type.equal( + info->value_type.context, + value_ptr(n, index), + value))) { + info->value_type.dec(info->value_type.context, + value_ptr(n, index), 1); + } + memcpy_disk(value_ptr(n, index), + value, info->value_type.size); + } + + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return 0; + +bad: + __dm_unbless_for_disk(value); +bad_unblessed: + exit_shadow_spine(&spine); + return r; +} + +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, NULL); +} +EXPORT_SYMBOL_GPL(dm_btree_insert); + +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, inserted); +} +EXPORT_SYMBOL_GPL(dm_btree_insert_notify); + +/*----------------------------------------------------------------*/ + +static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, + uint64_t *result_key, dm_block_t *next_block) +{ + int i, r; + uint32_t flags; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + flags = le32_to_cpu(ro_node(s)->header.flags); + i = le32_to_cpu(ro_node(s)->header.nr_entries); + if (!i) + return -ENODATA; + else + i--; + + if (find_highest) + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + else + *result_key = le64_to_cpu(ro_node(s)->keys[0]); + + if (next_block || flags & INTERNAL_NODE) { + if (find_highest) + block = value64(ro_node(s), i); + else + block = value64(ro_node(s), 0); + } + + } while (flags & INTERNAL_NODE); + + if (next_block) + *next_block = block; + return 0; +} + +static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, + bool find_highest, uint64_t *result_keys) +{ + int r = 0, count = 0, level; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = find_key(&spine, root, find_highest, result_keys + level, + level == info->levels - 1 ? NULL : &root); + if (r == -ENODATA) { + r = 0; + break; + + } else if (r) + break; + + count++; + } + exit_ro_spine(&spine); + + return r ? r : count; +} + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, true, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); + +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, false, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); + +/*----------------------------------------------------------------*/ + +/* + * FIXME: We shouldn't use a recursive algorithm when we have limited stack + * space. Also this only works for single level trees. + */ +static int walk_node(struct dm_btree_info *info, dm_block_t block, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + unsigned int i, nr; + struct dm_block *node; + struct btree_node *n; + uint64_t keys; + + r = bn_read_lock(info, block, &node); + if (r) + return r; + + n = dm_block_data(node); + + nr = le32_to_cpu(n->header.nr_entries); + for (i = 0; i < nr; i++) { + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { + r = walk_node(info, value64(n, i), fn, context); + if (r) + goto out; + } else { + keys = le64_to_cpu(*key_ptr(n, i)); + r = fn(context, &keys, value_ptr(n, i)); + if (r) + goto out; + } + } + +out: + dm_tm_unlock(info->tm, node); + return r; +} + +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + BUG_ON(info->levels > 1); + return walk_node(info, root, fn, context); +} +EXPORT_SYMBOL_GPL(dm_btree_walk); + +/*----------------------------------------------------------------*/ + +static void prefetch_values(struct dm_btree_cursor *c) +{ + unsigned int i, nr; + __le64 value_le; + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm); + + BUG_ON(c->info->value_type.size != sizeof(value_le)); + + nr = le32_to_cpu(bn->header.nr_entries); + for (i = 0; i < nr; i++) { + memcpy(&value_le, value_ptr(bn, i), sizeof(value_le)); + dm_bm_prefetch(bm, le64_to_cpu(value_le)); + } +} + +static bool leaf_node(struct dm_btree_cursor *c) +{ + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + + return le32_to_cpu(bn->header.flags) & LEAF_NODE; +} + +static int push_node(struct dm_btree_cursor *c, dm_block_t b) +{ + int r; + struct cursor_node *n = c->nodes + c->depth; + + if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) { + DMERR("couldn't push cursor node, stack depth too high"); + return -EINVAL; + } + + r = bn_read_lock(c->info, b, &n->b); + if (r) + return r; + + n->index = 0; + c->depth++; + + if (c->prefetch_leaves || !leaf_node(c)) + prefetch_values(c); + + return 0; +} + +static void pop_node(struct dm_btree_cursor *c) +{ + c->depth--; + unlock_block(c->info, c->nodes[c->depth].b); +} + +static int inc_or_backtrack(struct dm_btree_cursor *c) +{ + struct cursor_node *n; + struct btree_node *bn; + + for (;;) { + if (!c->depth) + return -ENODATA; + + n = c->nodes + c->depth - 1; + bn = dm_block_data(n->b); + + n->index++; + if (n->index < le32_to_cpu(bn->header.nr_entries)) + break; + + pop_node(c); + } + + return 0; +} + +static int find_leaf(struct dm_btree_cursor *c) +{ + int r = 0; + struct cursor_node *n; + struct btree_node *bn; + __le64 value_le; + + for (;;) { + n = c->nodes + c->depth - 1; + bn = dm_block_data(n->b); + + if (le32_to_cpu(bn->header.flags) & LEAF_NODE) + break; + + memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le)); + r = push_node(c, le64_to_cpu(value_le)); + if (r) { + DMERR("push_node failed"); + break; + } + } + + if (!r && (le32_to_cpu(bn->header.nr_entries) == 0)) + return -ENODATA; + + return r; +} + +int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, + bool prefetch_leaves, struct dm_btree_cursor *c) +{ + int r; + + c->info = info; + c->root = root; + c->depth = 0; + c->prefetch_leaves = prefetch_leaves; + + r = push_node(c, root); + if (r) + return r; + + return find_leaf(c); +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_begin); + +void dm_btree_cursor_end(struct dm_btree_cursor *c) +{ + while (c->depth) + pop_node(c); +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_end); + +int dm_btree_cursor_next(struct dm_btree_cursor *c) +{ + int r = inc_or_backtrack(c); + if (!r) { + r = find_leaf(c); + if (r) + DMERR("find_leaf failed"); + } + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_next); + +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count) +{ + int r = 0; + + while (count-- && !r) + r = dm_btree_cursor_next(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_skip); + +int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) +{ + if (c->depth) { + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + + if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE) + return -EINVAL; + + *key = le64_to_cpu(*key_ptr(bn, n->index)); + memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size); + return 0; + + } else + return -ENODATA; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h new file mode 100644 index 000000000..5566e7c32 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.h @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BTREE_H +#define _LINUX_DM_BTREE_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; + +/*----------------------------------------------------------------*/ + +/* + * Annotations used to check on-disk metadata is handled as little-endian. + */ +#ifdef __CHECKER__ +# define __dm_written_to_disk(x) __releases(x) +# define __dm_reads_from_disk(x) __acquires(x) +# define __dm_bless_for_disk(x) __acquire(x) +# define __dm_unbless_for_disk(x) __release(x) +#else +# define __dm_written_to_disk(x) +# define __dm_reads_from_disk(x) +# define __dm_bless_for_disk(x) +# define __dm_unbless_for_disk(x) +#endif + +/*----------------------------------------------------------------*/ + +/* + * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized + * values. + */ + +/* + * Information about the values stored within the btree. + */ +struct dm_btree_value_type { + void *context; + + /* + * The size in bytes of each value. + */ + uint32_t size; + + /* + * Any of these methods can be safely set to NULL if you do not + * need the corresponding feature. + */ + + /* + * The btree is making a duplicate of a run of values, for instance + * because previously-shared btree nodes have now diverged. + * @value argument is the new copy that the copy function may modify. + * (Probably it just wants to increment a reference count + * somewhere.) This method is _not_ called for insertion of a new + * value: It is assumed the ref count is already 1. + */ + void (*inc)(void *context, const void *value, unsigned int count); + + /* + * These values are being deleted. The btree takes care of freeing + * the memory pointed to by @value. Often the del function just + * needs to decrement a reference counts somewhere. + */ + void (*dec)(void *context, const void *value, unsigned int count); + + /* + * A test for equality between two values. When a value is + * overwritten with a new one, the old one has the dec method + * called _unless_ the new and old value are deemed equal. + */ + int (*equal)(void *context, const void *value1, const void *value2); +}; + +/* + * The shape and contents of a btree. + */ +struct dm_btree_info { + struct dm_transaction_manager *tm; + + /* + * Number of nested btrees. (Not the depth of a single tree.) + */ + unsigned int levels; + struct dm_btree_value_type value_type; +}; + +/* + * Set up an empty tree. O(1). + */ +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); + +/* + * Delete a tree. O(n) - this is the slow one! It can also block, so + * please don't call it on an IO path. + */ +int dm_btree_del(struct dm_btree_info *info, dm_block_t root); + +/* + * All the lookup functions return -ENODATA if the key cannot be found. + */ + +/* + * Tries to find a key that matches exactly. O(ln(n)) + */ +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le); + +/* + * Tries to find the first key where the bottom level key is >= to that + * given. Useful for skipping empty sections of the btree. + */ +int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le); + +/* + * Insertion (or overwrite an existing value). O(ln(n)) + */ +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * A variant of insert that indicates whether it actually inserted or just + * overwrote. Useful if you're keeping track of the number of entries in a + * tree. + */ +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value); + +/* + * Remove a key if present. This doesn't remove empty sub trees. Normally + * subtrees represent a separate entity, like a snapshot map, so this is + * correct behaviour. O(ln(n)). + */ +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root); + +/* + * Removes a _contiguous_ run of values starting from 'keys' and not + * reaching keys2 (where keys2 is keys with the final key replaced with + * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be + * altered. + */ +int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t end_key, + dm_block_t *new_root, unsigned int *nr_removed); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have + * no lowest key. + */ +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have + * no highest key. + */ +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Iterate through the a btree, calling fn() on each entry. + * It only works for single level trees and is internally recursive, so + * monitor stack usage carefully. + */ +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context); + + +/*----------------------------------------------------------------*/ + +/* + * Cursor API. This does not follow the rolling lock convention. Since we + * know the order that values are required we can issue prefetches to speed + * up iteration. Use on a single level btree only. + */ +#define DM_BTREE_CURSOR_MAX_DEPTH 16 + +struct cursor_node { + struct dm_block *b; + unsigned int index; +}; + +struct dm_btree_cursor { + struct dm_btree_info *info; + dm_block_t root; + + bool prefetch_leaves; + unsigned int depth; + struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH]; +}; + +/* + * Creates a fresh cursor. If prefetch_leaves is set then it is assumed + * the btree contains block indexes that will be prefetched. The cursor is + * quite large, so you probably don't want to put it on the stack. + */ +int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, + bool prefetch_leaves, struct dm_btree_cursor *c); +void dm_btree_cursor_end(struct dm_btree_cursor *c); +int dm_btree_cursor_next(struct dm_btree_cursor *c); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); +int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); + +#endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h new file mode 100644 index 000000000..b945a2be9 --- /dev/null +++ b/drivers/md/persistent-data/dm-persistent-data-internal.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _DM_PERSISTENT_DATA_INTERNAL_H +#define _DM_PERSISTENT_DATA_INTERNAL_H + +#include "dm-block-manager.h" + +static inline unsigned int dm_hash_block(dm_block_t b, unsigned int hash_mask) +{ + const unsigned int BIG_PRIME = 4294967291UL; + + return (((unsigned int) b) * BIG_PRIME) & hash_mask; +} + +#endif /* _PERSISTENT_DATA_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c new file mode 100644 index 000000000..af800efed --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -0,0 +1,1259 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-common.h" +#include "dm-transaction-manager.h" +#include "dm-btree-internal.h" +#include "dm-persistent-data-internal.h" + +#include +#include + +#define DM_MSG_PREFIX "space map common" + +/*----------------------------------------------------------------*/ + +/* + * Index validator. + */ +#define INDEX_CSUM_XOR 160478 + +static void index_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + + mi_le->blocknr = cpu_to_le64(dm_block_location(b)); + mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); +} + +static int index_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { + DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); + if (csum_disk != mi_le->csum) { + DMERR_LIMIT("index_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator index_validator = { + .name = "index", + .prepare_for_write = index_prepare_for_write, + .check = index_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Bitmap validator + */ +#define BITMAP_CSUM_XOR 240779 + +static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + + disk_header->blocknr = cpu_to_le64(dm_block_location(b)); + disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); +} + +static int dm_bitmap_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { + DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); + if (csum_disk != disk_header->csum) { + DMERR_LIMIT("bitmap check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator dm_sm_bitmap_validator = { + .name = "sm_bitmap", + .prepare_for_write = dm_bitmap_prepare_for_write, + .check = dm_bitmap_check, +}; + +/*----------------------------------------------------------------*/ + +#define ENTRIES_PER_WORD 32 +#define ENTRIES_SHIFT 5 + +static void *dm_bitmap_data(struct dm_block *b) +{ + return dm_block_data(b) + sizeof(struct disk_bitmap_header); +} + +#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL + +static unsigned int dm_bitmap_word_used(void *addr, unsigned int b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + uint64_t bits = le64_to_cpu(*w_le); + uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH; + + return !(~bits & mask); +} + +static unsigned int sm_lookup_bitmap(void *addr, unsigned int b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + unsigned int hi, lo; + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + hi = !!test_bit_le(b, (void *) w_le); + lo = !!test_bit_le(b + 1, (void *) w_le); + return (hi << 1) | lo; +} + +static void sm_set_bitmap(void *addr, unsigned int b, unsigned int val) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + + if (val & 2) + __set_bit_le(b, (void *) w_le); + else + __clear_bit_le(b, (void *) w_le); + + if (val & 1) + __set_bit_le(b + 1, (void *) w_le); + else + __clear_bit_le(b + 1, (void *) w_le); +} + +static int sm_find_free(void *addr, unsigned int begin, unsigned int end, + unsigned int *result) +{ + while (begin < end) { + if (!(begin & (ENTRIES_PER_WORD - 1)) && + dm_bitmap_word_used(addr, begin)) { + begin += ENTRIES_PER_WORD; + continue; + } + + if (!sm_lookup_bitmap(addr, begin)) { + *result = begin; + return 0; + } + + begin++; + } + + return -ENOSPC; +} + +/*----------------------------------------------------------------*/ + +static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + memset(ll, 0, sizeof(struct ll_disk)); + + ll->tm = tm; + + ll->bitmap_info.tm = tm; + ll->bitmap_info.levels = 1; + + /* + * Because the new bitmap blocks are created via a shadow + * operation, the old entry has already had its reference count + * decremented and we don't need the btree to do any bookkeeping. + */ + ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry); + ll->bitmap_info.value_type.inc = NULL; + ll->bitmap_info.value_type.dec = NULL; + ll->bitmap_info.value_type.equal = NULL; + + ll->ref_count_info.tm = tm; + ll->ref_count_info.levels = 1; + ll->ref_count_info.value_type.size = sizeof(uint32_t); + ll->ref_count_info.value_type.inc = NULL; + ll->ref_count_info.value_type.dec = NULL; + ll->ref_count_info.value_type.equal = NULL; + + ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); + + if (ll->block_size > (1 << 30)) { + DMERR("block size too big to hold bitmaps"); + return -EINVAL; + } + + ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * + ENTRIES_PER_BYTE; + ll->nr_blocks = 0; + ll->bitmap_root = 0; + ll->ref_count_root = 0; + ll->bitmap_index_changed = false; + + return 0; +} + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) +{ + int r; + dm_block_t i, nr_blocks, nr_indexes; + unsigned int old_blocks, blocks; + + nr_blocks = ll->nr_blocks + extra_blocks; + old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block); + blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); + + nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block); + if (nr_indexes > ll->max_entries(ll)) { + DMERR("space map too large"); + return -EINVAL; + } + + /* + * We need to set this before the dm_tm_new_block() call below. + */ + ll->nr_blocks = nr_blocks; + for (i = old_blocks; i < blocks; i++) { + struct dm_block *b; + struct disk_index_entry idx; + + r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); + if (r < 0) + return r; + + idx.blocknr = cpu_to_le64(dm_block_location(b)); + + dm_tm_unlock(ll->tm, b); + + idx.nr_free = cpu_to_le32(ll->entries_per_block); + idx.none_free_before = 0; + + r = ll->save_ie(ll, i, &idx); + if (r < 0) + return r; + } + + return 0; +} + +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r; + dm_block_t index = b; + struct disk_index_entry ie_disk; + struct dm_block *blk; + + if (b >= ll->nr_blocks) { + DMERR_LIMIT("metadata block out of bounds"); + return -EINVAL; + } + + b = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); + + dm_tm_unlock(ll->tm, blk); + + return 0; +} + +static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, + uint32_t *result) +{ + __le32 le_rc; + int r; + + r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); + if (r < 0) + return r; + + *result = le32_to_cpu(le_rc); + + return r; +} + +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + return sm_ll_lookup_big_ref_count(ll, b, result); +} + +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result) +{ + int r; + struct disk_index_entry ie_disk; + dm_block_t i, index_begin = begin; + dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); + + /* + * FIXME: Use shifts + */ + begin = do_div(index_begin, ll->entries_per_block); + end = do_div(end, ll->entries_per_block); + if (end == 0) + end = ll->entries_per_block; + + for (i = index_begin; i < index_end; i++, begin = 0) { + struct dm_block *blk; + unsigned int position; + uint32_t bit_end; + + r = ll->load_ie(ll, i, &ie_disk); + if (r < 0) + return r; + + if (le32_to_cpu(ie_disk.nr_free) == 0) + continue; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + bit_end = (i == index_end - 1) ? end : ll->entries_per_block; + + r = sm_find_free(dm_bitmap_data(blk), + max_t(unsigned int, begin, le32_to_cpu(ie_disk.none_free_before)), + bit_end, &position); + if (r == -ENOSPC) { + /* + * This might happen because we started searching + * part way through the bitmap. + */ + dm_tm_unlock(ll->tm, blk); + continue; + } + + dm_tm_unlock(ll->tm, blk); + + *result = i * ll->entries_per_block + (dm_block_t) position; + return 0; + } + + return -ENOSPC; +} + +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *b) +{ + int r; + uint32_t count; + + do { + r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b); + if (r) + break; + + /* double check this block wasn't used in the old transaction */ + if (*b >= old_ll->nr_blocks) + count = 0; + else { + r = sm_ll_lookup(old_ll, *b, &count); + if (r) + break; + + if (count) + begin = *b + 1; + } + } while (count); + + return r; +} + +/*----------------------------------------------------------------*/ + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, int32_t *nr_allocations) +{ + int r; + uint32_t bit, old; + struct dm_block *nb; + dm_block_t index = b; + struct disk_index_entry ie_disk; + void *bm_le; + int inc; + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &nb, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); + bm_le = dm_bitmap_data(nb); + + old = sm_lookup_bitmap(bm_le, bit); + if (old > 2) { + r = sm_ll_lookup_big_ref_count(ll, b, &old); + if (r < 0) { + dm_tm_unlock(ll->tm, nb); + return r; + } + } + + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } + + if (ref_count <= 2) { + sm_set_bitmap(bm_le, bit, ref_count); + dm_tm_unlock(ll->tm, nb); + + if (old > 2) { + r = dm_btree_remove(&ll->ref_count_info, + ll->ref_count_root, + &b, &ll->ref_count_root); + if (r) + return r; + } + + } else { + __le32 le_rc = cpu_to_le32(ref_count); + + sm_set_bitmap(bm_le, bit, 3); + dm_tm_unlock(ll->tm, nb); + + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + } + + if (ref_count && !old) { + *nr_allocations = 1; + ll->nr_allocated++; + le32_add_cpu(&ie_disk.nr_free, -1); + if (le32_to_cpu(ie_disk.none_free_before) == bit) + ie_disk.none_free_before = cpu_to_le32(bit + 1); + + } else if (old && !ref_count) { + *nr_allocations = -1; + ll->nr_allocated--; + le32_add_cpu(&ie_disk.nr_free, 1); + ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); + } else + *nr_allocations = 0; + + return ll->save_ie(ll, index, &ie_disk); +} + +/*----------------------------------------------------------------*/ + +/* + * Holds useful intermediate results for the range based inc and dec + * operations. + */ +struct inc_context { + struct disk_index_entry ie_disk; + struct dm_block *bitmap_block; + void *bitmap; + + struct dm_block *overflow_leaf; +}; + +static inline void init_inc_context(struct inc_context *ic) +{ + ic->bitmap_block = NULL; + ic->bitmap = NULL; + ic->overflow_leaf = NULL; +} + +static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + if (ic->bitmap_block) + dm_tm_unlock(ll->tm, ic->bitmap_block); + if (ic->overflow_leaf) + dm_tm_unlock(ll->tm, ic->overflow_leaf); +} + +static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + exit_inc_context(ll, ic); + init_inc_context(ic); +} + +/* + * Confirms a btree node contains a particular key at an index. + */ +static bool contains_key(struct btree_node *n, uint64_t key, int index) +{ + return index >= 0 && + index < le32_to_cpu(n->header.nr_entries) && + le64_to_cpu(n->keys[index]) == key; +} + +static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) +{ + int r; + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * bitmap_block needs to be unlocked because getting the + * overflow_leaf may need to allocate, and thus use the space map. + */ + reset_inc_context(ll, ic); + + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); + return -EINVAL; + } + + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + + return 0; +} + +static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) +{ + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + + return 0; + } + } + + return __sm_ll_inc_overflow(ll, b, ic); +} + +static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + int r, inc; + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block)); + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + return 0; +} + +/* + * Once shadow_bitmap has been called, which always happens at the start of inc/dec, + * we can reopen the bitmap with a simple write lock, rather than re calling + * dm_tm_shadow_block(). + */ +static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + if (!ic->bitmap_block) { + int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block); + if (r) { + DMERR("unable to re-get write lock for bitmap"); + return r; + } + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + } + + return 0; +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + int32_t *nr_allocations, dm_block_t *new_b, + struct inc_context *ic) +{ + int r; + __le32 le_rc; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + /* inc bitmap, adjust nr_allocated */ + sm_set_bitmap(ic->bitmap, bit, 1); + (*nr_allocations)++; + ll->nr_allocated++; + le32_add_cpu(&ic->ie_disk.nr_free, -1); + if (le32_to_cpu(ic->ie_disk.none_free_before) == bit) + ic->ie_disk.none_free_before = cpu_to_le32(bit + 1); + break; + + case 1: + /* inc bitmap */ + sm_set_bitmap(ic->bitmap, bit, 2); + break; + + case 2: + /* inc bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 3); + reset_inc_context(ll, ic); + + le_rc = cpu_to_le32(3); + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + break; + + default: + /* + * inc within the overflow tree only. + */ + r = sm_ll_inc_overflow(ll, b, ic); + if (r < 0) + return r; + } + } + + *new_b = b; + return 0; +} + +/* + * Finds a bitmap that contains entries in the block range, and increments + * them. + */ +static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + struct inc_context ic; + uint32_t bit, bit_end; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic); + + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); +} + +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) +{ + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_inc(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic) +{ + reset_inc_context(ll, ic); + return dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, + &b, &ll->ref_count_root); +} + +static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) +{ + int r; + int index = -1; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + reset_inc_context(ll, ic); + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); + return -EINVAL; + } + + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc == 3) { + return __sm_ll_del_overflow(ll, b, ic); + } else { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } +} + +static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) +{ + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc > 3) { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } else { + return __sm_ll_del_overflow(ll, b, ic); + } + + } + } + + return __sm_ll_dec_overflow(ll, b, ic, old_rc); +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + struct inc_context *ic, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + DMERR("unable to decrement block"); + return -EINVAL; + + case 1: + /* dec bitmap */ + sm_set_bitmap(ic->bitmap, bit, 0); + (*nr_allocations)--; + ll->nr_allocated--; + le32_add_cpu(&ic->ie_disk.nr_free, 1); + ic->ie_disk.none_free_before = + cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit)); + break; + + case 2: + /* dec bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 1); + break; + + case 3: + r = sm_ll_dec_overflow(ll, b, ic, &old); + if (r < 0) + return r; + + if (old == 3) { + r = ensure_bitmap(ll, ic); + if (r) + return r; + + sm_set_bitmap(ic->bitmap, bit, 2); + } + break; + } + } + + *new_b = b; + return 0; +} + +static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t bit, bit_end; + struct inc_context ic; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_dec_bitmap(ll, b, bit, bit_end, &ic, nr_allocations, new_b); + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) +{ + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_dec(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +int sm_ll_commit(struct ll_disk *ll) +{ + int r = 0; + + if (ll->bitmap_index_changed) { + r = ll->commit(ll); + if (!r) + ll->bitmap_index_changed = false; + } + + return r; +} + +/*----------------------------------------------------------------*/ + +static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + memcpy(ie, ll->mi_le.index + index, sizeof(*ie)); + return 0; +} + +static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + ll->bitmap_index_changed = true; + memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); + return 0; +} + +static int metadata_ll_init_index(struct ll_disk *ll) +{ + int r; + struct dm_block *b; + + r = dm_tm_new_block(ll->tm, &index_validator, &b); + if (r < 0) + return r; + + ll->bitmap_root = dm_block_location(b); + + dm_tm_unlock(ll->tm, b); + + return 0; +} + +static int metadata_ll_open(struct ll_disk *ll) +{ + int r; + struct dm_block *block; + + r = dm_tm_read_lock(ll->tm, ll->bitmap_root, + &index_validator, &block); + if (r) + return r; + + memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); + dm_tm_unlock(ll->tm, block); + + return 0; +} + +static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) +{ + return MAX_METADATA_BITMAPS; +} + +static int metadata_ll_commit(struct ll_disk *ll) +{ + int r, inc; + struct dm_block *b; + + r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); + if (r) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + dm_tm_unlock(ll->tm, b); + + return 0; +} + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root smr; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + /* + * We don't know the alignment of the root_le buffer, so need to + * copy into a new structure. + */ + memcpy(&smr, root_le, sizeof(smr)); + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr.nr_blocks); + ll->nr_allocated = le64_to_cpu(smr.nr_allocated); + ll->bitmap_root = le64_to_cpu(smr.bitmap_root); + ll->ref_count_root = le64_to_cpu(smr.ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ + +static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec) +{ + iec->dirty = false; + __dm_bless_for_disk(iec->ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &iec->index, &iec->ie, &ll->bitmap_root); +} + +static inline unsigned int hash_index(dm_block_t index) +{ + return dm_hash_block(index, IE_CACHE_MASK); +} + +static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + int r; + unsigned int h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + if (iec->valid) { + if (iec->index == index) { + memcpy(ie, &iec->ie, sizeof(*ie)); + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + if (!r) { + iec->valid = true; + iec->dirty = false; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + } + + return r; +} + +static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + int r; + unsigned int h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + ll->bitmap_index_changed = true; + if (iec->valid) { + if (iec->index == index) { + memcpy(&iec->ie, ie, sizeof(*ie)); + iec->dirty = true; + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + iec->valid = true; + iec->dirty = true; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + return 0; +} + +static int disk_ll_init_index(struct ll_disk *ll) +{ + unsigned int i; + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + iec->valid = false; + iec->dirty = false; + } + return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); +} + +static int disk_ll_open(struct ll_disk *ll) +{ + return 0; +} + +static dm_block_t disk_ll_max_entries(struct ll_disk *ll) +{ + return -1ULL; +} + +static int disk_ll_commit(struct ll_disk *ll) +{ + int r = 0; + unsigned int i; + + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + if (iec->valid && iec->dirty) + r = ie_cache_writeback(ll, iec); + } + + return r; +} + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h new file mode 100644 index 000000000..706ceb85d --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_SPACE_MAP_COMMON_H +#define DM_SPACE_MAP_COMMON_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * Low level disk format + * + * Bitmap btree + * ------------ + * + * Each value stored in the btree is an index_entry. This points to a + * block that is used as a bitmap. Within the bitmap hold 2 bits per + * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and + * REF_COUNT = many. + * + * Refcount btree + * -------------- + * + * Any entry that has a ref count higher than 2 gets entered in the ref + * count tree. The leaf values for this tree is the 32-bit ref count. + */ + +struct disk_index_entry { + __le64 blocknr; + __le32 nr_free; + __le32 none_free_before; +} __attribute__ ((packed, aligned(8))); + + +#define MAX_METADATA_BITMAPS 255 +struct disk_metadata_index { + __le32 csum; + __le32 padding; + __le64 blocknr; + + struct disk_index_entry index[MAX_METADATA_BITMAPS]; +} __attribute__ ((packed, aligned(8))); + +struct ll_disk; + +typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result); +typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie); +typedef int (*init_index_fn)(struct ll_disk *ll); +typedef int (*open_index_fn)(struct ll_disk *ll); +typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); +typedef int (*commit_fn)(struct ll_disk *ll); + +/* + * A lot of time can be wasted reading and writing the same + * index entry. So we cache a few entries. + */ +#define IE_CACHE_SIZE 64 +#define IE_CACHE_MASK (IE_CACHE_SIZE - 1) + +struct ie_cache { + bool valid; + bool dirty; + dm_block_t index; + struct disk_index_entry ie; +}; + +struct ll_disk { + struct dm_transaction_manager *tm; + struct dm_btree_info bitmap_info; + struct dm_btree_info ref_count_info; + + uint32_t block_size; + uint32_t entries_per_block; + dm_block_t nr_blocks; + dm_block_t nr_allocated; + + /* + * bitmap_root may be a btree root or a simple index. + */ + dm_block_t bitmap_root; + + dm_block_t ref_count_root; + + struct disk_metadata_index mi_le; + load_ie_fn load_ie; + save_ie_fn save_ie; + init_index_fn init_index; + open_index_fn open_index; + max_index_entries_fn max_entries; + commit_fn commit; + bool bitmap_index_changed:1; + + struct ie_cache ie_cache[IE_CACHE_SIZE]; +}; + +struct disk_sm_root { + __le64 nr_blocks; + __le64 nr_allocated; + __le64 bitmap_root; + __le64 ref_count_root; +} __attribute__ ((packed, aligned(8))); + +#define ENTRIES_PER_BYTE 4 + +struct disk_bitmap_header { + __le32 csum; + __le32 not_used; + __le64 blocknr; +} __attribute__ ((packed, aligned(8))); + +/*----------------------------------------------------------------*/ + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result); +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *result); + +/* + * The next three functions return (via nr_allocations) the net number of + * allocations that were made. This number may be negative if there were + * more frees than allocs. + */ +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); +int sm_ll_commit(struct ll_disk *ll); + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +/*----------------------------------------------------------------*/ + +#endif /* DM_SPACE_MAP_COMMON_H */ diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c new file mode 100644 index 000000000..d0a8d5e73 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-common.h" +#include "dm-space-map-disk.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include +#include +#include +#include + +#define DM_MSG_PREFIX "space map disk" + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. + */ +struct sm_disk { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + dm_block_t nr_allocated_this_transaction; +}; + +static void sm_disk_destroy(struct dm_space_map *sm) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + kfree(smd); +} + +static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + return sm_ll_extend(&smd->ll, extra_blocks); +} + +static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = smd->old_ll.nr_blocks; + + return 0; +} + +static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction; + + return 0; +} + +static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + return sm_ll_lookup(&smd->ll, b, result); +} + +static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, + int *result) +{ + int r; + uint32_t count; + + r = sm_disk_get_count(sm, b, &count); + if (r) + return r; + + *result = count > 1; + + return 0; +} + +static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r; + int32_t nr_allocations; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_insert(&smd->ll, b, count, &nr_allocations); + if (!r) { + smd->nr_allocated_this_transaction += nr_allocations; + } + + return r; +} + +static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r; + int32_t nr_allocations; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_inc(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; + + return r; +} + +static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r; + int32_t nr_allocations; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_dec(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; + + return r; +} + +static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + int r; + int32_t nr_allocations; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + + if (r) + return r; + + smd->begin = *b + 1; + r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations); + if (!r) { + smd->nr_allocated_this_transaction += nr_allocations; + } + + return r; +} + +static int sm_disk_commit(struct dm_space_map *sm) +{ + int r; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_commit(&smd->ll); + if (r) + return r; + + memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); + smd->nr_allocated_this_transaction = 0; + + return 0; +} + +static int sm_disk_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +/*----------------------------------------------------------------*/ + +static struct dm_space_map ops = { + .destroy = sm_disk_destroy, + .extend = sm_disk_extend, + .get_nr_blocks = sm_disk_get_nr_blocks, + .get_nr_free = sm_disk_get_nr_free, + .get_count = sm_disk_get_count, + .count_is_more_than_one = sm_disk_count_is_more_than_one, + .set_count = sm_disk_set_count, + .inc_blocks = sm_disk_inc_blocks, + .dec_blocks = sm_disk_dec_blocks, + .new_block = sm_disk_new_block, + .commit = sm_disk_commit, + .root_size = sm_disk_root_size, + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL +}; + +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_new_disk(&smd->ll, tm); + if (r) + goto bad; + + r = sm_ll_extend(&smd->ll, nr_blocks); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); + if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_sm_disk_create); + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_open_disk(&smd->ll, tm, root_le, len); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); + if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_sm_disk_open); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h new file mode 100644 index 000000000..447a0a9a2 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_DISK_H +#define _LINUX_DM_SPACE_MAP_DISK_H + +#include "dm-block-manager.h" + +struct dm_space_map; +struct dm_transaction_manager; + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks); + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root, size_t len); + +#endif /* _LINUX_DM_SPACE_MAP_DISK_H */ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c new file mode 100644 index 000000000..0d1fcdf29 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -0,0 +1,842 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map.h" +#include "dm-space-map-common.h" +#include "dm-space-map-metadata.h" + +#include +#include +#include +#include + +#define DM_MSG_PREFIX "space map metadata" + +/*----------------------------------------------------------------*/ + +/* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. + * + * The low level disk format is written using the standard btree and + * transaction manager. This means that performing disk operations may + * cause us to recurse into the space map in order to allocate new blocks. + * For this reason we have a pool of pre-allocated blocks large enough to + * service any metadata_ll_disk operation. + */ + +/* + * FIXME: we should calculate this based on the size of the device. + * Only the metadata space map needs this functionality. + */ +#define MAX_RECURSIVE_ALLOCATIONS 1024 + +enum block_op_type { + BOP_INC, + BOP_DEC +}; + +struct block_op { + enum block_op_type type; + dm_block_t b; + dm_block_t e; +}; + +struct bop_ring_buffer { + unsigned int begin; + unsigned int end; + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; +}; + +static void brb_init(struct bop_ring_buffer *brb) +{ + brb->begin = 0; + brb->end = 0; +} + +static bool brb_empty(struct bop_ring_buffer *brb) +{ + return brb->begin == brb->end; +} + +static unsigned int brb_next(struct bop_ring_buffer *brb, unsigned int old) +{ + unsigned int r = old + 1; + return r >= ARRAY_SIZE(brb->bops) ? 0 : r; +} + +static int brb_push(struct bop_ring_buffer *brb, + enum block_op_type type, dm_block_t b, dm_block_t e) +{ + struct block_op *bop; + unsigned int next = brb_next(brb, brb->end); + + /* + * We don't allow the last bop to be filled, this way we can + * differentiate between full and empty. + */ + if (next == brb->begin) + return -ENOMEM; + + bop = brb->bops + brb->end; + bop->type = type; + bop->b = b; + bop->e = e; + + brb->end = next; + + return 0; +} + +static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; + memcpy(result, bop, sizeof(*result)); + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb) +{ + if (brb_empty(brb)) + return -ENODATA; + + brb->begin = brb_next(brb, brb->begin); + + return 0; +} + +/*----------------------------------------------------------------*/ + +struct sm_metadata { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + + unsigned int recursion_count; + unsigned int allocated_this_transaction; + struct bop_ring_buffer uncommitted; + + struct threshold threshold; +}; + +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e) +{ + int r = brb_push(&smm->uncommitted, type, b, e); + if (r) { + DMERR("too many recursive allocations"); + return -ENOMEM; + } + + return 0; +} + +static int commit_bop(struct sm_metadata *smm, struct block_op *op) +{ + int r = 0; + int32_t nr_allocations; + + switch (op->type) { + case BOP_INC: + r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations); + break; + + case BOP_DEC: + r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations); + break; + } + + return r; +} + +static void in(struct sm_metadata *smm) +{ + smm->recursion_count++; +} + +static int apply_bops(struct sm_metadata *smm) +{ + int r = 0; + + while (!brb_empty(&smm->uncommitted)) { + struct block_op bop; + + r = brb_peek(&smm->uncommitted, &bop); + if (r) { + DMERR("bug in bop ring buffer"); + break; + } + + r = commit_bop(smm, &bop); + if (r) + break; + + brb_pop(&smm->uncommitted); + } + + return r; +} + +static int out(struct sm_metadata *smm) +{ + int r = 0; + + /* + * If we're not recursing then very bad things are happening. + */ + if (!smm->recursion_count) { + DMERR("lost track of recursion depth"); + return -ENOMEM; + } + + if (smm->recursion_count == 1) + r = apply_bops(smm); + + smm->recursion_count--; + + return r; +} + +/* + * When using the out() function above, we often want to combine an error + * code for the operation run in the recursive context with that from + * out(). + */ +static int combine_errors(int r1, int r2) +{ + return r1 ? r1 : r2; +} + +static int recursing(struct sm_metadata *smm) +{ + return smm->recursion_count; +} + +static void sm_metadata_destroy(struct dm_space_map *sm) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + kfree(smm); +} + +static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks; + + return 0; +} + +static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated - + smm->allocated_this_transaction; + + return 0; +} + +static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + int r; + unsigned int i; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + unsigned int adjustment = 0; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. + */ + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + struct block_op *op = smm->uncommitted.bops + i; + + if (b < op->b || b >= op->e) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + r = sm_ll_lookup(&smm->ll, b, result); + if (r) + return r; + + *result += adjustment; + + return 0; +} + +static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + int r, adjustment = 0; + unsigned int i; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + uint32_t rc; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. + */ + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + + struct block_op *op = smm->uncommitted.bops + i; + + if (b < op->b || b >= op->e) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + if (adjustment > 1) { + *result = 1; + return 0; + } + + r = sm_ll_lookup_bitmap(&smm->ll, b, &rc); + if (r) + return r; + + if (rc == 3) + /* + * We err on the side of caution, and always return true. + */ + *result = 1; + else + *result = rc + adjustment > 1; + + return 0; +} + +static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r, r2; + int32_t nr_allocations; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (smm->recursion_count) { + DMERR("cannot recurse set_count()"); + return -EINVAL; + } + + in(smm); + r = sm_ll_insert(&smm->ll, b, count, &nr_allocations); + r2 = out(smm); + + return combine_errors(r, r2); +} + +static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r, r2 = 0; + int32_t nr_allocations; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) { + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + } else { + in(smm); + r = sm_ll_inc(&smm->ll, b, e, &nr_allocations); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r, r2 = 0; + int32_t nr_allocations; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) + r = add_bop(smm, BOP_DEC, b, e); + else { + in(smm); + r = sm_ll_dec(&smm->ll, b, e, &nr_allocations); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) +{ + int r, r2 = 0; + int32_t nr_allocations; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + + if (r) + return r; + + smm->begin = *b + 1; + + if (recursing(smm)) + r = add_bop(smm, BOP_INC, *b, *b + 1); + else { + in(smm); + r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations); + r2 = out(smm); + } + + if (!r) + smm->allocated_this_transaction++; + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + int r = sm_metadata_new_block_(sm, b); + if (r) { + DMERR_LIMIT("unable to allocate new metadata block"); + return r; + } + + r = sm_metadata_get_nr_free(sm, &count); + if (r) { + DMERR_LIMIT("couldn't get free block count"); + return r; + } + + check_threshold(&smm->threshold, count); + + return r; +} + +static int sm_metadata_commit(struct dm_space_map *sm) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_commit(&smm->ll); + if (r) + return r; + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + smm->allocated_this_transaction = 0; + + return 0; +} + +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + +static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + +static const struct dm_space_map ops = { + .destroy = sm_metadata_destroy, + .extend = sm_metadata_extend, + .get_nr_blocks = sm_metadata_get_nr_blocks, + .get_nr_free = sm_metadata_get_nr_free, + .get_count = sm_metadata_get_count, + .count_is_more_than_one = sm_metadata_count_is_more_than_one, + .set_count = sm_metadata_set_count, + .inc_blocks = sm_metadata_inc_blocks, + .dec_blocks = sm_metadata_dec_blocks, + .new_block = sm_metadata_new_block, + .commit = sm_metadata_commit, + .root_size = sm_metadata_root_size, + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback +}; + +/*----------------------------------------------------------------*/ + +/* + * When a new space map is created that manages its own space. We use + * this tiny bootstrap allocator. + */ +static void sm_bootstrap_destroy(struct dm_space_map *sm) +{ +} + +static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + DMERR("bootstrap doesn't support extend"); + + return -EINVAL; +} + +static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks; + + return 0; +} + +static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks - smm->begin; + + return 0; +} + +static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *result = (b < smm->begin) ? 1 : 0; + + return 0; +} + +static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + *result = 0; + + return 0; +} + +static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + DMERR("bootstrap doesn't support set_count"); + + return -EINVAL; +} + +static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + /* + * We know the entire device is unused. + */ + if (smm->begin == smm->ll.nr_blocks) + return -ENOSPC; + + *b = smm->begin++; + + return 0; +} + +static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + + return 0; +} + +static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = add_bop(smm, BOP_DEC, b, e); + if (r) + return r; + + return 0; +} + +static int sm_bootstrap_commit(struct dm_space_map *sm) +{ + return 0; +} + +static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) +{ + DMERR("bootstrap doesn't support root_size"); + + return -EINVAL; +} + +static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, + size_t max) +{ + DMERR("bootstrap doesn't support copy_root"); + + return -EINVAL; +} + +static const struct dm_space_map bootstrap_ops = { + .destroy = sm_bootstrap_destroy, + .extend = sm_bootstrap_extend, + .get_nr_blocks = sm_bootstrap_get_nr_blocks, + .get_nr_free = sm_bootstrap_get_nr_free, + .get_count = sm_bootstrap_get_count, + .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, + .set_count = sm_bootstrap_set_count, + .inc_blocks = sm_bootstrap_inc_blocks, + .dec_blocks = sm_bootstrap_dec_blocks, + .new_block = sm_bootstrap_new_block, + .commit = sm_bootstrap_commit, + .root_size = sm_bootstrap_root_size, + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL +}; + +/*----------------------------------------------------------------*/ + +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(sm, &bootstrap_ops, sizeof(*sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; + + /* + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. + */ + do { + r = add_bop(smm, BOP_INC, old_len, smm->begin); + if (r) + goto out; + + old_len = smm->begin; + + r = apply_bops(smm); + if (r) { + DMERR("%s: apply_bops failed", __func__); + goto out; + } + + r = sm_ll_commit(&smm->ll); + if (r) + goto out; + + } while (old_len != smm->begin); + +out: + /* + * Switch back to normal behaviour. + */ + memcpy(sm, &ops, sizeof(*sm)); + return r; +} + +/*----------------------------------------------------------------*/ + +struct dm_space_map *dm_sm_metadata_init(void) +{ + struct sm_metadata *smm; + + smm = kmalloc(sizeof(*smm), GFP_KERNEL); + if (!smm) + return ERR_PTR(-ENOMEM); + + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + + return &smm->sm; +} + +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + smm->begin = superblock + 1; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); + + memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + + r = sm_ll_new_metadata(&smm->ll, tm); + if (!r) { + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; + r = sm_ll_extend(&smm->ll, nr_blocks); + } + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + if (r) + return r; + + /* + * Now we need to update the newly created data structures with the + * allocated blocks that they were built from. + */ + r = add_bop(smm, BOP_INC, superblock, smm->begin); + if (r) + return r; + + r = apply_bops(smm); + if (r) { + DMERR("%s: apply_bops failed", __func__); + return r; + } + + return sm_metadata_commit(sm); +} + +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_open_metadata(&smm->ll, tm, root_le, len); + if (r) + return r; + + smm->begin = 0; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + return 0; +} diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h new file mode 100644 index 000000000..64df92397 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_SPACE_MAP_METADATA_H +#define DM_SPACE_MAP_METADATA_H + +#include "dm-transaction-manager.h" + +#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about ~16k metadata blocks. + */ +#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) +#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_metadata_init(void); + +/* + * Create a fresh space map. + */ +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock); + +/* + * Open from a previously-recorded root. + */ +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len); + +#endif /* DM_SPACE_MAP_METADATA_H */ diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h new file mode 100644 index 000000000..85aa0a397 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map.h @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_H +#define _LINUX_DM_SPACE_MAP_H + +#include "dm-block-manager.h" + +typedef void (*dm_sm_threshold_fn)(void *context); + +/* + * struct dm_space_map keeps a record of how many times each block in a device + * is referenced. It needs to be fixed on disk as part of the transaction. + */ +struct dm_space_map { + void (*destroy)(struct dm_space_map *sm); + + /* + * You must commit before allocating the newly added space. + */ + int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks); + + /* + * Extensions do not appear in this count until after commit has + * been called. + */ + int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count); + + /* + * Space maps must never allocate a block from the previous + * transaction, in case we need to rollback. This complicates the + * semantics of get_nr_free(), it should return the number of blocks + * that are available for allocation _now_. For instance you may + * have blocks with a zero reference count that will not be + * available for allocation until after the next commit. + */ + int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count); + + int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result); + int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b, + int *result); + int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count); + + int (*commit)(struct dm_space_map *sm); + + int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); + int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); + + /* + * new_block will increment the returned block. + */ + int (*new_block)(struct dm_space_map *sm, dm_block_t *b); + + /* + * The root contains all the information needed to fix the space map. + * Generally this info is small, so squirrel it away in a disk block + * along with other info. + */ + int (*root_size)(struct dm_space_map *sm, size_t *result); + int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. + */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); +}; + +/*----------------------------------------------------------------*/ + +static inline void dm_sm_destroy(struct dm_space_map *sm) +{ + if (sm) + sm->destroy(sm); +} + +static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + return sm->extend(sm, extra_blocks); +} + +static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_blocks(sm, count); +} + +static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_free(sm, count); +} + +static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + return sm->get_count(sm, b, result); +} + +static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + return sm->count_is_more_than_one(sm, b, result); +} + +static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + return sm->set_count(sm, b, count); +} + +static inline int dm_sm_commit(struct dm_space_map *sm) +{ + return sm->commit(sm); +} + +static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->inc_blocks(sm, b, e); +} + +static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + return dm_sm_inc_blocks(sm, b, b + 1); +} + +static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->dec_blocks(sm, b, e); +} + +static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + return dm_sm_dec_blocks(sm, b, b + 1); +} + +static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + return sm->new_block(sm, b); +} + +static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result) +{ + return sm->root_size(sm, result); +} + +static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) +{ + return sm->copy_root(sm, copy_to_here_le, len); +} + +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + +#endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c new file mode 100644 index 000000000..557a3ecfe --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -0,0 +1,519 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-transaction-manager.h" +#include "dm-space-map.h" +#include "dm-space-map-disk.h" +#include "dm-space-map-metadata.h" +#include "dm-persistent-data-internal.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "transaction manager" + +/*----------------------------------------------------------------*/ + +#define PREFETCH_SIZE 128 +#define PREFETCH_BITS 7 +#define PREFETCH_SENTINEL ((dm_block_t) -1ULL) + +struct prefetch_set { + struct mutex lock; + dm_block_t blocks[PREFETCH_SIZE]; +}; + +static unsigned int prefetch_hash(dm_block_t b) +{ + return hash_64(b, PREFETCH_BITS); +} + +static void prefetch_wipe(struct prefetch_set *p) +{ + unsigned int i; + for (i = 0; i < PREFETCH_SIZE; i++) + p->blocks[i] = PREFETCH_SENTINEL; +} + +static void prefetch_init(struct prefetch_set *p) +{ + mutex_init(&p->lock); + prefetch_wipe(p); +} + +static void prefetch_add(struct prefetch_set *p, dm_block_t b) +{ + unsigned int h = prefetch_hash(b); + + mutex_lock(&p->lock); + if (p->blocks[h] == PREFETCH_SENTINEL) + p->blocks[h] = b; + + mutex_unlock(&p->lock); +} + +static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) +{ + unsigned int i; + + mutex_lock(&p->lock); + + for (i = 0; i < PREFETCH_SIZE; i++) + if (p->blocks[i] != PREFETCH_SENTINEL) { + dm_bm_prefetch(bm, p->blocks[i]); + p->blocks[i] = PREFETCH_SENTINEL; + } + + mutex_unlock(&p->lock); +} + +/*----------------------------------------------------------------*/ + +struct shadow_info { + struct hlist_node hlist; + dm_block_t where; +}; + +/* + * It would be nice if we scaled with the size of transaction. + */ +#define DM_HASH_SIZE 256 +#define DM_HASH_MASK (DM_HASH_SIZE - 1) + +struct dm_transaction_manager { + int is_clone; + struct dm_transaction_manager *real; + + struct dm_block_manager *bm; + struct dm_space_map *sm; + + spinlock_t lock; + struct hlist_head buckets[DM_HASH_SIZE]; + + struct prefetch_set prefetches; +}; + +/*----------------------------------------------------------------*/ + +static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + int r = 0; + unsigned int bucket = dm_hash_block(b, DM_HASH_MASK); + struct shadow_info *si; + + spin_lock(&tm->lock); + hlist_for_each_entry(si, tm->buckets + bucket, hlist) + if (si->where == b) { + r = 1; + break; + } + spin_unlock(&tm->lock); + + return r; +} + +/* + * This can silently fail if there's no memory. We're ok with this since + * creating redundant shadows causes no harm. + */ +static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + unsigned int bucket; + struct shadow_info *si; + + si = kmalloc(sizeof(*si), GFP_NOIO); + if (si) { + si->where = b; + bucket = dm_hash_block(b, DM_HASH_MASK); + spin_lock(&tm->lock); + hlist_add_head(&si->hlist, tm->buckets + bucket); + spin_unlock(&tm->lock); + } +} + +static void wipe_shadow_table(struct dm_transaction_manager *tm) +{ + struct shadow_info *si; + struct hlist_node *tmp; + struct hlist_head *bucket; + int i; + + spin_lock(&tm->lock); + for (i = 0; i < DM_HASH_SIZE; i++) { + bucket = tm->buckets + i; + hlist_for_each_entry_safe(si, tmp, bucket, hlist) + kfree(si); + + INIT_HLIST_HEAD(bucket); + } + + spin_unlock(&tm->lock); +} + +/*----------------------------------------------------------------*/ + +static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, + struct dm_space_map *sm) +{ + int i; + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->is_clone = 0; + tm->real = NULL; + tm->bm = bm; + tm->sm = sm; + + spin_lock_init(&tm->lock); + for (i = 0; i < DM_HASH_SIZE; i++) + INIT_HLIST_HEAD(tm->buckets + i); + + prefetch_init(&tm->prefetches); + + return tm; +} + +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real) +{ + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm) { + tm->is_clone = 1; + tm->real = real; + } + + return tm; +} +EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); + +void dm_tm_destroy(struct dm_transaction_manager *tm) +{ + if (!tm) + return; + + if (!tm->is_clone) + wipe_shadow_table(tm); + + kfree(tm); +} +EXPORT_SYMBOL_GPL(dm_tm_destroy); + +int dm_tm_pre_commit(struct dm_transaction_manager *tm) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_commit(tm->sm); + if (r < 0) + return r; + + return dm_bm_flush(tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_pre_commit); + +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + wipe_shadow_table(tm); + dm_bm_unlock(root); + + return dm_bm_flush(tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_commit); + +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new_block; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_new_block(tm->sm, &new_block); + if (r < 0) + return r; + + r = dm_bm_write_lock_zero(tm->bm, new_block, v, result); + if (r < 0) { + dm_sm_dec_block(tm->sm, new_block); + return r; + } + + /* + * New blocks count as shadows in that they don't need to be + * shadowed again. + */ + insert_shadow(tm, new_block); + + return 0; +} + +static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new; + struct dm_block *orig_block; + + r = dm_sm_new_block(tm->sm, &new); + if (r < 0) + return r; + + r = dm_sm_dec_block(tm->sm, orig); + if (r < 0) + return r; + + r = dm_bm_read_lock(tm->bm, orig, v, &orig_block); + if (r < 0) + return r; + + /* + * It would be tempting to use dm_bm_unlock_move here, but some + * code, such as the space maps, keeps using the old data structures + * secure in the knowledge they won't be changed until the next + * transaction. Using unlock_move would force a synchronous read + * since the old block would no longer be in the cache. + */ + r = dm_bm_write_lock_zero(tm->bm, new, v, result); + if (r) { + dm_bm_unlock(orig_block); + return r; + } + + memcpy(dm_block_data(*result), dm_block_data(orig_block), + dm_bm_block_size(tm->bm)); + + dm_bm_unlock(orig_block); + return r; +} + +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, struct dm_block **result, + int *inc_children) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children); + if (r < 0) + return r; + + if (is_shadow(tm, orig) && !*inc_children) + return dm_bm_write_lock(tm->bm, orig, v, result); + + r = __shadow_block(tm, orig, v, result); + if (r < 0) + return r; + insert_shadow(tm, dm_block_location(*result)); + + return r; +} +EXPORT_SYMBOL_GPL(dm_tm_shadow_block); + +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **blk) +{ + if (tm->is_clone) { + int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + if (r == -EWOULDBLOCK) + prefetch_add(&tm->real->prefetches, b); + + return r; + } + + return dm_bm_read_lock(tm->bm, b, v, blk); +} +EXPORT_SYMBOL_GPL(dm_tm_read_lock); + +void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) +{ + dm_bm_unlock(b); +} +EXPORT_SYMBOL_GPL(dm_tm_unlock); + +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_inc); + +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_inc_range); + +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_dec_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_dec); + +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_dec_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_dec_range); + +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned int count, dm_tm_run_fn fn) +{ + uint64_t b, begin, end; + bool in_run = false; + unsigned int i; + + for (i = 0; i < count; i++, value_le++) { + b = le64_to_cpu(*value_le); + + if (in_run) { + if (b == end) + end++; + else { + fn(tm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(tm, begin, end); +} +EXPORT_SYMBOL_GPL(dm_tm_with_runs); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, + uint32_t *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_get_count(tm->sm, b, result); +} + +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_count_is_more_than_one(tm->sm, b, result); +} + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) +{ + return tm->bm; +} + +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm) +{ + prefetch_issue(&tm->prefetches, tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches); + +/*----------------------------------------------------------------*/ + +static int dm_tm_create_internal(struct dm_block_manager *bm, + dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, + int create, + void *sm_root, size_t sm_len) +{ + int r; + + *sm = dm_sm_metadata_init(); + if (IS_ERR(*sm)) + return PTR_ERR(*sm); + + *tm = dm_tm_create(bm, *sm); + if (IS_ERR(*tm)) { + dm_sm_destroy(*sm); + return PTR_ERR(*tm); + } + + if (create) { + r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm), + sb_location); + if (r) { + DMERR("couldn't create metadata space map"); + goto bad; + } + + } else { + r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len); + if (r) { + DMERR("couldn't open metadata space map"); + goto bad; + } + } + + return 0; + +bad: + dm_tm_destroy(*tm); + dm_sm_destroy(*sm); + return r; +} + +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm) +{ + return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0); +} +EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + void *sm_root, size_t root_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm) +{ + return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len); +} +EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h new file mode 100644 index 000000000..0f573a4a0 --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_TRANSACTION_MANAGER_H +#define _LINUX_DM_TRANSACTION_MANAGER_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; +struct dm_space_map; + +/*----------------------------------------------------------------*/ + +/* + * This manages the scope of a transaction. It also enforces immutability + * of the on-disk data structures by limiting access to writeable blocks. + * + * Clients should not fiddle with the block manager directly. + */ + +void dm_tm_destroy(struct dm_transaction_manager *tm); + +/* + * The non-blocking version of a transaction manager is intended for use in + * fast path code that needs to do lookups e.g. a dm mapping function. + * You create the non-blocking variant from a normal tm. The interface is + * the same, except that most functions will just return -EWOULDBLOCK. + * Methods that return void yet may block should not be called on a clone + * viz. dm_tm_inc, dm_tm_dec. Call dm_tm_destroy() as you would with a normal + * tm when you've finished with it. You may not destroy the original prior + * to clones. + */ +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real); + +/* + * We use a 2-phase commit here. + * + * i) Make all changes for the transaction *except* for the superblock. + * Then call dm_tm_pre_commit() to flush them to disk. + * + * ii) Lock your superblock. Update. Then call dm_tm_commit() which will + * unlock the superblock and flush it. No other blocks should be updated + * during this period. Care should be taken to never unlock a partially + * updated superblock; perform any operations that could fail *before* you + * take the superblock lock. + */ +int dm_tm_pre_commit(struct dm_transaction_manager *tm); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); + +/* + * These methods are the only way to get hold of a writeable block. + */ + +/* + * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually + * write to the whole of @data before you unlock, otherwise you could get + * a data leak. (The other option is for tm_new_block() to zero new blocks + * before handing them out, which will be redundant in most, if not all, + * cases). + * Zeroes the new block and returns with write lock held. + */ +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * dm_tm_shadow_block() allocates a new block and copies the data from @orig + * to it. It then decrements the reference count on original block. Use + * this to update the contents of a block in a data structure, don't + * confuse this with a clone - you shouldn't access the orig block after + * this operation. Because the tm knows the scope of the transaction it + * can optimise requests for a shadow of a shadow to a no-op. Don't forget + * to unlock when you've finished with the shadow. + * + * The @inc_children flag is used to tell the caller whether it needs to + * adjust reference counts for children. (Data in the block may refer to + * other blocks.) + * + * Shadowing implicitly drops a reference on @orig so you must not have + * it locked when you call this. + */ +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result, int *inc_children); + +/* + * Read access. You can lock any block you want. If there's a write lock + * on it outstanding then it'll block. + */ +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); + +/* + * Functions for altering the reference count of a block directly. + */ +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); + +/* + * Builds up runs of adjacent blocks, and then calls the given fn + * (typically dm_tm_inc/dec). Very useful when you have to perform + * the same tm operation on all values in a btree leaf. + */ +typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t); +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned int count, dm_tm_run_fn fn); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result); + +/* + * Finds out if a given block is shared (ie. has a reference count higher + * than one). + */ +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result); + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); + +/* + * If you're using a non-blocking clone the tm will build up a list of + * requested blocks that weren't in core. This call will request those + * blocks to be prefetched. + */ +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm); + +/* + * A little utility that ties the knot by producing a transaction manager + * that has a space map managed by the transaction manager... + * + * Returns a tm that has an open transaction to write the new disk sm. + * Caller should store the new sm root and commit. + * + * The superblock location is passed so the metadata space map knows it + * shouldn't be used. + */ +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + void *sm_root, size_t root_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm); + +#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 000000000..7c6a0b443 --- /dev/null +++ b/drivers/md/raid0.c @@ -0,0 +1,846 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + raid0.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + or + + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + + RAID-0 management functions. + +*/ + +#include +#include +#include +#include +#include +#include "md.h" +#include "raid0.h" +#include "raid5.h" + +static int default_layout = 0; +module_param(default_layout, int, 0644); + +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN) | \ + (1L << MD_FAILFAST_SUPPORTED) |\ + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) + +/* + * inform the user of the raid configuration +*/ +static void dump_zones(struct mddev *mddev) +{ + int j, k; + sector_t zone_size = 0; + sector_t zone_start = 0; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + pr_debug("md: RAID0 configuration for %s - %d zone%s\n", + mdname(mddev), + conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); + for (j = 0; j < conf->nr_strip_zones; j++) { + char line[200]; + int len = 0; + + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"", + conf->devlist[j * raid_disks + k]->bdev); + pr_debug("md: zone%d=[%s]\n", j, line); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; + } +} + +static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) +{ + int i, c, err; + sector_t curr_zone_end, sectors; + struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; + struct strip_zone *zone; + int cnt; + struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + unsigned blksize = 512; + + *private_conf = ERR_PTR(-ENOMEM); + if (!conf) + return -ENOMEM; + rdev_for_each(rdev1, mddev) { + pr_debug("md/raid0:%s: looking at %pg\n", + mdname(mddev), + rdev1->bdev); + c = 0; + + /* round size to chunk_size */ + sectors = rdev1->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev1->sectors = sectors * mddev->chunk_sectors; + + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + + rdev_for_each(rdev2, mddev) { + pr_debug("md/raid0:%s: comparing %pg(%llu)" + " with %pg(%llu)\n", + mdname(mddev), + rdev1->bdev, + (unsigned long long)rdev1->sectors, + rdev2->bdev, + (unsigned long long)rdev2->sectors); + if (rdev2 == rdev1) { + pr_debug("md/raid0:%s: END\n", + mdname(mddev)); + break; + } + if (rdev2->sectors == rdev1->sectors) { + /* + * Not unique, don't count it as a new + * group + */ + pr_debug("md/raid0:%s: EQUAL\n", + mdname(mddev)); + c = 1; + break; + } + pr_debug("md/raid0:%s: NOT EQUAL\n", + mdname(mddev)); + } + if (!c) { + pr_debug("md/raid0:%s: ==> UNIQUE\n", + mdname(mddev)); + conf->nr_strip_zones++; + pr_debug("md/raid0:%s: %d zones\n", + mdname(mddev), conf->nr_strip_zones); + } + } + pr_debug("md/raid0:%s: FINAL %d zones\n", + mdname(mddev), conf->nr_strip_zones); + + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % blksize) { + pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); + err = -EINVAL; + goto abort; + } + + err = -ENOMEM; + conf->strip_zone = kcalloc(conf->nr_strip_zones, + sizeof(struct strip_zone), + GFP_KERNEL); + if (!conf->strip_zone) + goto abort; + conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); + if (!conf->devlist) + goto abort; + + /* The first zone must contain all devices, so here we check that + * there is a proper alignment of slots to devices and find them all + */ + zone = &conf->strip_zone[0]; + cnt = 0; + smallest = NULL; + dev = conf->devlist; + err = -EINVAL; + rdev_for_each(rdev1, mddev) { + int j = rdev1->raid_disk; + + if (mddev->level == 10) { + /* taking over a raid10-n2 array */ + j /= 2; + rdev1->new_raid_disk = j; + } + + if (mddev->level == 1) { + /* taiking over a raid1 array- + * we have only one active disk + */ + j = 0; + rdev1->new_raid_disk = j; + } + + if (j < 0) { + pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n", + mdname(mddev)); + goto abort; + } + if (j >= mddev->raid_disks) { + pr_warn("md/raid0:%s: bad disk number %d - aborting!\n", + mdname(mddev), j); + goto abort; + } + if (dev[j]) { + pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n", + mdname(mddev), j); + goto abort; + } + dev[j] = rdev1; + + if (!smallest || (rdev1->sectors < smallest->sectors)) + smallest = rdev1; + cnt++; + } + if (cnt != mddev->raid_disks) { + pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n", + mdname(mddev), cnt, mddev->raid_disks); + goto abort; + } + zone->nb_dev = cnt; + zone->zone_end = smallest->sectors * cnt; + + curr_zone_end = zone->zone_end; + + /* now do the other zones */ + for (i = 1; i < conf->nr_strip_zones; i++) + { + int j; + + zone = conf->strip_zone + i; + dev = conf->devlist + i * mddev->raid_disks; + + pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); + zone->dev_start = smallest->sectors; + smallest = NULL; + c = 0; + + for (j=0; jdevlist[j]; + if (rdev->sectors <= zone->dev_start) { + pr_debug("md/raid0:%s: checking %pg ... nope\n", + mdname(mddev), + rdev->bdev); + continue; + } + pr_debug("md/raid0:%s: checking %pg ..." + " contained as device %d\n", + mdname(mddev), + rdev->bdev, c); + dev[c] = rdev; + c++; + if (!smallest || rdev->sectors < smallest->sectors) { + smallest = rdev; + pr_debug("md/raid0:%s: (%llu) is smallest!.\n", + mdname(mddev), + (unsigned long long)rdev->sectors); + } + } + + zone->nb_dev = c; + sectors = (smallest->sectors - zone->dev_start) * c; + pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", + mdname(mddev), + zone->nb_dev, (unsigned long long)sectors); + + curr_zone_end += sectors; + zone->zone_end = curr_zone_end; + + pr_debug("md/raid0:%s: current zone start: %llu\n", + mdname(mddev), + (unsigned long long)smallest->sectors); + } + + if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); + err = -EOPNOTSUPP; + goto abort; + } + + if (conf->layout == RAID0_ORIG_LAYOUT) { + for (i = 1; i < conf->nr_strip_zones; i++) { + sector_t first_sector = conf->strip_zone[i-1].zone_end; + + sector_div(first_sector, mddev->chunk_sectors); + zone = conf->strip_zone + i; + /* disk_shift is first disk index used in the zone */ + zone->disk_shift = sector_div(first_sector, + zone->nb_dev); + } + } + + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); + *private_conf = conf; + + return 0; +abort: + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + *private_conf = ERR_PTR(err); + return err; +} + +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct r0conf *conf, + sector_t *sectorp) +{ + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} + +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of performance +*/ +static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*raid_disks + + sector_div(sector, zone->nb_dev)]; +} + +static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + sector_t array_sectors = 0; + struct md_rdev *rdev; + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + rdev_for_each(rdev, mddev) + array_sectors += (rdev->sectors & + ~(sector_t)(mddev->chunk_sectors-1)); + + return array_sectors; +} + +static void free_conf(struct mddev *mddev, struct r0conf *conf) +{ + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); +} + +static void raid0_free(struct mddev *mddev, void *priv) +{ + struct r0conf *conf = priv; + + free_conf(mddev, conf); + acct_bioset_exit(mddev); +} + +static int raid0_run(struct mddev *mddev) +{ + struct r0conf *conf; + int ret; + + if (mddev->chunk_sectors == 0) { + pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev)); + return -EINVAL; + } + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + if (acct_bioset_init(mddev)) { + pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); + return -ENOMEM; + } + + /* if private is not null, we are here after takeover */ + if (mddev->private == NULL) { + ret = create_strip_zones(mddev, &conf); + if (ret < 0) + goto exit_acct_set; + mddev->private = conf; + } + conf = mddev->private; + if (mddev->queue) { + struct md_rdev *rdev; + + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + } + + /* calculate array device size */ + md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); + + pr_debug("md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); + + dump_zones(mddev); + + ret = md_integrity_register(mddev); + if (ret) + goto free; + + return ret; + +free: + free_conf(mddev, conf); +exit_acct_set: + acct_bioset_exit(mddev); + return ret; +} + +/* + * Convert disk_index to the disk order in which it is read/written. + * For example, if we have 4 disks, they are numbered 0,1,2,3. If we + * write the disks starting at disk 3, then the read/write order would + * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() + * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map + * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in + * that 'output' space to understand the read/write disk ordering. + */ +static int map_disk_shift(int disk_index, int num_disks, int disk_shift) +{ + return ((disk_index + num_disks - disk_shift) % num_disks); +} + +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r0conf *conf = mddev->private; + struct strip_zone *zone; + sector_t start = bio->bi_iter.bi_sector; + sector_t end; + unsigned int stripe_size; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int disk; + sector_t orig_start, orig_end; + + orig_start = start; + zone = find_zone(conf, &start); + + if (bio_end_sector(bio) > zone->zone_end) { + struct bio *split = bio_split(bio, + zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, + &mddev->bio_set); + bio_chain(split, bio); + submit_bio_noacct(bio); + bio = split; + end = zone->zone_end; + } else + end = bio_end_sector(bio); + + orig_end = end; + if (zone != conf->strip_zone) + end = end - zone[-1].zone_end; + + /* Now start and end is the offset in zone */ + stripe_size = zone->nb_dev * mddev->chunk_sectors; + + first_stripe_index = start; + sector_div(first_stripe_index, stripe_size); + last_stripe_index = end; + sector_div(last_stripe_index, stripe_size); + + /* In the first zone the original and alternate layouts are the same */ + if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { + sector_div(orig_start, mddev->chunk_sectors); + start_disk_index = sector_div(orig_start, zone->nb_dev); + start_disk_index = map_disk_shift(start_disk_index, + zone->nb_dev, + zone->disk_shift); + sector_div(orig_end, mddev->chunk_sectors); + end_disk_index = sector_div(orig_end, zone->nb_dev); + end_disk_index = map_disk_shift(end_disk_index, + zone->nb_dev, zone->disk_shift); + } else { + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + } + start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % + mddev->chunk_sectors) + + first_stripe_index * mddev->chunk_sectors; + end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % + mddev->chunk_sectors) + + last_stripe_index * mddev->chunk_sectors; + + for (disk = 0; disk < zone->nb_dev; disk++) { + sector_t dev_start, dev_end; + struct md_rdev *rdev; + int compare_disk; + + compare_disk = map_disk_shift(disk, zone->nb_dev, + zone->disk_shift); + + if (compare_disk < start_disk_index) + dev_start = (first_stripe_index + 1) * + mddev->chunk_sectors; + else if (compare_disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (compare_disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (compare_disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + if (dev_end <= dev_start) + continue; + + rdev = conf->devlist[(zone - conf->strip_zone) * + conf->strip_zone[0].nb_dev + disk]; + md_submit_discard_bio(mddev, rdev, bio, + dev_start + zone->dev_start + rdev->data_offset, + dev_end - dev_start); + } + bio_endio(bio); +} + +static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) +{ + struct r0conf *conf = mddev->private; + struct strip_zone *zone; + struct md_rdev *tmp_dev; + sector_t bio_sector = bio->bi_iter.bi_sector; + sector_t sector = bio_sector; + + md_account_bio(mddev, &bio); + + zone = find_zone(mddev->private, §or); + switch (conf->layout) { + case RAID0_ORIG_LAYOUT: + tmp_dev = map_sector(mddev, zone, bio_sector, §or); + break; + case RAID0_ALT_MULTIZONE_LAYOUT: + tmp_dev = map_sector(mddev, zone, sector, §or); + break; + default: + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); + bio_io_error(bio); + return; + } + + if (unlikely(is_rdev_broken(tmp_dev))) { + bio_io_error(bio); + md_error(mddev, tmp_dev); + return; + } + + bio_set_dev(bio, tmp_dev->bdev); + bio->bi_iter.bi_sector = sector + zone->dev_start + + tmp_dev->data_offset; + + if (mddev->gendisk) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_write_zeroes(mddev, bio); + submit_bio_noacct(bio); +} + +static bool raid0_make_request(struct mddev *mddev, struct bio *bio) +{ + sector_t sector; + unsigned chunk_sects; + unsigned sectors; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return true; + } + + sector = bio->bi_iter.bi_sector; + chunk_sects = mddev->chunk_sectors; + + sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? (sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); + + if (sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, sectors, GFP_NOIO, + &mddev->bio_set); + bio_chain(split, bio); + raid0_map_submit_bio(mddev, bio); + bio = split; + } + + raid0_map_submit_bio(mddev, bio); + return true; +} + +static void raid0_status(struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); + return; +} + +static void raid0_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/raid0%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + +static void *raid0_takeover_raid45(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct r0conf *priv_conf; + + if (mddev->degraded != 1) { + pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", + mdname(mddev), + mddev->degraded); + return ERR_PTR(-EINVAL); + } + + rdev_for_each(rdev, mddev) { + /* check slot number for a disk */ + if (rdev->raid_disk == mddev->raid_disks-1) { + pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + rdev->sectors = mddev->dev_sectors; + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks--; + mddev->delta_disks = -1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); + + create_strip_zones(mddev, &priv_conf); + + return priv_conf; +} + +static void *raid0_takeover_raid10(struct mddev *mddev) +{ + struct r0conf *priv_conf; + + /* Check layout: + * - far_copies must be 1 + * - near_copies must be 2 + * - disks number must be even + * - all mirrors must be already degraded + */ + if (mddev->layout != ((1 << 8) + 2)) { + pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", + mdname(mddev), + mddev->layout); + return ERR_PTR(-EINVAL); + } + if (mddev->raid_disks & 1) { + pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + if (mddev->degraded != (mddev->raid_disks>>1)) { + pr_warn("md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = - mddev->raid_disks / 2; + mddev->raid_disks += mddev->delta_disks; + mddev->degraded = 0; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover_raid1(struct mddev *mddev) +{ + struct r0conf *priv_conf; + int chunksect; + + /* Check layout: + * - (N - 1) mirror drives must be already faulty + */ + if ((mddev->raid_disks - 1) != mddev->degraded) { + pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* + * a raid1 doesn't have the notion of chunk size, so + * figure out the largest suitable size we can use. + */ + chunksect = 64 * 2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect - 1))) + chunksect >>= 1; + + if ((chunksect << 9) < PAGE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = chunksect; + mddev->chunk_sectors = chunksect; + mddev->delta_disks = 1 - mddev->raid_disks; + mddev->raid_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover(struct mddev *mddev) +{ + /* raid0 can take over: + * raid4 - if all data disks are active. + * raid5 - providing it is Raid4 layout and one disk is faulty + * raid10 - assuming we have all necessary active disks + * raid1 - with (N -1) mirror drives faulty + */ + + if (mddev->bitmap) { + pr_warn("md/raid0: %s: cannot takeover array with bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); + + if (mddev->level == 5) { + if (mddev->layout == ALGORITHM_PARITY_N) + return raid0_takeover_raid45(mddev); + + pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); + } + + if (mddev->level == 10) + return raid0_takeover_raid10(mddev); + + if (mddev->level == 1) + return raid0_takeover_raid1(mddev); + + pr_warn("Takeover from raid%i to raid0 not supported\n", + mddev->level); + + return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(struct mddev *mddev, int quiesce) +{ +} + +static struct md_personality raid0_personality= +{ + .name = "raid0", + .level = 0, + .owner = THIS_MODULE, + .make_request = raid0_make_request, + .run = raid0_run, + .free = raid0_free, + .status = raid0_status, + .size = raid0_size, + .takeover = raid0_takeover, + .quiesce = raid0_quiesce, + .error_handler = raid0_error, +}; + +static int __init raid0_init (void) +{ + return register_md_personality (&raid0_personality); +} + +static void raid0_exit (void) +{ + unregister_md_personality (&raid0_personality); +} + +module_init(raid0_init); +module_exit(raid0_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); +MODULE_ALIAS("md-personality-2"); /* RAID0 */ +MODULE_ALIAS("md-raid0"); +MODULE_ALIAS("md-level-0"); diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 000000000..8cc761ca7 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RAID0_H +#define _RAID0_H + +struct strip_zone { + sector_t zone_end; /* Start of the next zone (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + int nb_dev; /* # of devices attached to the zone */ + int disk_shift; /* start disk for the original layout */ +}; + +/* Linux 3.14 (20d0189b101) made an unintended change to + * the RAID0 layout for multi-zone arrays (where devices aren't all + * the same size. + * RAID0_ORIG_LAYOUT restores the original layout + * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout + * The layouts are identical when there is only one zone (all + * devices the same size). + */ + +enum r0layout { + RAID0_ORIG_LAYOUT = 1, + RAID0_ALT_MULTIZONE_LAYOUT = 2, +}; +struct r0conf { + struct strip_zone *strip_zone; + struct md_rdev **devlist; /* lists of rdevs, pointed to + * by strip_zone->dev */ + int nr_strip_zones; + enum r0layout layout; +}; + +#endif diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c new file mode 100644 index 000000000..e0c8ac814 --- /dev/null +++ b/drivers/md/raid1-10.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* for managing resync I/O pages */ +struct resync_pages { + void *raid_bio; + struct page *pages[RESYNC_PAGES]; +}; + +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; +}; + +static void rbio_pool_free(void *rbio, void *data) +{ + kfree(rbio); +} + +static inline int resync_alloc_pages(struct resync_pages *rp, + gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) { + rp->pages[i] = alloc_page(gfp_flags); + if (!rp->pages[i]) + goto out_free; + } + + return 0; + +out_free: + while (--i >= 0) + put_page(rp->pages[i]); + return -ENOMEM; +} + +static inline void resync_free_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + put_page(rp->pages[i]); +} + +static inline void resync_get_all_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + get_page(rp->pages[i]); +} + +static inline struct page *resync_fetch_page(struct resync_pages *rp, + unsigned idx) +{ + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) + return NULL; + return rp->pages[idx]; +} + +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* generally called after bio_reset() for reseting bvec */ +static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, + int size) +{ + int idx = 0; + + /* initialize bvec table again */ + do { + struct page *page = resync_fetch_page(rp, idx); + int len = min_t(int, size, PAGE_SIZE); + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); + size -= len; + } while (idx++ < RESYNC_PAGES && size > 0); +} + + +static inline void raid1_submit_write(struct bio *bio) +{ + struct md_rdev *rdev = (void *)bio->bi_bdev; + + bio->bi_next = NULL; + bio_set_dev(bio, rdev->bdev); + if (test_bit(Faulty, &rdev->flags)) + bio_io_error(bio); + else if (unlikely(bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev))) + /* Just ignore it */ + bio_endio(bio); + else + submit_bio_noacct(bio); +} + +static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, + blk_plug_cb_fn unplug) +{ + struct raid1_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + /* + * If bitmap is not enabled, it's safe to submit the io directly, and + * this can get optimal performance. + */ + if (!md_bitmap_enabled(mddev->bitmap)) { + raid1_submit_write(bio); + return true; + } + + cb = blk_check_plugged(unplug, mddev, sizeof(*plug)); + if (!cb) + return false; + + plug = container_of(cb, struct raid1_plug_cb, cb); + bio_list_add(&plug->pending, bio); + + return true; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 000000000..76f7ca53d --- /dev/null +++ b/drivers/md/raid1.c @@ -0,0 +1,3407 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala , 2000 + * + * Fixes to reconstruction by Jakob Østergaard" + * Various fixes by Neil Brown + * + * Changes by Peter T. Breuer 31/1/2003 to support + * bitmapped intelligence in resync: + * + * - bitmap marked during normal i/o + * - bitmap used to skip nondirty blocks during sync + * + * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: + * - persistent bitmap code + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "md.h" +#include "raid1.h" +#include "md-bitmap.h" + +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN) | \ + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) + +static void allow_barrier(struct r1conf *conf, sector_t sector_nr); +static void lower_barrier(struct r1conf *conf, sector_t sector_nr); + +#define raid1_log(md, fmt, args...) \ + do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) + +#include "raid1-10.c" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + +static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, + struct serial_info *si, int idx) +{ + unsigned long flags; + int ret = 0; + sector_t lo = r1_bio->sector; + sector_t hi = lo + r1_bio->sectors; + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; + else { + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); + } + spin_unlock_irqrestore(&serial->serial_lock, flags); + + return ret; +} + +static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + struct mddev *mddev = rdev->mddev; + struct serial_info *si; + int idx = sector_to_idx(r1_bio->sector); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + if (WARN_ON(!mddev->serial_info_pool)) + return; + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + wait_event(serial->serial_io_wait, + check_and_add_serial(rdev, r1_bio, si, idx) == 0); +} + +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct serial_info *si; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); + found = 1; + break; + } + } + if (!found) + WARN(1, "The write IO is not recorded for serialization\n"); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); +} + +/* + * for resync bio, r1bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. + */ +static inline struct r1bio *get_resync_r1bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + +static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct pool_info *pi = data; + int size = offsetof(struct r1bio, bios[pi->raid_disks]); + + /* allocate a r1bio with room for raid_disks entries in the bios array */ + return kzalloc(size, gfp_flags); +} + +#define RESYNC_DEPTH 32 +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) + +static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct pool_info *pi = data; + struct r1bio *r1_bio; + struct bio *bio; + int need_pages; + int j; + struct resync_pages *rps; + + r1_bio = r1bio_pool_alloc(gfp_flags, pi); + if (!r1_bio) + return NULL; + + rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), + gfp_flags); + if (!rps) + goto out_free_r1bio; + + /* + * Allocate bios : 1 for reading, n-1 for writing + */ + for (j = pi->raid_disks ; j-- ; ) { + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); + if (!bio) + goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + r1_bio->bios[j] = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them to + * the first bio. + * If this is a user-requested check/repair, allocate + * RESYNC_PAGES for each bio. + */ + if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) + need_pages = pi->raid_disks; + else + need_pages = 1; + for (j = 0; j < pi->raid_disks; j++) { + struct resync_pages *rp = &rps[j]; + + bio = r1_bio->bios[j]; + + if (j < need_pages) { + if (resync_alloc_pages(rp, gfp_flags)) + goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } + + rp->raid_bio = r1_bio; + bio->bi_private = rp; + } + + r1_bio->master_bio = NULL; + + return r1_bio; + +out_free_pages: + while (--j >= 0) + resync_free_pages(&rps[j]); + +out_free_bio: + while (++j < pi->raid_disks) { + bio_uninit(r1_bio->bios[j]); + kfree(r1_bio->bios[j]); + } + kfree(rps); + +out_free_r1bio: + rbio_pool_free(r1_bio, data); + return NULL; +} + +static void r1buf_pool_free(void *__r1_bio, void *data) +{ + struct pool_info *pi = data; + int i; + struct r1bio *r1bio = __r1_bio; + struct resync_pages *rp = NULL; + + for (i = pi->raid_disks; i--; ) { + rp = get_resync_pages(r1bio->bios[i]); + resync_free_pages(rp); + bio_uninit(r1bio->bios[i]); + kfree(r1bio->bios[i]); + } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); + + rbio_pool_free(r1bio, data); +} + +static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) +{ + int i; + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct bio **bio = r1_bio->bios + i; + if (!BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + } +} + +static void free_r1bio(struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + + put_all_bios(conf, r1_bio); + mempool_free(r1_bio, &conf->r1bio_pool); +} + +static void put_buf(struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + sector_t sect = r1_bio->sector; + int i; + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct bio *bio = r1_bio->bios[i]; + if (bio->bi_end_io) + rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); + } + + mempool_free(r1_bio, &conf->r1buf_pool); + + lower_barrier(conf, sect); +} + +static void reschedule_retry(struct r1bio *r1_bio) +{ + unsigned long flags; + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + int idx; + + idx = sector_to_idx(r1_bio->sector); + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r1_bio->retry_list, &conf->retry_list); + atomic_inc(&conf->nr_queued[idx]); + spin_unlock_irqrestore(&conf->device_lock, flags); + + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void call_bio_endio(struct r1bio *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + bio->bi_status = BLK_STS_IOERR; + + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + bio_end_io_acct(bio, r1_bio->start_time); + bio_endio(bio); +} + +static void raid_end_bio_io(struct r1bio *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + struct r1conf *conf = r1_bio->mddev->private; + + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + pr_debug("raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(bio) - 1); + + call_bio_endio(r1_bio); + } + /* + * Wake up any possible resync thread that waits for the device + * to go idle. All I/Os, even write-behind writes, are done. + */ + allow_barrier(conf, r1_bio->sector); + + free_r1bio(r1_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int disk, struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + + conf->mirrors[disk].head_position = + r1_bio->sector + (r1_bio->sectors); +} + +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) +{ + int mirror; + struct r1conf *conf = r1_bio->mddev->private; + int raid_disks = conf->raid_disks; + + for (mirror = 0; mirror < raid_disks * 2; mirror++) + if (r1_bio->bios[mirror] == bio) + break; + + BUG_ON(mirror == raid_disks * 2); + update_head_pos(mirror, r1_bio); + + return mirror; +} + +static void raid1_end_read_request(struct bio *bio) +{ + int uptodate = !bio->bi_status; + struct r1bio *r1_bio = bio->bi_private; + struct r1conf *conf = r1_bio->mddev->private; + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + update_head_pos(r1_bio->read_disk, r1_bio); + + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + /* This was a fail-fast read so we definitely + * want to retry */ + ; + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. + * Here we redefine "uptodate" to mean "Don't want to retry" + */ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + test_bit(In_sync, &rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + if (uptodate) { + raid_end_bio_io(r1_bio); + rdev_dec_pending(rdev, conf->mddev); + } else { + /* + * oops, read error: + */ + pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n", + mdname(conf->mddev), + rdev->bdev, + (unsigned long long)r1_bio->sector); + set_bit(R1BIO_ReadError, &r1_bio->state); + reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ + } +} + +static void close_write(struct r1bio *r1_bio) +{ + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + bio_free_pages(r1_bio->behind_master_bio); + bio_put(r1_bio->behind_master_bio); + r1_bio->behind_master_bio = NULL; + } + /* clear the bitmap if all writes complete successfully */ + md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); +} + +static void r1_bio_write_done(struct r1bio *r1_bio) +{ + if (!atomic_dec_and_test(&r1_bio->remaining)) + return; + + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + close_write(r1_bio); + if (test_bit(R1BIO_MadeGood, &r1_bio->state)) + reschedule_retry(r1_bio); + else + raid_end_bio_io(r1_bio); + } +} + +static void raid1_end_write_request(struct bio *bio) +{ + struct r1bio *r1_bio = bio->bi_private; + int behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + struct r1conf *conf = r1_bio->mddev->private; + struct bio *to_put = NULL; + int mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[mirror].rdev; + bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + + /* + * 'one mirror IO has finished' event handler: + */ + if (bio->bi_status && !discard_error) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + conf->mddev->recovery); + + if (test_bit(FailFast, &rdev->flags) && + (bio->bi_opf & MD_FAILFAST) && + /* We never try FailFast to WriteMostly devices */ + !test_bit(WriteMostly, &rdev->flags)) { + md_error(r1_bio->mddev, rdev); + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + */ + if (!test_bit(Faulty, &rdev->flags)) + set_bit(R1BIO_WriteError, &r1_bio->state); + else { + /* Fail the request */ + set_bit(R1BIO_Degraded, &r1_bio->state); + /* Finished with this branch */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + } + } else { + /* + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. + */ + sector_t first_bad; + int bad_sectors; + + r1_bio->bios[mirror] = NULL; + to_put = bio; + /* + * Do not set R1BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R1BIO_Uptodate, &r1_bio->state); + + /* Maybe we can clear some bad blocks. */ + if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, + &first_bad, &bad_sectors) && !discard_error) { + r1_bio->bios[mirror] = IO_MADE_GOOD; + set_bit(R1BIO_MadeGood, &r1_bio->state); + } + } + + if (behind) { + if (test_bit(CollisionCheck, &rdev->flags)) + remove_serial(rdev, lo, hi); + if (test_bit(WriteMostly, &rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + pr_debug("raid1: behind end write sectors" + " %llu-%llu\n", + (unsigned long long) mbio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(mbio) - 1); + call_bio_endio(r1_bio); + } + } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); + if (r1_bio->bios[mirror] == NULL) + rdev_dec_pending(rdev, conf->mddev); + + /* + * Let's see if all mirrored write operations have finished + * already. + */ + r1_bio_write_done(r1_bio); + + if (to_put) + bio_put(to_put); +} + +static sector_t align_to_barrier_unit_end(sector_t start_sector, + sector_t sectors) +{ + sector_t len; + + WARN_ON(sectors == 0); + /* + * len is the number of sectors from start_sector to end of the + * barrier unit which start_sector belongs to. + */ + len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - + start_sector; + + if (len > sectors) + len = sectors; + + return len; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +{ + const sector_t this_sector = r1_bio->sector; + int sectors; + int best_good_sectors; + int best_disk, best_dist_disk, best_pending_disk; + int has_nonrot_disk; + int disk; + sector_t best_dist; + unsigned int min_pending; + struct md_rdev *rdev; + int choose_first; + int choose_next_idle; + + rcu_read_lock(); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ + retry: + sectors = r1_bio->sectors; + best_disk = -1; + best_dist_disk = -1; + best_dist = MaxSector; + best_pending_disk = -1; + min_pending = UINT_MAX; + best_good_sectors = 0; + has_nonrot_disk = 0; + choose_next_idle = 0; + clear_bit(R1BIO_FailFast, &r1_bio->state); + + if ((conf->mddev->recovery_cp < this_sector + sectors) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) + choose_first = 1; + else + choose_first = 0; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + sector_t dist; + sector_t first_bad; + int bad_sectors; + unsigned int pending; + bool nonrot; + + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) + continue; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_dist_disk < 0) { + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (first_bad <= this_sector) + /* Cannot use this */ + continue; + best_good_sectors = first_bad - this_sector; + } else + best_good_sectors = sectors; + best_dist_disk = disk; + best_pending_disk = disk; + } + continue; + } + /* This is a reasonable device to use. It might + * even be best. + */ + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* already have a better device */ + continue; + if (first_bad <= this_sector) { + /* cannot read here. If this is the 'primary' + * device, then we must not read beyond + * bad_sectors from another device.. + */ + bad_sectors -= (this_sector - first_bad); + if (choose_first && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + + } else { + sector_t good_sectors = first_bad - this_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_disk = disk; + } + if (choose_first) + break; + } + continue; + } else { + if ((sectors > best_good_sectors) && (best_disk >= 0)) + best_disk = -1; + best_good_sectors = sectors; + } + + if (best_disk >= 0) + /* At least two disks to choose from so failfast is OK */ + set_bit(R1BIO_FailFast, &r1_bio->state); + + nonrot = bdev_nonrot(rdev->bdev); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first) { + best_disk = disk; + break; + } + /* Don't change to another disk for sequential reads */ + if (conf->mirrors[disk].next_seq_sect == this_sector + || dist == 0) { + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; + struct raid1_info *mirror = &conf->mirrors[disk]; + + best_disk = disk; + /* + * If buffered sequential IO size exceeds optimal + * iosize, check if there is idle disk. If yes, choose + * the idle disk. read_balance could already choose an + * idle disk before noticing it's a sequential IO in + * this disk. This doesn't matter because this disk + * will idle, next time it will be utilized after the + * first disk has IO size exceeds optimal iosize. In + * this way, iosize of the first disk will be optimal + * iosize at least. iosize of the second disk might be + * small, but not a big deal since when the second disk + * starts IO, the first disk is likely still busy. + */ + if (nonrot && opt_iosize > 0 && + mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= + mirror->seq_start) { + choose_next_idle = 1; + continue; + } + break; + } + + if (choose_next_idle) + continue; + + if (min_pending > pending) { + min_pending = pending; + best_pending_disk = disk; + } + + if (dist < best_dist) { + best_dist = dist; + best_dist_disk = disk; + } + } + + /* + * If all disks are rotational, choose the closest disk. If any disk is + * non-rotational, choose the disk with less pending request even the + * disk is rotational, which might/might not be optimal for raids with + * mixed ratation/non-rotational disks depending on workload. + */ + if (best_disk == -1) { + if (has_nonrot_disk || min_pending == 0) + best_disk = best_pending_disk; + else + best_disk = best_dist_disk; + } + + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + sectors = best_good_sectors; + + if (conf->mirrors[best_disk].next_seq_sect != this_sector) + conf->mirrors[best_disk].seq_start = this_sector; + + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; + } + rcu_read_unlock(); + *max_sectors = sectors; + + return best_disk; +} + +static void flush_bio_list(struct r1conf *conf, struct bio *bio) +{ + /* flush any pending bitmap writes to disk before proceeding w/ I/O */ + md_bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + + raid1_submit_write(bio); + bio = next; + cond_resched(); + } +} + +static void flush_pending_writes(struct r1conf *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + */ + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct blk_plug plug; + struct bio *bio; + + bio = bio_list_get(&conf->pending_bio_list); + spin_unlock_irq(&conf->device_lock); + + /* + * As this is called in a wait_event() loop (see freeze_array), + * current->state might be TASK_UNINTERRUPTIBLE which will + * cause a warning when we prepare to wait again. As it is + * rare that this path is taken, it is perfectly safe to force + * us to go around the wait_event() loop again, so the warning + * is a false-positive. Silence the warning by resetting + * thread state + */ + __set_current_state(TASK_RUNNING); + blk_start_plug(&plug); + flush_bio_list(conf, bio); + blk_finish_plug(&plug); + } else + spin_unlock_irq(&conf->device_lock); +} + +/* Barriers.... + * Sometimes we need to suspend IO while we do something else, + * either some resync/recovery, or reconfigure the array. + * To do this we raise a 'barrier'. + * The 'barrier' is a counter that can be raised multiple times + * to count how many activities are happening which preclude + * normal IO. + * We can only raise the barrier if there is no pending IO. + * i.e. if nr_pending == 0. + * We choose only to raise the barrier if no-one is waiting for the + * barrier to go down. This means that as soon as an IO request + * is ready, no other operations which require a barrier will start + * until the IO request has had a chance. + * + * So: regular IO calls 'wait_barrier'. When that returns there + * is no backgroup IO happening, It must arrange to call + * allow_barrier when it has finished its IO. + * backgroup IO calls must call raise_barrier. Once that returns + * there is no normal IO happeing. It must arrange to call + * lower_barrier when the particular background IO completes. + * + * If resync/recovery is interrupted, returns -EINTR; + * Otherwise, returns 0. + */ +static int raise_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + spin_lock_irq(&conf->resync_lock); + + /* Wait until no block IO is waiting */ + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), + conf->resync_lock); + + /* block any new IO from starting */ + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); + + /* For these conditions we must wait: + * A: while the array is in frozen state + * B: while conf->nr_pending[idx] is not 0, meaning regular I/O + * existing in corresponding I/O barrier bucket. + * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches + * max resync count which allowed on current I/O barrier bucket. + */ + wait_event_lock_irq(conf->wait_barrier, + (!conf->array_frozen && + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) || + test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery), + conf->resync_lock); + + if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + atomic_dec(&conf->barrier[idx]); + spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); + return -EINTR; + } + + atomic_inc(&conf->nr_sync_pending); + spin_unlock_irq(&conf->resync_lock); + + return 0; +} + +static void lower_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); + + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_sync_pending); + wake_up(&conf->wait_barrier); +} + +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) +{ + bool ret = true; + + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); + + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return ret; + + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + + /* Return false when nowait flag is set */ + if (nowait) { + ret = false; + } else { + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + } + + atomic_dec(&conf->nr_waiting[idx]); + spin_unlock_irq(&conf->resync_lock); + return ret; +} + +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) +{ + int idx = sector_to_idx(sector_nr); + bool ret = true; + + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); + + if (!READ_ONCE(conf->array_frozen)) + return ret; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + + /* Return false when nowait flag is set */ + if (nowait) { + /* Return false when nowait flag is set */ + ret = false; + } else { + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + } + + atomic_dec(&conf->nr_waiting[idx]); + spin_unlock_irq(&conf->resync_lock); + return ret; +} + +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) +{ + int idx = sector_to_idx(sector_nr); + + return _wait_barrier(conf, idx, nowait); +} + +static void _allow_barrier(struct r1conf *conf, int idx) +{ + atomic_dec(&conf->nr_pending[idx]); + wake_up(&conf->wait_barrier); +} + +static void allow_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _allow_barrier(conf, idx); +} + +/* conf->resync_lock should be held */ +static int get_unqueued_pending(struct r1conf *conf) +{ + int idx, ret; + + ret = atomic_read(&conf->nr_sync_pending); + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); + + return ret; +} + +static void freeze_array(struct r1conf *conf, int extra) +{ + /* Stop sync I/O and normal I/O and wait for everything to + * go quiet. + * This is called in two situations: + * 1) management command handlers (reshape, remove disk, quiesce). + * 2) one normal I/O request failed. + + * After array_frozen is set to 1, new sync IO will be blocked at + * raise_barrier(), and new normal I/O will blocked at _wait_barrier() + * or wait_read_barrier(). The flying I/Os will either complete or be + * queued. When everything goes quite, there are only queued I/Os left. + + * Every flying I/O contributes to a conf->nr_pending[idx], idx is the + * barrier bucket index which this I/O request hits. When all sync and + * normal I/O are queued, sum of all conf->nr_pending[] will match sum + * of all conf->nr_queued[]. But normal I/O failure is an exception, + * in handle_read_error(), we may call freeze_array() before trying to + * fix the read error. In this case, the error read I/O is not queued, + * so get_unqueued_pending() == 1. + * + * Therefore before this function returns, we need to wait until + * get_unqueued_pendings(conf) gets equal to extra. For + * normal I/O context, extra is 1, in rested situations extra is 0. + */ + spin_lock_irq(&conf->resync_lock); + conf->array_frozen = 1; + raid1_log(conf->mddev, "wait freeze"); + wait_event_lock_irq_cmd( + conf->wait_barrier, + get_unqueued_pending(conf) == extra, + conf->resync_lock, + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); +} +static void unfreeze_array(struct r1conf *conf) +{ + /* reverse the effect of the freeze */ + spin_lock_irq(&conf->resync_lock); + conf->array_frozen = 0; + spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); +} + +static void alloc_behind_master_bio(struct r1bio *r1_bio, + struct bio *bio) +{ + int size = bio->bi_iter.bi_size; + unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + int i = 0; + struct bio *behind_bio = NULL; + + behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, + &r1_bio->mddev->bio_set); + if (!behind_bio) + return; + + /* discard op, we don't support writezero/writesame yet */ + if (!bio_has_data(bio)) { + behind_bio->bi_iter.bi_size = size; + goto skip_copy; + } + + while (i < vcnt && size) { + struct page *page; + int len = min_t(int, PAGE_SIZE, size); + + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + goto free_pages; + + bio_add_page(behind_bio, page, len, 0); + + size -= len; + i++; + } + + bio_copy_data(behind_bio, bio); +skip_copy: + r1_bio->behind_master_bio = behind_bio; + set_bit(R1BIO_BehindIO, &r1_bio->state); + + return; + +free_pages: + pr_debug("%dB behind alloc failed, doing sync I/O\n", + bio->bi_iter.bi_size); + bio_free_pages(behind_bio); + bio_put(behind_bio); +} + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule || current->bio_list) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + flush_bio_list(conf, bio); + kfree(plug); +} + +static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio) +{ + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio); + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector; +} + +static inline struct r1bio * +alloc_r1bio(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); + /* Ensure no bio records IO_BLOCKED */ + memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); + init_r1bio(r1_bio, mddev, bio); + return r1_bio; +} + +static void raid1_read_request(struct mddev *mddev, struct bio *bio, + int max_read_sectors, struct r1bio *r1_bio) +{ + struct r1conf *conf = mddev->private; + struct raid1_info *mirror; + struct bio *read_bio; + struct bitmap *bitmap = mddev->bitmap; + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; + int max_sectors; + int rdisk; + bool r1bio_existed = !!r1_bio; + char b[BDEVNAME_SIZE]; + + /* + * If r1_bio is set, we are blocking the raid1d thread + * so there is a tiny risk of deadlock. So ask for + * emergency memory if needed. + */ + gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; + + if (r1bio_existed) { + /* Need to get the block device name carefully */ + struct md_rdev *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); + if (rdev) + snprintf(b, sizeof(b), "%pg", rdev->bdev); + else + strcpy(b, "???"); + rcu_read_unlock(); + } + + /* + * Still need barrier for READ in case that whole + * array is frozen. + */ + if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, + bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return; + } + + if (!r1_bio) + r1_bio = alloc_r1bio(mddev, bio); + else + init_r1bio(r1_bio, mddev, bio); + r1_bio->sectors = max_read_sectors; + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + */ + rdisk = read_balance(conf, r1_bio, &max_sectors); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + if (r1bio_existed) { + pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), + b, + (unsigned long long)r1_bio->sector); + } + raid_end_bio_io(r1_bio); + return; + } + mirror = conf->mirrors + rdisk; + + if (r1bio_existed) + pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + mirror->rdev->bdev); + + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* + * Reading from a write-mostly device must take care not to + * over-take any writes that are 'behind' + */ + raid1_log(mddev, "wait behind writes"); + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + gfp, &conf->bio_split); + bio_chain(split, bio); + submit_bio_noacct(bio); + bio = split; + r1_bio->master_bio = bio; + r1_bio->sectors = max_sectors; + } + + r1_bio->read_disk = rdisk; + + if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); + + read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, + &mddev->bio_set); + + r1_bio->bios[rdisk] = read_bio; + + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; + read_bio->bi_end_io = raid1_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &mirror->rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r1_bio; + + if (mddev->gendisk) + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); + + submit_bio_noacct(read_bio); +} + +static void raid1_write_request(struct mddev *mddev, struct bio *bio, + int max_write_sectors) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + int i, disks; + struct bitmap *bitmap = mddev->bitmap; + unsigned long flags; + struct md_rdev *blocked_rdev; + int first_clone; + int max_sectors; + bool write_behind = false; + + if (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) { + + DEFINE_WAIT(w); + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + if (!wait_barrier(conf, bio->bi_iter.bi_sector, + bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return; + } + + r1_bio = alloc_r1bio(mddev, bio); + r1_bio->sectors = max_write_sectors; + + /* first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + * If there are known/acknowledged bad blocks on any device on + * which we have seen a write error, we want to avoid writing those + * blocks. + * This potentially requires several writes to write around + * the bad blocks. Each set of writes gets it's own r1bio + * with a set of bios attached. + */ + + disks = conf->raid_disks * 2; + retry_write: + blocked_rdev = NULL; + rcu_read_lock(); + max_sectors = r1_bio->sectors; + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + + /* + * The write-behind io is only attempted on drives marked as + * write-mostly, which means we could allocate write behind + * bio later. + */ + if (rdev && test_bit(WriteMostly, &rdev->flags)) + write_behind = true; + + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + r1_bio->bios[i] = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (i < conf->raid_disks) + set_bit(R1BIO_Degraded, &r1_bio->state); + continue; + } + + atomic_inc(&rdev->nr_pending); + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* mustn't write here until the bad block is + * acknowledged*/ + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= r1_bio->sector) { + /* Cannot write here at all */ + bad_sectors -= (r1_bio->sector - first_bad); + if (bad_sectors < max_sectors) + /* mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + rdev_dec_pending(rdev, mddev); + /* We don't set R1BIO_Degraded as that + * only applies if the disk is + * missing, so it might be re-added, + * and we want to know to recover this + * chunk. + * In this case the device is here, + * and the fact that this chunk is not + * in-sync is recorded in the bad + * block log + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - r1_bio->sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + r1_bio->bios[i] = bio; + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + r1_bio->state = 0; + allow_barrier(conf, bio->bi_iter.bi_sector); + + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } + raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf, bio->bi_iter.bi_sector, false); + goto retry_write; + } + + /* + * When using a bitmap, we may call alloc_behind_master_bio below. + * alloc_behind_master_bio allocates a copy of the data payload a page + * at a time and thus needs a new bio that can fit the whole payload + * this bio in page sized chunks. + */ + if (write_behind && bitmap) + max_sectors = min_t(int, max_sectors, + BIO_MAX_VECS * (PAGE_SIZE >> 9)); + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + submit_bio_noacct(bio); + bio = split; + r1_bio->master_bio = bio; + r1_bio->sectors = max_sectors; + } + + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); + atomic_set(&r1_bio->remaining, 1); + atomic_set(&r1_bio->behind_remaining, 0); + + first_clone = 1; + + for (i = 0; i < disks; i++) { + struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; + if (!r1_bio->bios[i]) + continue; + + if (first_clone) { + /* do behind I/O ? + * Not if there are too many, or cannot + * allocate memory, or a reader on WriteMostly + * is waiting for behind writes to flush */ + if (bitmap && + test_bit(WriteMostly, &rdev->flags) && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait)) { + alloc_behind_master_bio(r1_bio, bio); + } + + md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); + first_clone = 0; + } + + if (r1_bio->behind_master_bio) { + mbio = bio_alloc_clone(rdev->bdev, + r1_bio->behind_master_bio, + GFP_NOIO, &mddev->bio_set); + if (test_bit(CollisionCheck, &rdev->flags)) + wait_for_serialization(rdev, r1_bio); + if (test_bit(WriteMostly, &rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } else { + mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); + + if (mddev->serialize_policy) + wait_for_serialization(rdev, r1_bio); + } + + r1_bio->bios[i] = mbio; + + mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); + mbio->bi_end_io = raid1_end_write_request; + mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); + if (test_bit(FailFast, &rdev->flags) && + !test_bit(WriteMostly, &rdev->flags) && + conf->raid_disks - mddev->degraded > 1) + mbio->bi_opf |= MD_FAILFAST; + mbio->bi_private = r1_bio; + + atomic_inc(&r1_bio->remaining); + + if (mddev->gendisk) + trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), + r1_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void *)rdev; + if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug)) { + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(mddev->thread); + } + } + + r1_bio_write_done(r1_bio); + + /* In case raid1d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); +} + +static bool raid1_make_request(struct mddev *mddev, struct bio *bio) +{ + sector_t sectors; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + /* + * There is a limit to the maximum size, but + * the read/write handler might find a lower limit + * due to bad blocks. To avoid multiple splits, + * we pass the maximum number of sectors down + * and let the lower level perform the split. + */ + sectors = align_to_barrier_unit_end( + bio->bi_iter.bi_sector, bio_sectors(bio)); + + if (bio_data_dir(bio) == READ) + raid1_read_request(mddev, bio, sectors, NULL); + else { + if (!md_write_start(mddev,bio)) + return false; + raid1_write_request(mddev, bio, sectors); + } + return true; +} + +static void raid1_status(struct seq_file *seq, struct mddev *mddev) +{ + struct r1conf *conf = mddev->private; + int i; + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + seq_printf(seq, "%s", + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_printf(seq, "]"); +} + +/** + * raid1_error() - RAID1 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * - recovery is disabled. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + * + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ +static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r1conf *conf = mddev->private; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + if (test_bit(In_sync, &rdev->flags) && + (conf->raid_disks - mddev->degraded) == 1) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } + } + set_bit(Blocked, &rdev->flags); + if (test_and_clear_bit(In_sync, &rdev->flags)) + mddev->degraded++; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), rdev->bdev, + mdname(mddev), conf->raid_disks - mddev->degraded); +} + +static void print_conf(struct r1conf *conf) +{ + int i; + + pr_debug("RAID1 conf printout:\n"); + if (!conf) { + pr_debug("(!conf)\n"); + return; + } + pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) + pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + rdev->bdev); + } + rcu_read_unlock(); +} + +static void close_sync(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx, false); + _allow_barrier(conf, idx); + } + + mempool_exit(&conf->r1buf_pool); +} + +static int raid1_spare_active(struct mddev *mddev) +{ + int i; + struct r1conf *conf = mddev->private; + int count = 0; + unsigned long flags; + + /* + * Find all failed disks within the RAID1 configuration + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. + * device_lock used to avoid races with raid1_end_read_request + * which expects 'In_sync' flags and ->degraded to be consistent. + */ + spin_lock_irqsave(&conf->device_lock, flags); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; + if (repl + && !test_bit(Candidate, &repl->flags) + && repl->recovery_offset == MaxSector + && !test_bit(Faulty, &repl->flags) + && !test_and_set_bit(In_sync, &repl->flags)) { + /* replacement has just become active */ + if (!rdev || + !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically + * faulty, but we need to be sure + * it gets removed and never re-added + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + } + if (rdev + && rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + count++; + sysfs_notify_dirent_safe(rdev->sysfs_state); + } + } + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); + + print_conf(conf); + return count; +} + +static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r1conf *conf = mddev->private; + int err = -EEXIST; + int mirror = 0; + struct raid1_info *p; + int first = 0; + int last = conf->raid_disks - 1; + + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->raid_disks && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; + + for (mirror = first; mirror <= last; mirror++) { + p = conf->mirrors + mirror; + if (!p->rdev) { + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + p->head_position = 0; + rdev->raid_disk = mirror; + err = 0; + /* As all devices are equivalent, we don't need a full recovery + * if this was recently any drive of the array + */ + if (rdev->saved_raid_disk < 0) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + if (test_bit(WantReplacement, &p->rdev->flags) && + p[conf->raid_disks].rdev == NULL) { + /* Add this device as a replacement */ + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + break; + } + } + print_conf(conf); + return err; +} + +static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r1conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct raid1_info *p = conf->mirrors + number; + + if (unlikely(number >= conf->raid_disks)) + goto abort; + + if (rdev != p->rdev) + p = conf->mirrors + conf->raid_disks + number; + + print_conf(conf); + if (rdev == p->rdev) { + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { + /* We just removed a device that is being replaced. + * Move down the replacement. We drain all IO before + * doing this to avoid confusion. + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; + freeze_array(conf, 0); + if (atomic_read(&repl->nr_pending)) { + /* It means that some queued IO of retry_list + * hold repl. Thus, we cannot set replacement + * as NULL, avoiding rdev NULL pointer + * dereference in sync_request_write and + * handle_write_finished. + */ + err = -EBUSY; + unfreeze_array(conf); + goto abort; + } + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; + unfreeze_array(conf); + } + + clear_bit(WantReplacement, &rdev->flags); + err = md_integrity_register(mddev); + } +abort: + + print_conf(conf); + return err; +} + +static void end_sync_read(struct bio *bio) +{ + struct r1bio *r1_bio = get_resync_r1bio(bio); + + update_head_pos(r1_bio->read_disk, r1_bio); + + /* + * we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (!bio->bi_status) + set_bit(R1BIO_Uptodate, &r1_bio->state); + + if (atomic_dec_and_test(&r1_bio->remaining)) + reschedule_retry(r1_bio); +} + +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) +{ + sector_t sync_blocks = 0; + sector_t s = r1_bio->sector; + long sectors_to_go = r1_bio->sectors; + + /* make sure these bits don't get cleared. */ + do { + md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + s += sync_blocks; + sectors_to_go -= sync_blocks; + } while (sectors_to_go > 0); +} + +static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +{ + if (atomic_dec_and_test(&r1_bio->remaining)) { + struct mddev *mddev = r1_bio->mddev; + int s = r1_bio->sectors; + + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } + } +} + +static void end_sync_write(struct bio *bio) +{ + int uptodate = !bio->bi_status; + struct r1bio *r1_bio = get_resync_r1bio(bio); + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + sector_t first_bad; + int bad_sectors; + struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; + + if (!uptodate) { + abort_sync_write(mddev, r1_bio); + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + mddev->recovery); + set_bit(R1BIO_WriteError, &r1_bio->state); + } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, + &first_bad, &bad_sectors) && + !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) + ) + set_bit(R1BIO_MadeGood, &r1_bio->state); + + put_sync_write_buf(r1_bio, uptodate); +} + +static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, blk_opf_t rw) +{ + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == REQ_OP_WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + +static int fix_sync_read_error(struct r1bio *r1_bio) +{ + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + * We don't need to check is_badblock() again as we + * made sure that anything with a bad block in range + * will have bi_end_io clear. + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + struct page **pages = get_resync_pages(bio)->pages; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + struct md_rdev *rdev; + + rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (test_bit(FailFast, &rdev->flags)) { + /* Don't try recovering from here - just fail it + * ... unless it is the last working device of course */ + md_error(mddev, rdev); + if (test_bit(Faulty, &rdev->flags)) + /* Don't try to read from here, but make sure + * put_buf does it's thing + */ + bio->bi_end_io = end_sync_write; + } + + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, sect, s<<9, + pages[idx], + REQ_OP_READ, false)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks * 2) + d = 0; + } while (!success && d != r1_bio->read_disk); + + if (!success) { + int abort = 0; + /* Cannot read from anywhere, this block is lost. + * Record a bad block on each device. If that doesn't + * work just disable and interrupt the recovery. + * Don't fail devices as that won't really help. + */ + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", + mdname(mddev), bio->bi_bdev, + (unsigned long long)r1_bio->sector); + for (d = 0; d < conf->raid_disks * 2; d++) { + rdev = conf->mirrors[d].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + abort = 1; + } + if (abort) { + conf->recovery_disabled = + mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); + return 0; + } + /* Try next page */ + sectors -= s; + sect += s; + idx++; + continue; + } + + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + pages[idx], + REQ_OP_WRITE) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + } + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + pages[idx], + REQ_OP_READ) != 0) + atomic_add(s, &rdev->corrected_errors); + } + sectors -= s; + sect += s; + idx ++; + } + set_bit(R1BIO_Uptodate, &r1_bio->state); + bio->bi_status = 0; + return 1; +} + +static void process_checks(struct r1bio *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + int primary; + int i; + int vcnt; + + /* Fix variable parts of all bios */ + vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); + for (i = 0; i < conf->raid_disks * 2; i++) { + blk_status_t status; + struct bio *b = r1_bio->bios[i]; + struct resync_pages *rp = get_resync_pages(b); + if (b->bi_end_io != end_sync_read) + continue; + /* fixup the bio for reuse, but preserve errno */ + status = b->bi_status; + bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ); + b->bi_status = status; + b->bi_iter.bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + b->bi_end_io = end_sync_read; + rp->raid_bio = r1_bio; + b->bi_private = rp; + + /* initialize bvec table again */ + md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); + } + for (primary = 0; primary < conf->raid_disks * 2; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + !r1_bio->bios[primary]->bi_status) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i = 0; i < conf->raid_disks * 2; i++) { + int j = 0; + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + blk_status_t status = sbio->bi_status; + struct page **ppages = get_resync_pages(pbio)->pages; + struct page **spages = get_resync_pages(sbio)->pages; + struct bio_vec *bi; + int page_len[RESYNC_PAGES] = { 0 }; + struct bvec_iter_all iter_all; + + if (sbio->bi_end_io != end_sync_read) + continue; + /* Now we can 'fixup' the error value */ + sbio->bi_status = 0; + + bio_for_each_segment_all(bi, sbio, iter_all) + page_len[j++] = bi->bv_len; + + if (!status) { + for (j = vcnt; j-- ; ) { + if (memcmp(page_address(ppages[j]), + page_address(spages[j]), + page_len[j])) + break; + } + } else + j = 0; + if (j >= 0) + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && !status)) { + /* No need to write to this device. */ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + + bio_copy_data(sbio, pbio); + } +} + +static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) +{ + struct r1conf *conf = mddev->private; + int i; + int disks = conf->raid_disks * 2; + struct bio *wbio; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. */ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + process_checks(r1_bio); + + /* + * schedule writes + */ + atomic_set(&r1_bio->remaining, 1); + for (i = 0; i < disks ; i++) { + wbio = r1_bio->bios[i]; + if (wbio->bi_end_io == NULL || + (wbio->bi_end_io == end_sync_read && + (i == r1_bio->read_disk || + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) + continue; + if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { + abort_sync_write(mddev, r1_bio); + continue; + } + + bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) + wbio->bi_opf |= MD_FAILFAST; + + wbio->bi_end_io = end_sync_write; + atomic_inc(&r1_bio->remaining); + md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); + + submit_bio_noacct(wbio); + } + + put_sync_write_buf(r1_bio, 1); +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array synchronising. + */ + +static void fix_read_error(struct r1conf *conf, int read_disk, + sector_t sect, int sectors) +{ + struct mddev *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + struct md_rdev *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + sector_t first_bad; + int bad_sectors; + + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + (test_bit(In_sync, &rdev->flags) || + (!test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sect + s)) && + is_badblock(rdev, sect, s, + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, REQ_OP_READ, false)) + success = 1; + rdev_dec_pending(rdev, mddev); + if (success) + break; + } else + rcu_read_unlock(); + d++; + if (d == conf->raid_disks * 2) + d = 0; + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere - mark it bad */ + struct md_rdev *rdev = conf->mirrors[read_disk].rdev; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + md_error(mddev, rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks * 2; + d--; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + r1_sync_page_io(rdev, sect, s, + conf->tmppage, REQ_OP_WRITE); + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); + } + d = start; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks * 2; + d--; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (r1_sync_page_io(rdev, sect, s, + conf->tmppage, REQ_OP_READ)) { + atomic_add(s, &rdev->corrected_errors); + pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + rdev->bdev); + } + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); + } + sectors -= s; + sect += s; + } +} + +static int narrow_write_error(struct r1bio *r1_bio, int i) +{ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[i].rdev; + + /* bio has the data to be written to device 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this somehow. + * + * We currently own a reference on the rdev. + */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r1_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = roundup(1 << rdev->badblocks.shift, + bdev_logical_block_size(rdev->bdev) >> 9); + sector = r1_bio->sector; + sectors = ((sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors'*/ + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + wbio = bio_alloc_clone(rdev->bdev, + r1_bio->behind_master_bio, + GFP_NOIO, &mddev->bio_set); + } else { + wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio, + GFP_NOIO, &mddev->bio_set); + } + + bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + wbio->bi_iter.bi_sector = r1_bio->sector; + wbio->bi_iter.bi_size = r1_bio->sectors << 9; + + bio_trim(wbio, sector - r1_bio->sector, sectors); + wbio->bi_iter.bi_sector += rdev->data_offset; + + if (submit_bio_wait(wbio) < 0) + /* failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) +{ + int m; + int s = r1_bio->sectors; + for (m = 0; m < conf->raid_disks * 2 ; m++) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + struct bio *bio = r1_bio->bios[m]; + if (bio->bi_end_io == NULL) + continue; + if (!bio->bi_status && + test_bit(R1BIO_MadeGood, &r1_bio->state)) { + rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); + } + if (bio->bi_status && + test_bit(R1BIO_WriteError, &r1_bio->state)) { + if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r1_bio); + md_done_sync(conf->mddev, s, 1); +} + +static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) +{ + int m, idx; + bool fail = false; + + for (m = 0; m < conf->raid_disks * 2 ; m++) + if (r1_bio->bios[m] == IO_MADE_GOOD) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + rdev_clear_badblocks(rdev, + r1_bio->sector, + r1_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } else if (r1_bio->bios[m] != NULL) { + /* This drive got a write error. We need to + * narrow down and record precise write + * errors. + */ + fail = true; + if (!narrow_write_error(r1_bio, m)) { + md_error(conf->mddev, + conf->mirrors[m].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + rdev_dec_pending(conf->mirrors[m].rdev, + conf->mddev); + } + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + idx = sector_to_idx(r1_bio->sector); + atomic_inc(&conf->nr_queued[idx]); + spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * get_unqueued_pending() == extra to be true. + */ + wake_up(&conf->wait_barrier); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } +} + +static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) +{ + struct mddev *mddev = conf->mddev; + struct bio *bio; + struct md_rdev *rdev; + + clear_bit(R1BIO_ReadError, &r1_bio->state); + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen + */ + + bio = r1_bio->bios[r1_bio->read_disk]; + bio_put(bio); + r1_bio->bios[r1_bio->read_disk] = NULL; + + rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (mddev->ro == 0 + && !test_bit(FailFast, &rdev->flags)) { + freeze_array(conf, 1); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, r1_bio->sectors); + unfreeze_array(conf); + } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) { + md_error(mddev, rdev); + } else { + r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; + } + + rdev_dec_pending(rdev, conf->mddev); + allow_barrier(conf, r1_bio->sector); + bio = r1_bio->master_bio; + + /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ + r1_bio->state = 0; + raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); +} + +static void raid1d(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r1bio *r1_bio; + unsigned long flags; + struct r1conf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + struct blk_plug plug; + int idx; + + md_check_recovery(mddev); + + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + list_splice_init(&conf->bio_end_io_list, &tmp); + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&tmp, struct r1bio, + retry_list); + list_del(&r1_bio->retry_list); + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); + if (mddev->degraded) + set_bit(R1BIO_Degraded, &r1_bio->state); + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } + } + + blk_start_plug(&plug); + for (;;) { + + flush_pending_writes(conf); + + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); + break; + } + r1_bio = list_entry(head->prev, struct r1bio, retry_list); + list_del(head->prev); + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r1_bio->mddev; + conf = mddev->private; + if (test_bit(R1BIO_IsSync, &r1_bio->state)) { + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_sync_write_finished(conf, r1_bio); + else + sync_request_write(mddev, r1_bio); + } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_write_finished(conf, r1_bio); + else if (test_bit(R1BIO_ReadError, &r1_bio->state)) + handle_read_error(conf, r1_bio); + else + WARN_ON_ONCE(1); + + cond_resched(); + if (mddev->sb_flags & ~(1<r1buf_pool)); + + return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, + r1buf_pool_free, conf->poolinfo); +} + +static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) +{ + struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO); + struct resync_pages *rps; + struct bio *bio; + int i; + + for (i = conf->poolinfo->raid_disks; i--; ) { + bio = r1bio->bios[i]; + rps = bio->bi_private; + bio_reset(bio, NULL, 0); + bio->bi_private = rps; + } + r1bio->master_bio = NULL; + return r1bio; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + */ + +static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + struct bio *bio; + sector_t max_sector, nr_sectors; + int disk = -1; + int i; + int wonly = -1; + int write_targets = 0, read_targets = 0; + sector_t sync_blocks; + int still_degraded = 0; + int good_sectors = RESYNC_SECTORS; + int min_bad = 0; /* number of sectors that are bad in all devices */ + int idx = sector_to_idx(sector_nr); + int page_idx = 0; + + if (!mempool_initialized(&conf->r1buf_pool)) + if (init_resync(conf)) + return 0; + + max_sector = mddev->dev_sectors; + if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chunk (there will + * only be one in raid1 resync. + * We can find the current addess in mddev->curr_resync + */ + if (mddev->curr_resync < max_sector) /* aborted */ + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + + md_bitmap_close_sync(mddev->bitmap); + close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + } + return 0; + } + + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return max_sector - sector_nr; + } + /* before building a request, check if we can skip these blocks.. + * This call the bitmap_start_sync doesn't actually record anything + */ + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* We can skip this block, and probably several more */ + *skipped = 1; + return sync_blocks; + } + + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (atomic_read(&conf->nr_waiting[idx])) + schedule_timeout_uninterruptible(1); + + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + + + if (raise_barrier(conf, sector_nr)) + return 0; + + r1_bio = raid1_alloc_init_r1buf(conf); + + rcu_read_lock(); + /* + * If we get a correctably read error during resync or recovery, + * we might want to read from a different device. So we + * flag all drives that could conceivably be read from for READ, + * and any others (which will be non-In_sync devices) for WRITE. + * If a read fails, we try reading from something else for which READ + * is OK. + */ + + r1_bio->mddev = mddev; + r1_bio->sector = sector_nr; + r1_bio->state = 0; + set_bit(R1BIO_IsSync, &r1_bio->state); + /* make sure good_sectors won't go across barrier unit boundary */ + good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct md_rdev *rdev; + bio = r1_bio->bios[i]; + + rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev == NULL || + test_bit(Faulty, &rdev->flags)) { + if (i < conf->raid_disks) + still_degraded = 1; + } else if (!test_bit(In_sync, &rdev->flags)) { + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_end_io = end_sync_write; + write_targets ++; + } else { + /* may need to read from here */ + sector_t first_bad = MaxSector; + int bad_sectors; + + if (is_badblock(rdev, sector_nr, good_sectors, + &first_bad, &bad_sectors)) { + if (first_bad > sector_nr) + good_sectors = first_bad - sector_nr; + else { + bad_sectors -= (sector_nr - first_bad); + if (min_bad == 0 || + min_bad > bad_sectors) + min_bad = bad_sectors; + } + } + if (sector_nr < first_bad) { + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; + } + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_end_io = end_sync_read; + read_targets++; + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + /* + * The device is suitable for reading (InSync), + * but has bad block(s) here. Let's try to correct them, + * if we are doing resync or repair. Otherwise, leave + * this device alone for this sync request. + */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_end_io = end_sync_write; + write_targets++; + } + } + if (rdev && bio->bi_end_io) { + atomic_inc(&rdev->nr_pending); + bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; + bio_set_dev(bio, rdev->bdev); + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + } + } + rcu_read_unlock(); + if (disk < 0) + disk = wonly; + r1_bio->read_disk = disk; + + if (read_targets == 0 && min_bad > 0) { + /* These sectors are bad on all InSync devices, so we + * need to mark them bad on all write targets + */ + int ok = 1; + for (i = 0 ; i < conf->raid_disks * 2 ; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_write) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + ok = rdev_set_badblocks(rdev, sector_nr, + min_bad, 0 + ) && ok; + } + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + *skipped = 1; + put_buf(r1_bio); + + if (!ok) { + /* Cannot record the badblocks, so need to + * abort the resync. + * If there are multiple read targets, could just + * fail the really bad ones ??? + */ + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return 0; + } else + return min_bad; + + } + if (min_bad > 0 && min_bad < good_sectors) { + /* only resync enough to reach the next bad->good + * transition */ + good_sectors = min_bad; + } + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) + /* extra read targets are also write targets */ + write_targets += read_targets-1; + + if (write_targets == 0 || read_targets == 0) { + /* There is nowhere to write, so all non-sync + * drives must be failed - so we are finished + */ + sector_t rv; + if (min_bad > 0) + max_sector = sector_nr + min_bad; + rv = max_sector - sector_nr; + *skipped = 1; + put_buf(r1_bio); + return rv; + } + + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + if (max_sector > sector_nr + good_sectors) + max_sector = sector_nr + good_sectors; + nr_sectors = 0; + sync_blocks = 0; + do { + struct page *page; + int len = PAGE_SIZE; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + if (sync_blocks == 0) { + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, still_degraded) && + !conf->fullsync && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + break; + if ((len >> 9) > sync_blocks) + len = sync_blocks<<9; + } + + for (i = 0 ; i < conf->raid_disks * 2; i++) { + struct resync_pages *rp; + + bio = r1_bio->bios[i]; + rp = get_resync_pages(bio); + if (bio->bi_end_io) { + page = resync_fetch_page(rp, page_idx); + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); + } + } + nr_sectors += len>>9; + sector_nr += len>>9; + sync_blocks -= (len>>9); + } while (++page_idx < RESYNC_PAGES); + + r1_bio->sectors = nr_sectors; + + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + + /* For a user-requested sync, we read all readable devices and do a + * compare + */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + atomic_set(&r1_bio->remaining, read_targets); + for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { + bio = r1_bio->bios[i]; + if (bio->bi_end_io == end_sync_read) { + read_targets--; + md_sync_acct_bio(bio, nr_sectors); + if (read_targets == 1) + bio->bi_opf &= ~MD_FAILFAST; + submit_bio_noacct(bio); + } + } + } else { + atomic_set(&r1_bio->remaining, 1); + bio = r1_bio->bios[r1_bio->read_disk]; + md_sync_acct_bio(bio, nr_sectors); + if (read_targets == 1) + bio->bi_opf &= ~MD_FAILFAST; + submit_bio_noacct(bio); + } + return nr_sectors; +} + +static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + if (sectors) + return sectors; + + return mddev->dev_sectors; +} + +static struct r1conf *setup_conf(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct raid1_info *disk; + struct md_rdev *rdev; + int err = -ENOMEM; + + conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); + if (!conf) + goto abort; + + conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_pending) + goto abort; + + conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_waiting) + goto abort; + + conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_queued) + goto abort; + + conf->barrier = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->barrier) + goto abort; + + conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info), + mddev->raid_disks, 2), + GFP_KERNEL); + if (!conf->mirrors) + goto abort; + + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto abort; + + conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); + if (!conf->poolinfo) + goto abort; + conf->poolinfo->raid_disks = mddev->raid_disks * 2; + err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, + rbio_pool_free, conf->poolinfo); + if (err) + goto abort; + + err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (err) + goto abort; + + conf->poolinfo->mddev = mddev; + + err = -EINVAL; + spin_lock_init(&conf->device_lock); + rdev_for_each(rdev, mddev) { + int disk_idx = rdev->raid_disk; + if (disk_idx >= mddev->raid_disks + || disk_idx < 0) + continue; + if (test_bit(Replacement, &rdev->flags)) + disk = conf->mirrors + mddev->raid_disks + disk_idx; + else + disk = conf->mirrors + disk_idx; + + if (disk->rdev) + goto abort; + disk->rdev = rdev; + disk->head_position = 0; + disk->seq_start = MaxSector; + } + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + bio_list_init(&conf->pending_bio_list); + conf->recovery_disabled = mddev->recovery_disabled - 1; + + err = -EIO; + for (i = 0; i < conf->raid_disks * 2; i++) { + + disk = conf->mirrors + i; + + if (i < conf->raid_disks && + disk[conf->raid_disks].rdev) { + /* This slot has a replacement. */ + if (!disk->rdev) { + /* No original, just make the replacement + * a recovering spare + */ + disk->rdev = + disk[conf->raid_disks].rdev; + disk[conf->raid_disks].rdev = NULL; + } else if (!test_bit(In_sync, &disk->rdev->flags)) + /* Original is not in_sync - bad */ + goto abort; + } + + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { + disk->head_position = 0; + if (disk->rdev && + (disk->rdev->saved_raid_disk < 0)) + conf->fullsync = 1; + } + } + + err = -ENOMEM; + conf->thread = md_register_thread(raid1d, mddev, "raid1"); + if (!conf->thread) + goto abort; + + return conf; + + abort: + if (conf) { + mempool_exit(&conf->r1bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); + bioset_exit(&conf->bio_split); + kfree(conf); + } + return ERR_PTR(err); +} + +static void raid1_free(struct mddev *mddev, void *priv); +static int raid1_run(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct md_rdev *rdev; + int ret; + + if (mddev->level != 1) { + pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", + mdname(mddev), mddev->level); + return -EIO; + } + if (mddev->reshape_position != MaxSector) { + pr_warn("md/raid1:%s: reshape_position set but not supported\n", + mdname(mddev)); + return -EIO; + } + if (mddev_init_writes_pending(mddev) < 0) + return -ENOMEM; + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in raid1_free()] + */ + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) + return PTR_ERR(conf); + + if (mddev->queue) + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); + + rdev_for_each(rdev, mddev) { + if (!mddev->gendisk) + continue; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + + mddev->degraded = 0; + for (i = 0; i < conf->raid_disks; i++) + if (conf->mirrors[i].rdev == NULL || + !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || + test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + mddev->degraded++; + /* + * RAID1 needs at least one disk in active + */ + if (conf->raid_disks - mddev->degraded < 1) { + md_unregister_thread(&conf->thread); + ret = -EINVAL; + goto abort; + } + + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; + + if (mddev->recovery_cp != MaxSector) + pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); + pr_info("md/raid1:%s: active with %d out of %d mirrors\n", + mdname(mddev), mddev->raid_disks - mddev->degraded, + mddev->raid_disks); + + /* + * Ok, everything is just fine now + */ + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); + + ret = md_integrity_register(mddev); + if (ret) { + md_unregister_thread(&mddev->thread); + goto abort; + } + return 0; + +abort: + raid1_free(mddev, conf); + return ret; +} + +static void raid1_free(struct mddev *mddev, void *priv) +{ + struct r1conf *conf = priv; + + mempool_exit(&conf->r1bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); + bioset_exit(&conf->bio_split); + kfree(conf); +} + +static int raid1_resize(struct mddev *mddev, sector_t sectors) +{ + /* no resync is happening, and there is enough space + * on all devices, so we can resize. + * We need to make sure resync covers any new space. + * If the array is shrinking we should possibly wait until + * any io in the removed space completes, but it hardly seems + * worth it. + */ + sector_t newsize = raid1_size(mddev, sectors, 0); + if (mddev->external_size && + mddev->array_sectors > newsize) + return -EINVAL; + if (mddev->bitmap) { + int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = sectors; + return 0; +} + +static int raid1_reshape(struct mddev *mddev) +{ + /* We need to: + * 1/ resize the r1bio_pool + * 2/ resize conf->mirrors + * + * We allocate a new r1bio_pool if we can. + * Then raise a device barrier and wait until all IO stops. + * Then resize conf->mirrors and swap in the new r1bio pool. + * + * At the same time, we "pack" the devices so that all the missing + * devices have the higher raid_disk numbers. + */ + mempool_t newpool, oldpool; + struct pool_info *newpoolinfo; + struct raid1_info *newmirrors; + struct r1conf *conf = mddev->private; + int cnt, raid_disks; + unsigned long flags; + int d, d2; + int ret; + + memset(&newpool, 0, sizeof(newpool)); + memset(&oldpool, 0, sizeof(oldpool)); + + /* Cannot change chunk_size, layout, or level */ + if (mddev->chunk_sectors != mddev->new_chunk_sectors || + mddev->layout != mddev->new_layout || + mddev->level != mddev->new_level) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + return -EINVAL; + } + + if (!mddev_is_clustered(mddev)) + md_allow_write(mddev); + + raid_disks = mddev->raid_disks + mddev->delta_disks; + + if (raid_disks < conf->raid_disks) { + cnt=0; + for (d= 0; d < conf->raid_disks; d++) + if (conf->mirrors[d].rdev) + cnt++; + if (cnt > raid_disks) + return -EBUSY; + } + + newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); + if (!newpoolinfo) + return -ENOMEM; + newpoolinfo->mddev = mddev; + newpoolinfo->raid_disks = raid_disks * 2; + + ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, + rbio_pool_free, newpoolinfo); + if (ret) { + kfree(newpoolinfo); + return ret; + } + newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), + raid_disks, 2), + GFP_KERNEL); + if (!newmirrors) { + kfree(newpoolinfo); + mempool_exit(&newpool); + return -ENOMEM; + } + + freeze_array(conf, 0); + + /* ok, everything is stopped */ + oldpool = conf->r1bio_pool; + conf->r1bio_pool = newpool; + + for (d = d2 = 0; d < conf->raid_disks; d++) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + if (rdev && rdev->raid_disk != d2) { + sysfs_unlink_rdev(mddev, rdev); + rdev->raid_disk = d2; + sysfs_unlink_rdev(mddev, rdev); + if (sysfs_link_rdev(mddev, rdev)) + pr_warn("md/raid1:%s: cannot register rd%d\n", + mdname(mddev), rdev->raid_disk); + } + if (rdev) + newmirrors[d2++].rdev = rdev; + } + kfree(conf->mirrors); + conf->mirrors = newmirrors; + kfree(conf->poolinfo); + conf->poolinfo = newpoolinfo; + + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); + conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; + + unfreeze_array(conf); + + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + mempool_exit(&oldpool); + return 0; +} + +static void raid1_quiesce(struct mddev *mddev, int quiesce) +{ + struct r1conf *conf = mddev->private; + + if (quiesce) + freeze_array(conf, 0); + else + unfreeze_array(conf); +} + +static void *raid1_takeover(struct mddev *mddev) +{ + /* raid1 can take over: + * raid5 with 2 devices, any layout or chunk size + */ + if (mddev->level == 5 && mddev->raid_disks == 2) { + struct r1conf *conf; + mddev->new_level = 1; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + conf = setup_conf(mddev); + if (!IS_ERR(conf)) { + /* Array must appear to be quiesced */ + conf->array_frozen = 1; + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); + } + return conf; + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid1_personality = +{ + .name = "raid1", + .level = 1, + .owner = THIS_MODULE, + .make_request = raid1_make_request, + .run = raid1_run, + .free = raid1_free, + .status = raid1_status, + .error_handler = raid1_error, + .hot_add_disk = raid1_add_disk, + .hot_remove_disk= raid1_remove_disk, + .spare_active = raid1_spare_active, + .sync_request = raid1_sync_request, + .resize = raid1_resize, + .size = raid1_size, + .check_reshape = raid1_reshape, + .quiesce = raid1_quiesce, + .takeover = raid1_takeover, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&raid1_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&raid1_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); +MODULE_ALIAS("md-personality-3"); /* RAID1 */ +MODULE_ALIAS("md-raid1"); +MODULE_ALIAS("md-level-1"); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 000000000..ebb678882 --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RAID1_H +#define _RAID1_H + +/* + * each barrier unit size is 64MB fow now + * note: it must be larger than RESYNC_DEPTH + */ +#define BARRIER_UNIT_SECTOR_BITS 17 +#define BARRIER_UNIT_SECTOR_SIZE (1<<17) +/* + * In struct r1conf, the following members are related to I/O barrier + * buckets, + * atomic_t *nr_pending; + * atomic_t *nr_waiting; + * atomic_t *nr_queued; + * atomic_t *barrier; + * Each of them points to array of atomic_t variables, each array is + * designed to have BARRIER_BUCKETS_NR elements and occupy a single + * memory page. The data width of atomic_t variables is 4 bytes, equal + * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined + * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of + * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly + * occupies a single memory page. + */ +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) +#define BARRIER_BUCKETS_NR (1<reconfig_mutex + * 2/ when resync/recovery is known to be happening - i.e. in code that is + * called as part of performing resync/recovery. + * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer + * and if it is non-NULL, increment rdev->nr_pending before dropping the + * RCU lock. + * When .rdev is set to NULL, the nr_pending count checked again and if it has + * been incremented, the pointer is put back in .rdev. + */ + +struct raid1_info { + struct md_rdev *rdev; + sector_t head_position; + + /* When choose the best device for a read (read_balance()) + * we try to keep sequential reads one the same device + */ + sector_t next_seq_sect; + sector_t seq_start; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. + * The 'raid_disks' here is twice the raid_disks in r1conf. + * This allows space for each 'real' device can have a replacement in the + * second half of the array. + */ + +struct pool_info { + struct mddev *mddev; + int raid_disks; +}; + +struct r1conf { + struct mddev *mddev; + struct raid1_info *mirrors; /* twice 'raid_disks' to + * allow for replacements. + */ + int raid_disks; + + spinlock_t device_lock; + + /* list of 'struct r1bio' that need to be processed by raid1d, + * whether to retry a read, writeout a resync or recovery + * block, or anything else. + */ + struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; + + /* queue pending writes to be submitted on unplug */ + struct bio_list pending_bio_list; + + /* for use when syncing mirrors: + * We don't allow both normal IO and resync/recovery IO at + * the same time - resync/recovery can only happen when there + * is no other IO. So when either is active, the other has to wait. + * See more details description in raid1.c near raise_barrier(). + */ + wait_queue_head_t wait_barrier; + spinlock_t resync_lock; + atomic_t nr_sync_pending; + atomic_t *nr_pending; + atomic_t *nr_waiting; + atomic_t *nr_queued; + atomic_t *barrier; + int array_frozen; + + /* Set to 1 if a full sync is needed, (fresh device added). + * Cleared when a sync completes. + */ + int fullsync; + + /* When the same as mddev->recovery_disabled we don't allow + * recovery to be attempted as we expect a read error. + */ + int recovery_disabled; + + /* poolinfo contains information about the content of the + * mempools - it changes when the array grows or shrinks + */ + struct pool_info *poolinfo; + mempool_t r1bio_pool; + mempool_t r1buf_pool; + + struct bio_set bio_split; + + /* temporary buffer to synchronous IO when attempting to repair + * a read error. + */ + struct page *tmppage; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; + + /* Keep track of cluster resync window to send to other + * nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; + +}; + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + unsigned long start_time; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + + /* + * When R1BIO_BehindIO is set, we store pages for write behind + * in behind_master_bio. + */ + struct bio *behind_master_bio; + + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* bits for r1bio.state */ +enum r1bio_state { + R1BIO_Uptodate, + R1BIO_IsSync, + R1BIO_Degraded, + R1BIO_BehindIO, +/* Set ReadError on bios that experience a readerror so that + * raid1d knows what to do with them. + */ + R1BIO_ReadError, +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ + R1BIO_Returned, +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag + */ + R1BIO_MadeGood, + R1BIO_WriteError, + R1BIO_FailFast, +}; + +static inline int sector_to_idx(sector_t sector) +{ + return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS, + BARRIER_BUCKETS_NR_BITS); +} +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 000000000..7b318e7e8 --- /dev/null +++ b/drivers/md/raid10.c @@ -0,0 +1,5290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c. See raid1.c for further copyright information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "raid10.h" +#include "raid0.h" +#include "md-bitmap.h" + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. + * The layout of data is defined by + * chunk_size + * raid_disks + * near_copies (stored in low byte of layout) + * far_copies (stored in second byte of layout) + * far_offset (stored in bit 16 of layout ) + * use_far_sets (stored in bit 17 of layout ) + * use_far_sets_bugfixed (stored in bit 18 of layout ) + * + * The data to be stored is divided into chunks using chunksize. Each device + * is divided into far_copies sections. In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive). The starting device for each section is offset + * near_copies from the starting device of the previous section. Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive. near_copies and far_copies must be at least one, and their product + * is at most raid_disks. + * + * If far_offset is true, then the far_copies are handled a bit differently. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true. In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size. The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array. This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + * A B C D A B C D E + * ... ... + * D A B C E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + * [A B] [C D] [A B] [C D E] + * |...| |...| |...| | ... | + * [B A] [D C] [B A] [E C D] + */ + +static void allow_barrier(struct r10conf *conf); +static void lower_barrier(struct r10conf *conf); +static int _enough(struct r10conf *conf, int previous, int ignore); +static int enough(struct r10conf *conf, int ignore); +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); +static void end_reshape_write(struct bio *bio); +static void end_reshape(struct r10conf *conf); + +#define raid10_log(md, fmt, args...) \ + do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) + +#include "raid1-10.c" + +#define NULL_CMD +#define cmd_before(conf, cmd) \ + do { \ + write_sequnlock_irq(&(conf)->resync_lock); \ + cmd; \ + } while (0) +#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) + +#define wait_event_barrier_cmd(conf, cond, cmd) \ + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ + cmd_after(conf)) + +#define wait_event_barrier(conf, cond) \ + wait_event_barrier_cmd(conf, cond, NULL_CMD) + +/* + * for resync bio, r10bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. + */ +static inline struct r10bio *get_resync_r10bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + +static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct r10conf *conf = data; + int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); + + /* allocate a r10bio with room for raid_disks entries in the + * bios array */ + return kzalloc(size, gfp_flags); +} + +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +/* amount of memory to reserve for resync requests */ +#define RESYNC_WINDOW (1024*1024) +/* maximum number of concurrent requests, memory permitting */ +#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct r10conf *conf = data; + struct r10bio *r10_bio; + struct bio *bio; + int j; + int nalloc, nalloc_rp; + struct resync_pages *rps; + + r10_bio = r10bio_pool_alloc(gfp_flags, conf); + if (!r10_bio) + return NULL; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + /* allocate once for all bios */ + if (!conf->have_replacement) + nalloc_rp = nalloc; + else + nalloc_rp = nalloc * 2; + rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags); + if (!rps) + goto out_free_r10bio; + + /* + * Allocate bios. + */ + for (j = nalloc ; j-- ; ) { + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); + if (!bio) + goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + r10_bio->devs[j].bio = bio; + if (!conf->have_replacement) + continue; + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); + if (!bio) + goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + r10_bio->devs[j].repl_bio = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them + * where needed. + */ + for (j = 0; j < nalloc; j++) { + struct bio *rbio = r10_bio->devs[j].repl_bio; + struct resync_pages *rp, *rp_repl; + + rp = &rps[j]; + if (rbio) + rp_repl = &rps[nalloc + j]; + + bio = r10_bio->devs[j].bio; + + if (!j || test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + if (resync_alloc_pages(rp, gfp_flags)) + goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } + + rp->raid_bio = r10_bio; + bio->bi_private = rp; + if (rbio) { + memcpy(rp_repl, rp, sizeof(*rp)); + rbio->bi_private = rp_repl; + } + } + + return r10_bio; + +out_free_pages: + while (--j >= 0) + resync_free_pages(&rps[j]); + + j = 0; +out_free_bio: + for ( ; j < nalloc; j++) { + if (r10_bio->devs[j].bio) + bio_uninit(r10_bio->devs[j].bio); + kfree(r10_bio->devs[j].bio); + if (r10_bio->devs[j].repl_bio) + bio_uninit(r10_bio->devs[j].repl_bio); + kfree(r10_bio->devs[j].repl_bio); + } + kfree(rps); +out_free_r10bio: + rbio_pool_free(r10_bio, conf); + return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ + struct r10conf *conf = data; + struct r10bio *r10bio = __r10_bio; + int j; + struct resync_pages *rp = NULL; + + for (j = conf->copies; j--; ) { + struct bio *bio = r10bio->devs[j].bio; + + if (bio) { + rp = get_resync_pages(bio); + resync_free_pages(rp); + bio_uninit(bio); + kfree(bio); + } + + bio = r10bio->devs[j].repl_bio; + if (bio) { + bio_uninit(bio); + kfree(bio); + } + } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); + + rbio_pool_free(r10bio, conf); +} + +static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) +{ + int i; + + for (i = 0; i < conf->geo.raid_disks; i++) { + struct bio **bio = & r10_bio->devs[i].bio; + if (!BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + bio = &r10_bio->devs[i].repl_bio; + if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + } +} + +static void free_r10bio(struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + put_all_bios(conf, r10_bio); + mempool_free(r10_bio, &conf->r10bio_pool); +} + +static void put_buf(struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + mempool_free(r10_bio, &conf->r10buf_pool); + + lower_barrier(conf); +} + +static void wake_up_barrier(struct r10conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + +static void reschedule_retry(struct r10bio *r10_bio) +{ + unsigned long flags; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r10_bio->retry_list, &conf->retry_list); + conf->nr_queued ++; + spin_unlock_irqrestore(&conf->device_lock, flags); + + /* wake up frozen array... */ + wake_up(&conf->wait_barrier); + + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(struct r10bio *r10_bio) +{ + struct bio *bio = r10_bio->master_bio; + struct r10conf *conf = r10_bio->mddev->private; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + + if (r10_bio->start_time) + bio_end_io_acct(bio, r10_bio->start_time); + bio_endio(bio); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + + free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int slot, struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + conf->mirrors[r10_bio->devs[slot].devnum].head_position = + r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, + struct bio *bio, int *slotp, int *replp) +{ + int slot; + int repl = 0; + + for (slot = 0; slot < conf->geo.raid_disks; slot++) { + if (r10_bio->devs[slot].bio == bio) + break; + if (r10_bio->devs[slot].repl_bio == bio) { + repl = 1; + break; + } + } + + update_head_pos(slot, r10_bio); + + if (slotp) + *slotp = slot; + if (replp) + *replp = repl; + return r10_bio->devs[slot].devnum; +} + +static void raid10_end_read_request(struct bio *bio) +{ + int uptodate = !bio->bi_status; + struct r10bio *r10_bio = bio->bi_private; + int slot; + struct md_rdev *rdev; + struct r10conf *conf = r10_bio->mddev->private; + + slot = r10_bio->read_slot; + rdev = r10_bio->devs[slot].rdev; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + update_head_pos(slot, r10_bio); + + if (uptodate) { + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + } else { + /* If all other devices that store this block have + * failed, we want to return the error upwards rather + * than fail the last device. Here we redefine + * "uptodate" to mean "Don't want to retry" + */ + if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), + rdev->raid_disk)) + uptodate = 1; + } + if (uptodate) { + raid_end_bio_io(r10_bio); + rdev_dec_pending(rdev, conf->mddev); + } else { + /* + * oops, read error - keep the refcount on the rdev + */ + pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n", + mdname(conf->mddev), + rdev->bdev, + (unsigned long long)r10_bio->sector); + set_bit(R10BIO_ReadError, &r10_bio->state); + reschedule_retry(r10_bio); + } +} + +static void close_write(struct r10bio *r10_bio) +{ + /* clear the bitmap if all writes complete successfully */ + md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); +} + +static void one_write_done(struct r10bio *r10_bio) +{ + if (atomic_dec_and_test(&r10_bio->remaining)) { + if (test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else { + close_write(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); + } + } +} + +static void raid10_end_write_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + int dev; + int dec_rdev = 1; + struct r10conf *conf = r10_bio->mddev->private; + int slot, repl; + struct md_rdev *rdev = NULL; + struct bio *to_put = NULL; + bool discard_error; + + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + smp_rmb(); + repl = 0; + rdev = conf->mirrors[dev].rdev; + } + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (bio->bi_status && !discard_error) { + if (repl) + /* Never record new bad blocks to replacement, + * just fail it. + */ + md_error(rdev->mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + + dec_rdev = 0; + if (test_bit(FailFast, &rdev->flags) && + (bio->bi_opf & MD_FAILFAST)) { + md_error(rdev->mddev, rdev); + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + */ + if (!test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_WriteError, &r10_bio->state); + else { + /* Fail the request */ + set_bit(R10BIO_Degraded, &r10_bio->state); + r10_bio->devs[slot].bio = NULL; + to_put = bio; + dec_rdev = 1; + } + } + } else { + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + sector_t first_bad; + int bad_sectors; + + /* + * Do not set R10BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + /* Maybe we can clear some bad blocks. */ + if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors) && !discard_error) { + bio_put(bio); + if (repl) + r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; + else + r10_bio->devs[slot].bio = IO_MADE_GOOD; + dec_rdev = 0; + set_bit(R10BIO_MadeGood, &r10_bio->state); + } + } + + /* + * + * Let's see if all mirrored write operations have finished + * already. + */ + one_write_done(r10_bio); + if (dec_rdev) + rdev_dec_pending(rdev, conf->mddev); + if (to_put) + bio_put(to_put); +} + +/* + * RAID10 layout manager + * As well as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. + * + * Chunks are laid out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. + * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. + * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) +{ + int n,f; + sector_t sector; + sector_t chunk; + sector_t stripe; + int dev; + int slot = 0; + int last_far_set_start, last_far_set_size; + + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + last_far_set_size = geo->far_set_size; + last_far_set_size += (geo->raid_disks % geo->far_set_size); + + /* now calculate first sector/dev */ + chunk = r10bio->sector >> geo->chunk_shift; + sector = r10bio->sector & geo->chunk_mask; + + chunk *= geo->near_copies; + stripe = chunk; + dev = sector_div(stripe, geo->raid_disks); + if (geo->far_offset) + stripe *= geo->far_copies; + + sector += stripe << geo->chunk_shift; + + /* and calculate all the others */ + for (n = 0; n < geo->near_copies; n++) { + int d = dev; + int set; + sector_t s = sector; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + + for (f = 1; f < geo->far_copies; f++) { + set = d / geo->far_set_size; + d += geo->near_copies; + + if ((geo->raid_disks % geo->far_set_size) && + (d > last_far_set_start)) { + d -= last_far_set_start; + d %= last_far_set_size; + d += last_far_set_start; + } else { + d %= geo->far_set_size; + d += geo->far_set_size * set; + } + s += geo->stride; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + } + dev++; + if (dev >= geo->raid_disks) { + dev = 0; + sector += (geo->chunk_mask + 1); + } + } +} + +static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) +{ + struct geom *geo = &conf->geo; + + if (conf->reshape_progress != MaxSector && + ((r10bio->sector >= conf->reshape_progress) != + conf->mddev->reshape_backwards)) { + set_bit(R10BIO_Previous, &r10bio->state); + geo = &conf->prev; + } else + clear_bit(R10BIO_Previous, &r10bio->state); + + __raid10_find_phys(geo, r10bio); +} + +static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) +{ + sector_t offset, chunk, vchunk; + /* Never use conf->prev as this is only called during resync + * or recovery, so reshape isn't happening + */ + struct geom *geo = &conf->geo; + int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; + int far_set_size = geo->far_set_size; + int last_far_set_start; + + if (geo->raid_disks % geo->far_set_size) { + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + if (dev >= last_far_set_start) { + far_set_size = geo->far_set_size; + far_set_size += (geo->raid_disks % geo->far_set_size); + far_set_start = last_far_set_start; + } + } + + offset = sector & geo->chunk_mask; + if (geo->far_offset) { + int fc; + chunk = sector >> geo->chunk_shift; + fc = sector_div(chunk, geo->far_copies); + dev -= fc * geo->near_copies; + if (dev < far_set_start) + dev += far_set_size; + } else { + while (sector >= geo->stride) { + sector -= geo->stride; + if (dev < (geo->near_copies + far_set_start)) + dev += far_set_size - geo->near_copies; + else + dev -= geo->near_copies; + } + chunk = sector >> geo->chunk_shift; + } + vchunk = chunk * geo->raid_disks + dev; + sector_div(vchunk, geo->near_copies); + return (vchunk << geo->chunk_shift) + offset; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static struct md_rdev *read_balance(struct r10conf *conf, + struct r10bio *r10_bio, + int *max_sectors) +{ + const sector_t this_sector = r10_bio->sector; + int disk, slot; + int sectors = r10_bio->sectors; + int best_good_sectors; + sector_t new_distance, best_dist; + struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; + int do_balance; + int best_dist_slot, best_pending_slot; + bool has_nonrot_disk = false; + unsigned int min_pending; + struct geom *geo = &conf->geo; + + raid10_find_phys(conf, r10_bio); + rcu_read_lock(); + best_dist_slot = -1; + min_pending = UINT_MAX; + best_dist_rdev = NULL; + best_pending_rdev = NULL; + best_dist = MaxSector; + best_good_sectors = 0; + do_balance = 1; + clear_bit(R10BIO_FailFast, &r10_bio->state); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on (recovery is ok), or below + * the resync window. We take the first readable disk when + * above the resync window. + */ + if ((conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) + do_balance = 0; + + for (slot = 0; slot < conf->copies ; slot++) { + sector_t first_bad; + int bad_sectors; + sector_t dev_sector; + unsigned int pending; + bool nonrot; + + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; + disk = r10_bio->devs[slot].devnum; + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + r10_bio->devs[slot].addr + sectors > + rdev->recovery_offset) { + /* + * Read replacement first to prevent reading both rdev + * and replacement as NULL during replacement replace + * rdev. + */ + smp_mb(); + rdev = rcu_dereference(conf->mirrors[disk].rdev); + } + if (rdev == NULL || + test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + continue; + + dev_sector = r10_bio->devs[slot].addr; + if (is_badblock(rdev, dev_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* Already have a better slot */ + continue; + if (first_bad <= dev_sector) { + /* Cannot read here. If this is the + * 'primary' device, then we must not read + * beyond 'bad_sectors' from another device. + */ + bad_sectors -= (dev_sector - first_bad); + if (!do_balance && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + } else { + sector_t good_sectors = + first_bad - dev_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_dist_slot = slot; + best_dist_rdev = rdev; + } + if (!do_balance) + /* Must read from here */ + break; + } + continue; + } else + best_good_sectors = sectors; + + if (!do_balance) + break; + + nonrot = bdev_nonrot(rdev->bdev); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + if (min_pending > pending && nonrot) { + min_pending = pending; + best_pending_slot = slot; + best_pending_rdev = rdev; + } + + if (best_dist_slot >= 0) + /* At least 2 disks to choose from so failfast is OK */ + set_bit(R10BIO_FailFast, &r10_bio->state); + /* This optimisation is debatable, and completely destroys + * sequential read speed for 'far copies' arrays. So only + * keep it for 'near' arrays, and review those later. + */ + if (geo->near_copies > 1 && !pending) + new_distance = 0; + + /* for far > 1 always use the lowest address */ + else if (geo->far_copies > 1) + new_distance = r10_bio->devs[slot].addr; + else + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + + if (new_distance < best_dist) { + best_dist = new_distance; + best_dist_slot = slot; + best_dist_rdev = rdev; + } + } + if (slot >= conf->copies) { + if (has_nonrot_disk) { + slot = best_pending_slot; + rdev = best_pending_rdev; + } else { + slot = best_dist_slot; + rdev = best_dist_rdev; + } + } + + if (slot >= 0) { + atomic_inc(&rdev->nr_pending); + r10_bio->read_slot = slot; + } else + rdev = NULL; + rcu_read_unlock(); + *max_sectors = best_good_sectors; + + return rdev; +} + +static void flush_pending_writes(struct r10conf *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + */ + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct blk_plug plug; + struct bio *bio; + + bio = bio_list_get(&conf->pending_bio_list); + spin_unlock_irq(&conf->device_lock); + + /* + * As this is called in a wait_event() loop (see freeze_array), + * current->state might be TASK_UNINTERRUPTIBLE which will + * cause a warning when we prepare to wait again. As it is + * rare that this path is taken, it is perfectly safe to force + * us to go around the wait_event() loop again, so the warning + * is a false-positive. Silence the warning by resetting + * thread state + */ + __set_current_state(TASK_RUNNING); + + blk_start_plug(&plug); + /* flush any pending bitmap writes to disk + * before proceeding w/ I/O */ + md_bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + + raid1_submit_write(bio); + bio = next; + } + blk_finish_plug(&plug); + } else + spin_unlock_irq(&conf->device_lock); +} + +/* Barriers.... + * Sometimes we need to suspend IO while we do something else, + * either some resync/recovery, or reconfigure the array. + * To do this we raise a 'barrier'. + * The 'barrier' is a counter that can be raised multiple times + * to count how many activities are happening which preclude + * normal IO. + * We can only raise the barrier if there is no pending IO. + * i.e. if nr_pending == 0. + * We choose only to raise the barrier if no-one is waiting for the + * barrier to go down. This means that as soon as an IO request + * is ready, no other operations which require a barrier will start + * until the IO request has had a chance. + * + * So: regular IO calls 'wait_barrier'. When that returns there + * is no backgroup IO happening, It must arrange to call + * allow_barrier when it has finished its IO. + * backgroup IO calls must call raise_barrier. Once that returns + * there is no normal IO happeing. It must arrange to call + * lower_barrier when the particular background IO completes. + */ + +static void raise_barrier(struct r10conf *conf, int force) +{ + write_seqlock_irq(&conf->resync_lock); + BUG_ON(force && !conf->barrier); + + /* Wait until no block IO is waiting (unless 'force') */ + wait_event_barrier(conf, force || !conf->nr_waiting); + + /* block any new IO from starting */ + WRITE_ONCE(conf->barrier, conf->barrier + 1); + + /* Now wait for all pending IO to complete */ + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && + conf->barrier < RESYNC_DEPTH); + + write_sequnlock_irq(&conf->resync_lock); +} + +static void lower_barrier(struct r10conf *conf) +{ + unsigned long flags; + + write_seqlock_irqsave(&conf->resync_lock, flags); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + write_sequnlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static bool stop_waiting_barrier(struct r10conf *conf) +{ + struct bio_list *bio_list = current->bio_list; + + /* barrier is dropped */ + if (!conf->barrier) + return true; + + /* + * If there are already pending requests (preventing the barrier from + * rising completely), and the pre-process bio queue isn't empty, then + * don't wait, as we need to empty that queue to get the nr_pending + * count down. + */ + if (atomic_read(&conf->nr_pending) && bio_list && + (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) + return true; + + /* + * move on if io is issued from raid10d(), nr_pending is not released + * from original io(see handle_read_error()). All raise barrier is + * blocked until this io is done. + */ + if (conf->mddev->thread->tsk == current) { + WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); + return true; + } + + return false; +} + +static bool wait_barrier_nolock(struct r10conf *conf) +{ + unsigned int seq = read_seqbegin(&conf->resync_lock); + + if (READ_ONCE(conf->barrier)) + return false; + + atomic_inc(&conf->nr_pending); + if (!read_seqretry(&conf->resync_lock, seq)) + return true; + + if (atomic_dec_and_test(&conf->nr_pending)) + wake_up_barrier(conf); + + return false; +} + +static bool wait_barrier(struct r10conf *conf, bool nowait) +{ + bool ret = true; + + if (wait_barrier_nolock(conf)) + return true; + + write_seqlock_irq(&conf->resync_lock); + if (conf->barrier) { + /* Return false when nowait flag is set */ + if (nowait) { + ret = false; + } else { + conf->nr_waiting++; + raid10_log(conf->mddev, "wait barrier"); + wait_event_barrier(conf, stop_waiting_barrier(conf)); + conf->nr_waiting--; + } + if (!conf->nr_waiting) + wake_up(&conf->wait_barrier); + } + /* Only increment nr_pending when we wait */ + if (ret) + atomic_inc(&conf->nr_pending); + write_sequnlock_irq(&conf->resync_lock); + return ret; +} + +static void allow_barrier(struct r10conf *conf) +{ + if ((atomic_dec_and_test(&conf->nr_pending)) || + (conf->array_freeze_pending)) + wake_up_barrier(conf); +} + +static void freeze_array(struct r10conf *conf, int extra) +{ + /* stop syncio and normal IO and wait for everything to + * go quiet. + * We increment barrier and nr_waiting, and then + * wait until nr_pending match nr_queued+extra + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (extra) + * must match the number of pending IOs (nr_pending) before + * we continue. + */ + write_seqlock_irq(&conf->resync_lock); + conf->array_freeze_pending++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); + conf->nr_waiting++; + wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) == + conf->nr_queued + extra, flush_pending_writes(conf)); + conf->array_freeze_pending--; + write_sequnlock_irq(&conf->resync_lock); +} + +static void unfreeze_array(struct r10conf *conf) +{ + /* reverse the effect of the freeze */ + write_seqlock_irq(&conf->resync_lock); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + write_sequnlock_irq(&conf->resync_lock); +} + +static sector_t choose_data_offset(struct r10bio *r10_bio, + struct md_rdev *rdev) +{ + if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || + test_bit(R10BIO_Previous, &r10_bio->state)) + return rdev->data_offset; + else + return rdev->new_data_offset; +} + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule || current->bio_list) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + md_bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + + raid1_submit_write(bio); + bio = next; + } + kfree(plug); +} + +/* + * 1. Register the new request and wait if the reconstruction thread has put + * up a bar for new requests. Continue immediately if no resync is active + * currently. + * 2. If IO spans the reshape position. Need to wait for reshape to pass. + */ +static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, + struct bio *bio, sector_t sectors) +{ + /* Bail out if REQ_NOWAIT is set for the bio */ + if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return false; + } + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { + allow_barrier(conf); + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return false; + } + raid10_log(conf->mddev, "wait reshape"); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); + wait_barrier(conf, false); + } + return true; +} + +static void raid10_read_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + struct bio *read_bio; + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; + int max_sectors; + struct md_rdev *rdev; + char b[BDEVNAME_SIZE]; + int slot = r10_bio->read_slot; + struct md_rdev *err_rdev = NULL; + gfp_t gfp = GFP_NOIO; + + if (slot >= 0 && r10_bio->devs[slot].rdev) { + /* + * This is an error retry, but we cannot + * safely dereference the rdev in the r10_bio, + * we must use the one in conf. + * If it has already been disconnected (unlikely) + * we lose the device name in error messages. + */ + int disk; + /* + * As we are blocking raid10, it is a little safer to + * use __GFP_HIGH. + */ + gfp = GFP_NOIO | __GFP_HIGH; + + rcu_read_lock(); + disk = r10_bio->devs[slot].devnum; + err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (err_rdev) + snprintf(b, sizeof(b), "%pg", err_rdev->bdev); + else { + strcpy(b, "???"); + /* This never gets dereferenced */ + err_rdev = r10_bio->devs[slot].rdev; + } + rcu_read_unlock(); + } + + if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) + return; + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { + if (err_rdev) { + pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + } + raid_end_bio_io(r10_bio); + return; + } + if (err_rdev) + pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", + mdname(mddev), + rdev->bdev, + (unsigned long long)r10_bio->sector); + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + gfp, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + submit_bio_noacct(bio); + wait_barrier(conf, false); + bio = split; + r10_bio->master_bio = bio; + r10_bio->sectors = max_sectors; + } + slot = r10_bio->read_slot; + + if (!r10_bio->start_time && + blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); + read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); + + r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; + + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + + choose_data_offset(r10_bio, rdev); + read_bio->bi_end_io = raid10_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &rdev->flags) && + test_bit(R10BIO_FailFast, &r10_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r10_bio; + + if (mddev->gendisk) + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r10_bio->sector); + submit_bio_noacct(read_bio); + return; +} + +static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, + struct bio *bio, bool replacement, + int n_copy) +{ + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; + const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; + unsigned long flags; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int devnum = r10_bio->devs[n_copy].devnum; + struct bio *mbio; + + if (replacement) { + rdev = conf->mirrors[devnum].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[devnum].rdev; + } + } else + rdev = conf->mirrors[devnum].rdev; + + mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); + if (replacement) + r10_bio->devs[n_copy].repl_bio = mbio; + else + r10_bio->devs[n_copy].bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + + choose_data_offset(r10_bio, rdev)); + mbio->bi_end_io = raid10_end_write_request; + bio_set_op_attrs(mbio, op, do_sync | do_fua); + if (!replacement && test_bit(FailFast, + &conf->mirrors[devnum].rdev->flags) + && enough(conf, devnum)) + mbio->bi_opf |= MD_FAILFAST; + mbio->bi_private = r10_bio; + + if (conf->mddev->gendisk) + trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void *)rdev; + + atomic_inc(&r10_bio->remaining); + + if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug)) { + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(mddev->thread); + } +} + +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + +static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) +{ + int i; + struct r10conf *conf = mddev->private; + struct md_rdev *blocked_rdev; + +retry_wait: + blocked_rdev = NULL; + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + /* + * Discard request doesn't care the write result + * so it doesn't need to wait blocked disk here. + */ + if (!r10_bio->sectors) + continue; + + is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* + * Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + allow_barrier(conf); + raid10_log(conf->mddev, "%s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf, false); + goto retry_wait; + } +} + +static void raid10_write_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int i; + sector_t sectors; + int max_sectors; + + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + + sectors = r10_bio->sectors; + if (!regular_request_wait(mddev, conf, bio, sectors)) + return; + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + (mddev->reshape_backwards + ? (bio->bi_iter.bi_sector < conf->reshape_safe && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) + : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && + bio->bi_iter.bi_sector < conf->reshape_progress))) { + /* Need to update reshape_position in metadata */ + mddev->reshape_position = conf->reshape_progress; + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(mddev->thread); + if (bio->bi_opf & REQ_NOWAIT) { + allow_barrier(conf); + bio_wouldblock_error(bio); + return; + } + raid10_log(conf->mddev, "wait reshape metadata"); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + + conf->reshape_safe = mddev->reshape_position; + } + + /* first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + * If there are known/acknowledged bad blocks on any device + * on which we have seen a write error, we want to avoid + * writing to those blocks. This potentially requires several + * writes to write around the bad blocks. Each set of writes + * gets its own r10_bio with a set of bios attached. + */ + + r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ + raid10_find_phys(conf, r10_bio); + + wait_blocked_dev(mddev, r10_bio); + + rcu_read_lock(); + max_sectors = r10_bio->sectors; + + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + + r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].repl_bio = NULL; + + if (!rdev && !rrdev) { + set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, dev_sector, max_sectors, + &first_bad, &bad_sectors); + if (is_bad && first_bad <= dev_sector) { + /* Cannot write here at all */ + bad_sectors -= (dev_sector - first_bad); + if (bad_sectors < max_sectors) + /* Mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + /* We don't set R10BIO_Degraded as that + * only applies if the disk is missing, + * so it might be re-added, and we want to + * know to recover this chunk. + * In this case the device is here, and the + * fact that this chunk is not in-sync is + * recorded in the bad block log. + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - dev_sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[i].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + if (max_sectors < r10_bio->sectors) + r10_bio->sectors = max_sectors; + + if (r10_bio->sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, r10_bio->sectors, + GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + submit_bio_noacct(bio); + wait_barrier(conf, false); + bio = split; + r10_bio->master_bio = bio; + } + + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); + atomic_set(&r10_bio->remaining, 1); + md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + + for (i = 0; i < conf->copies; i++) { + if (r10_bio->devs[i].bio) + raid10_write_one_disk(mddev, r10_bio, bio, false, i); + if (r10_bio->devs[i].repl_bio) + raid10_write_one_disk(mddev, r10_bio, bio, true, i); + } + one_write_done(r10_bio); +} + +static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = sectors; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector; + r10_bio->state = 0; + r10_bio->read_slot = -1; + r10_bio->start_time = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * + conf->geo.raid_disks); + + if (bio_data_dir(bio) == READ) + raid10_read_request(mddev, bio, r10_bio); + else + raid10_write_request(mddev, bio, r10_bio); +} + +static void raid_end_discard_bio(struct r10bio *r10bio) +{ + struct r10conf *conf = r10bio->mddev->private; + struct r10bio *first_r10bio; + + while (atomic_dec_and_test(&r10bio->remaining)) { + + allow_barrier(conf); + + if (!test_bit(R10BIO_Discard, &r10bio->state)) { + first_r10bio = (struct r10bio *)r10bio->master_bio; + free_r10bio(r10bio); + r10bio = first_r10bio; + } else { + md_write_end(r10bio->mddev); + bio_endio(r10bio->master_bio); + free_r10bio(r10bio); + break; + } + } +} + +static void raid10_end_discard_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + struct md_rdev *rdev = NULL; + int dev; + int slot, repl; + + /* + * We don't care the return value of discard bio + */ + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + /* + * raid10_remove_disk uses smp_mb to make sure rdev is set to + * replacement before setting replacement to NULL. It can read + * rdev first without barrier protect even replacment is NULL + */ + smp_rmb(); + rdev = conf->mirrors[dev].rdev; + } + + raid_end_discard_bio(r10_bio); + rdev_dec_pending(rdev, conf->mddev); +} + +/* + * There are some limitations to handle discard bio + * 1st, the discard size is bigger than stripe_size*2. + * 2st, if the discard bio spans reshape progress, we use the old way to + * handle discard bio + */ +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct geom *geo = &conf->geo; + int far_copies = geo->far_copies; + bool first_copy = true; + struct r10bio *r10_bio, *first_r10bio; + struct bio *split; + int disk; + sector_t chunk; + unsigned int stripe_size; + unsigned int stripe_data_disks; + sector_t split_size; + sector_t bio_start, bio_end; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int remainder; + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return -EAGAIN; + + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return 0; + } + wait_barrier(conf, false); + + /* + * Check reshape again to avoid reshape happens after checking + * MD_RECOVERY_RESHAPE and before wait_barrier + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + goto out; + + if (geo->near_copies) + stripe_data_disks = geo->raid_disks / geo->near_copies + + geo->raid_disks % geo->near_copies; + else + stripe_data_disks = geo->raid_disks; + + stripe_size = stripe_data_disks << geo->chunk_shift; + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Maybe one discard bio is smaller than strip size or across one + * stripe and discard region is larger than one stripe size. For far + * offset layout, if the discard region is not aligned with stripe + * size, there is hole when we submit discard bio to member disk. + * For simplicity, we only handle discard bio which discard region + * is bigger than stripe_size * 2 + */ + if (bio_sectors(bio) < stripe_size*2) + goto out; + + /* + * Keep bio aligned with strip size. + */ + div_u64_rem(bio_start, stripe_size, &remainder); + if (remainder) { + split_size = stripe_size - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the fist split part */ + submit_bio_noacct(split); + wait_barrier(conf, false); + } + div_u64_rem(bio_end, stripe_size, &remainder); + if (remainder) { + split_size = bio_sectors(bio) - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the second split part */ + submit_bio_noacct(bio); + bio = split; + wait_barrier(conf, false); + } + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Raid10 uses chunk as the unit to store data. It's similar like raid0. + * One stripe contains the chunks from all member disk (one chunk from + * one disk at the same HBA address). For layout detail, see 'man md 4' + */ + chunk = bio_start >> geo->chunk_shift; + chunk *= geo->near_copies; + first_stripe_index = chunk; + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); + if (geo->far_offset) + first_stripe_index *= geo->far_copies; + start_disk_offset = (bio_start & geo->chunk_mask) + + (first_stripe_index << geo->chunk_shift); + + chunk = bio_end >> geo->chunk_shift; + chunk *= geo->near_copies; + last_stripe_index = chunk; + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); + if (geo->far_offset) + last_stripe_index *= geo->far_copies; + end_disk_offset = (bio_end & geo->chunk_mask) + + (last_stripe_index << geo->chunk_shift); + +retry_discard: + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + wait_blocked_dev(mddev, r10_bio); + + /* + * For far layout it needs more than one r10bio to cover all regions. + * Inspired by raid10_sync_request, we can use the first r10bio->master_bio + * to record the discard bio. Other r10bio->master_bio record the first + * r10bio. The first r10bio only release after all other r10bios finish. + * The discard bio returns only first r10bio finishes + */ + if (first_copy) { + r10_bio->master_bio = bio; + set_bit(R10BIO_Discard, &r10_bio->state); + first_copy = false; + first_r10bio = r10_bio; + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + + /* + * first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); + r10_bio->devs[disk].bio = NULL; + r10_bio->devs[disk].repl_bio = NULL; + + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + if (!rdev && !rrdev) + continue; + + if (rdev) { + r10_bio->devs[disk].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[disk].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; + + /* + * Now start to calculate the start and end address for each disk. + * The space between dev_start and dev_end is the discard region. + * + * For dev_start, it needs to consider three conditions: + * 1st, the disk is before start_disk, you can imagine the disk in + * the next stripe. So the dev_start is the start address of next + * stripe. + * 2st, the disk is after start_disk, it means the disk is at the + * same stripe of first disk + * 3st, the first disk itself, we can use start_disk_offset directly + */ + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + /* + * It only handles discard bio which size is >= stripe size, so + * dev_end > dev_start all the time. + * It doesn't need to use rcu lock to get rdev here. We already + * add rdev->nr_pending in the first loop. + */ + if (r10_bio->devs[disk].bio) { + struct md_rdev *rdev = conf->mirrors[disk].rdev; + mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; + r10_bio->devs[disk].bio = mbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rdev, mbio, + dev_start + choose_data_offset(r10_bio, rdev), + dev_end - dev_start); + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { + struct md_rdev *rrdev = conf->mirrors[disk].replacement; + rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; + r10_bio->devs[disk].repl_bio = rbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rrdev, rbio, + dev_start + choose_data_offset(r10_bio, rrdev), + dev_end - dev_start); + bio_endio(rbio); + } + } + + if (!geo->far_offset && --far_copies) { + first_stripe_index += geo->stride >> geo->chunk_shift; + start_disk_offset += geo->stride; + last_stripe_index += geo->stride >> geo->chunk_shift; + end_disk_offset += geo->stride; + atomic_inc(&first_r10bio->remaining); + raid_end_discard_bio(r10_bio); + wait_barrier(conf, false); + goto retry_discard; + } + + raid_end_discard_bio(r10_bio); + + return 0; +out: + allow_barrier(conf); + return -EAGAIN; +} + +static bool raid10_make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); + int chunk_sects = chunk_mask + 1; + int sectors = bio_sectors(bio); + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + if (!md_write_start(mddev, bio)) + return false; + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + if (!raid10_handle_discard(mddev, bio)) + return true; + + /* + * If this request crosses a chunk boundary, we need to split + * it. + */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + sectors > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) + sectors = chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)); + __make_request(mddev, bio, sectors); + + /* In case raid10d snuck in to freeze_array */ + wake_up_barrier(conf); + return true; +} + +static void raid10_status(struct seq_file *seq, struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + int i; + + if (conf->geo.near_copies < conf->geo.raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); + if (conf->geo.near_copies > 1) + seq_printf(seq, " %d near-copies", conf->geo.near_copies); + if (conf->geo.far_copies > 1) { + if (conf->geo.far_offset) + seq_printf(seq, " %d offset-copies", conf->geo.far_copies); + else + seq_printf(seq, " %d far-copies", conf->geo.far_copies); + if (conf->geo.far_set_size != conf->geo.raid_disks) + seq_printf(seq, " %d devices per set", conf->geo.far_set_size); + } + seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, + conf->geo.raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_printf(seq, "]"); +} + +/* check if there are enough drives for + * every block to appear on atleast one. + * Don't consider the device numbered 'ignore' + * as we might be about to remove it. + */ +static int _enough(struct r10conf *conf, int previous, int ignore) +{ + int first = 0; + int has_enough = 0; + int disks, ncopies; + if (previous) { + disks = conf->prev.raid_disks; + ncopies = conf->prev.near_copies; + } else { + disks = conf->geo.raid_disks; + ncopies = conf->geo.near_copies; + } + + rcu_read_lock(); + do { + int n = conf->copies; + int cnt = 0; + int this = first; + while (n--) { + struct md_rdev *rdev; + if (this != ignore && + (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + test_bit(In_sync, &rdev->flags)) + cnt++; + this = (this+1) % disks; + } + if (cnt == 0) + goto out; + first = (first + ncopies) % disks; + } while (first != 0); + has_enough = 1; +out: + rcu_read_unlock(); + return has_enough; +} + +static int enough(struct r10conf *conf, int ignore) +{ + /* when calling 'enough', both 'prev' and 'geo' must + * be stable. + * This is ensured if ->reconfig_mutex or ->device_lock + * is held. + */ + return _enough(conf, 0, ignore) && + _enough(conf, 1, ignore); +} + +/** + * raid10_error() - RAID10 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + * + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ +static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r10conf *conf = mddev->private; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } + } + if (test_and_clear_bit(In_sync, &rdev->flags)) + mddev->degraded++; + + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(Blocked, &rdev->flags); + set_bit(Faulty, &rdev->flags); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + spin_unlock_irqrestore(&conf->device_lock, flags); + pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), rdev->bdev, + mdname(mddev), conf->geo.raid_disks - mddev->degraded); +} + +static void print_conf(struct r10conf *conf) +{ + int i; + struct md_rdev *rdev; + + pr_debug("RAID10 conf printout:\n"); + if (!conf) { + pr_debug("(!conf)\n"); + return; + } + pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, + conf->geo.raid_disks); + + /* This is only called with ->reconfix_mutex held, so + * rcu protection of rdev is not needed */ + for (i = 0; i < conf->geo.raid_disks; i++) { + rdev = conf->mirrors[i].rdev; + if (rdev) + pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + rdev->bdev); + } +} + +static void close_sync(struct r10conf *conf) +{ + wait_barrier(conf, false); + allow_barrier(conf); + + mempool_exit(&conf->r10buf_pool); +} + +static int raid10_spare_active(struct mddev *mddev) +{ + int i; + struct r10conf *conf = mddev->private; + struct raid10_info *tmp; + int count = 0; + unsigned long flags; + + /* + * Find all non-in_sync disks within the RAID10 configuration + * and mark them in_sync + */ + for (i = 0; i < conf->geo.raid_disks; i++) { + tmp = conf->mirrors + i; + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. + */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + count++; + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); + } + } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); + + print_conf(conf); + return count; +} + +static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r10conf *conf = mddev->private; + int err = -EEXIST; + int mirror; + int first = 0; + int last = conf->geo.raid_disks - 1; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is + * very different from resync + */ + return -EBUSY; + if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) + return -EINVAL; + + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + if (rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->geo.raid_disks && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else + mirror = first; + for ( ; mirror <= last ; mirror++) { + struct raid10_info *p = &conf->mirrors[mirror]; + if (p->recovery_disabled == mddev->recovery_disabled) + continue; + if (p->rdev) { + if (!test_bit(WantReplacement, &p->rdev->flags) || + p->replacement != NULL) + continue; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } + + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + p->head_position = 0; + p->recovery_disabled = mddev->recovery_disabled - 1; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + + print_conf(conf); + return err; +} + +static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r10conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct md_rdev **rdevp; + struct raid10_info *p; + + print_conf(conf); + if (unlikely(number >= mddev->raid_disks)) + return 0; + p = conf->mirrors + number; + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != p->recovery_disabled && + (!p->replacement || p->replacement == rdev) && + number < conf->geo.raid_disks && + enough(conf, -1)) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither -- if they are careful. + */ + p->replacement = NULL; + } + + clear_bit(WantReplacement, &rdev->flags); + err = md_integrity_register(mddev); + +abort: + + print_conf(conf); + return err; +} + +static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) +{ + struct r10conf *conf = r10_bio->mddev->private; + + if (!bio->bi_status) + set_bit(R10BIO_Uptodate, &r10_bio->state); + else + /* The write handler will notice the lack of + * R10BIO_Uptodate and record any errors etc + */ + atomic_add(r10_bio->sectors, + &conf->mirrors[d].rdev->corrected_errors); + + /* for reconstruct, we always reschedule after a read. + * for resync, only after all reads + */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); + if (test_bit(R10BIO_IsRecover, &r10_bio->state) || + atomic_dec_and_test(&r10_bio->remaining)) { + /* we have read all the blocks, + * do the comparison in process context in raid10d + */ + reschedule_retry(r10_bio); + } +} + +static void end_sync_read(struct bio *bio) +{ + struct r10bio *r10_bio = get_resync_r10bio(bio); + struct r10conf *conf = r10_bio->mddev->private; + int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + + __end_sync_read(r10_bio, bio, d); +} + +static void end_reshape_read(struct bio *bio) +{ + /* reshape read bio isn't allocated from r10buf_pool */ + struct r10bio *r10_bio = bio->bi_private; + + __end_sync_read(r10_bio, bio, r10_bio->read_slot); +} + +static void end_sync_request(struct r10bio *r10_bio) +{ + struct mddev *mddev = r10_bio->mddev; + + while (atomic_dec_and_test(&r10_bio->remaining)) { + if (r10_bio->master_bio == NULL) { + /* the primary of several recovery bios */ + sector_t s = r10_bio->sectors; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); + md_done_sync(mddev, s, 1); + break; + } else { + struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); + r10_bio = r10_bio2; + } + } +} + +static void end_sync_write(struct bio *bio) +{ + struct r10bio *r10_bio = get_resync_r10bio(bio); + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + sector_t first_bad; + int bad_sectors; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + else + rdev = conf->mirrors[d].rdev; + + if (bio->bi_status) { + if (repl) + md_error(mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + } + } else if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); + + rdev_dec_pending(rdev, mddev); + + end_sync_request(r10_bio); +} + +/* + * Note: sync and recover and handled very differently for raid10 + * This code is for resync. + * For resync, we read through virtual addresses and read all blocks. + * If there is any error, we schedule a write. The lowest numbered + * drive is authoritative. + * However requests come for physical address, so we need to map. + * For every physical address there are raid_disks/copies virtual addresses, + * which is always are least one, but is not necessarly an integer. + * This means that a physical address can span multiple chunks, so we may + * have to submit multiple io requests for a single sync request. + */ +/* + * We check if all blocks are in-sync and only write to blocks that + * aren't in sync + */ +static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int i, first; + struct bio *tbio, *fbio; + int vcnt; + struct page **tpages, **fpages; + + atomic_set(&r10_bio->remaining, 1); + + /* find the first device with a block */ + for (i=0; icopies; i++) + if (!r10_bio->devs[i].bio->bi_status) + break; + + if (i == conf->copies) + goto done; + + first = i; + fbio = r10_bio->devs[i].bio; + fbio->bi_iter.bi_size = r10_bio->sectors << 9; + fbio->bi_iter.bi_idx = 0; + fpages = get_resync_pages(fbio)->pages; + + vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); + /* now find blocks with errors */ + for (i=0 ; i < conf->copies ; i++) { + int j, d; + struct md_rdev *rdev; + struct resync_pages *rp; + + tbio = r10_bio->devs[i].bio; + + if (tbio->bi_end_io != end_sync_read) + continue; + if (i == first) + continue; + + tpages = get_resync_pages(tbio)->pages; + d = r10_bio->devs[i].devnum; + rdev = conf->mirrors[d].rdev; + if (!r10_bio->devs[i].bio->bi_status) { + /* We know that the bi_io_vec layout is the same for + * both 'first' and 'i', so we just compare them. + * All vec entries are PAGE_SIZE; + */ + int sectors = r10_bio->sectors; + for (j = 0; j < vcnt; j++) { + int len = PAGE_SIZE; + if (sectors < (len / 512)) + len = sectors * 512; + if (memcmp(page_address(fpages[j]), + page_address(tpages[j]), + len)) + break; + sectors -= len/512; + } + if (j == vcnt) + continue; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. */ + continue; + } else if (test_bit(FailFast, &rdev->flags)) { + /* Just give up on this device */ + md_error(rdev->mddev, rdev); + continue; + } + /* Ok, we need to write this bio, either to correct an + * inconsistency or to correct an unreadable block. + * First we need to fixup bv_offset, bv_len and + * bi_vecs, as the read request might have corrupted these + */ + rp = get_resync_pages(tbio); + bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); + + md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); + + rp->raid_bio = r10_bio; + tbio->bi_private = rp; + tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; + tbio->bi_end_io = end_sync_write; + + bio_copy_data(tbio, fbio); + + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); + + if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + tbio->bi_opf |= MD_FAILFAST; + tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; + submit_bio_noacct(tbio); + } + + /* Now write out to any replacement devices + * that are active + */ + for (i = 0; i < conf->copies; i++) { + int d; + + tbio = r10_bio->devs[i].repl_bio; + if (!tbio || !tbio->bi_end_io) + continue; + if (r10_bio->devs[i].bio->bi_end_io != end_sync_write + && r10_bio->devs[i].bio != fbio) + bio_copy_data(tbio, fbio); + d = r10_bio->devs[i].devnum; + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].replacement->bdev, + bio_sectors(tbio)); + submit_bio_noacct(tbio); + } + +done: + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + } +} + +/* + * Now for the recovery code. + * Recovery happens across physical sectors. + * We recover all non-is_sync drives by finding the virtual address of + * each, and then choose a working drive that also has that virt address. + * There is a separate r10_bio for each non-in_sync drive. + * Only the first two slots are in use. The first for reading, + * The second for writing. + * + */ +static void fix_recovery_read_error(struct r10bio *r10_bio) +{ + /* We got a read error during recovery. + * We repeat the read in smaller page-sized sections. + * If a read succeeds, write it to the new device or record + * a bad block if we cannot. + * If a read fails, record a bad block on both old and + * new devices. + */ + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct bio *bio = r10_bio->devs[0].bio; + sector_t sect = 0; + int sectors = r10_bio->sectors; + int idx = 0; + int dr = r10_bio->devs[0].devnum; + int dw = r10_bio->devs[1].devnum; + struct page **pages = get_resync_pages(bio)->pages; + + while (sectors) { + int s = sectors; + struct md_rdev *rdev; + sector_t addr; + int ok; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rdev = conf->mirrors[dr].rdev; + addr = r10_bio->devs[0].addr + sect, + ok = sync_page_io(rdev, + addr, + s << 9, + pages[idx], + REQ_OP_READ, false); + if (ok) { + rdev = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = sync_page_io(rdev, + addr, + s << 9, + pages[idx], + REQ_OP_WRITE, false); + if (!ok) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + } + if (!ok) { + /* We don't worry if we cannot set a bad block - + * it really is bad so there is no loss in not + * recording it yet + */ + rdev_set_badblocks(rdev, addr, s, 0); + + if (rdev != conf->mirrors[dw].rdev) { + /* need bad block on destination too */ + struct md_rdev *rdev2 = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = rdev_set_badblocks(rdev2, addr, s, 0); + if (!ok) { + /* just abort the recovery */ + pr_notice("md/raid10:%s: recovery aborted due to read error\n", + mdname(mddev)); + + conf->mirrors[dw].recovery_disabled + = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + break; + } + } + } + + sectors -= s; + sect += s; + idx++; + } +} + +static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int d; + struct bio *wbio = r10_bio->devs[1].bio; + struct bio *wbio2 = r10_bio->devs[1].repl_bio; + + /* Need to test wbio2->bi_end_io before we call + * submit_bio_noacct as if the former is NULL, + * the latter is free to free wbio2. + */ + if (wbio2 && !wbio2->bi_end_io) + wbio2 = NULL; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { + fix_recovery_read_error(r10_bio); + if (wbio->bi_end_io) + end_sync_request(r10_bio); + if (wbio2) + end_sync_request(r10_bio); + return; + } + + /* + * share the pages with the first bio + * and submit the write request + */ + d = r10_bio->devs[1].devnum; + if (wbio->bi_end_io) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); + submit_bio_noacct(wbio); + } + if (wbio2) { + atomic_inc(&conf->mirrors[d].replacement->nr_pending); + md_sync_acct(conf->mirrors[d].replacement->bdev, + bio_sectors(wbio2)); + submit_bio_noacct(wbio2); + } +} + +/* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + * + */ +static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + long cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + cur_time_mon = ktime_get_seconds(); + + if (rdev->last_read_error == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, enum req_op op) +{ + sector_t first_bad; + int bad_sectors; + + if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) + && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) + return -1; + if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) + /* success */ + return 1; + if (op == REQ_OP_WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array synchronising. + */ + +static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + struct md_rdev *rdev; + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; + + /* still own a reference to this rdev, so it cannot + * have been cleared recently. + */ + rdev = conf->mirrors[d].rdev; + + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; + + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", + mdname(mddev), rdev->bdev, + atomic_read(&rdev->read_errors), max_read_errors); + pr_notice("md/raid10:%s: %pg: Failing raid device\n", + mdname(mddev), rdev->bdev); + md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + return; + } + + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + sector_t first_bad; + int bad_sectors; + + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, + conf->tmppage, + REQ_OP_READ, false); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere, just mark the block + * as bad on the first device to discourage future + * reads. + */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + rdev = conf->mirrors[dn].rdev; + + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[r10_bio->read_slot].addr + + sect, + s, 0)) { + md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio + = IO_BLOCKED; + } + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (!rdev || + test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, REQ_OP_WRITE) + == 0) { + /* Well, this device is dead */ + pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, + rdev)), + rdev->bdev); + pr_notice("md/raid10:%s: %pg: failing drive\n", + mdname(mddev), + rdev->bdev); + } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + sl = start; + while (sl != r10_bio->read_slot) { + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (!rdev || + test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, REQ_OP_READ)) { + case 0: + /* Well, this device is dead */ + pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, rdev)), + rdev->bdev); + pr_notice("md/raid10:%s: %pg: failing drive\n", + mdname(mddev), + rdev->bdev); + break; + case 1: + pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, rdev)), + rdev->bdev); + atomic_add(s, &rdev->corrected_errors); + } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + +static int narrow_write_error(struct r10bio *r10_bio, int i) +{ + struct bio *bio = r10_bio->master_bio; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; + /* bio has the data to be written to slot 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this. + * + * We currently own a reference to the rdev. + */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r10_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = roundup(1 << rdev->badblocks.shift, + bdev_logical_block_size(rdev->bdev) >> 9); + sector = r10_bio->sector; + sectors = ((r10_bio->sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + sector_t wsector; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors' */ + wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); + bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); + wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); + wbio->bi_iter.bi_sector = wsector + + choose_data_offset(r10_bio, rdev); + bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + + if (submit_bio_wait(wbio) < 0) + /* Failure! */ + ok = rdev_set_badblocks(rdev, wsector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) +{ + int slot = r10_bio->read_slot; + struct bio *bio; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = r10_bio->devs[slot].rdev; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + bio = r10_bio->devs[slot].bio; + bio_put(bio); + r10_bio->devs[slot].bio = NULL; + + if (mddev->ro) + r10_bio->devs[slot].bio = IO_BLOCKED; + else if (!test_bit(FailFast, &rdev->flags)) { + freeze_array(conf, 1); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } else + md_error(mddev, rdev); + + rdev_dec_pending(rdev, mddev); + r10_bio->state = 0; + raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + /* + * allow_barrier after re-submit to ensure no sync io + * can be issued while regular io pending. + */ + allow_barrier(conf); +} + +static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) +{ + /* Some sort of write request has finished and it + * succeeded in writing where we thought there was a + * bad block. So forget the bad block. + * Or possibly if failed and we need to record + * a bad block. + */ + int m; + struct md_rdev *rdev; + + if (test_bit(R10BIO_IsSync, &r10_bio->state) || + test_bit(R10BIO_IsRecover, &r10_bio->state)) { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + if (r10_bio->devs[m].bio == NULL || + r10_bio->devs[m].bio->bi_end_io == NULL) + continue; + if (!r10_bio->devs[m].bio->bi_status) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + rdev = conf->mirrors[dev].replacement; + if (r10_bio->devs[m].repl_bio == NULL || + r10_bio->devs[m].repl_bio->bi_end_io == NULL) + continue; + + if (!r10_bio->devs[m].repl_bio->bi_status) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r10_bio); + } else { + bool fail = false; + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + struct bio *bio = r10_bio->devs[m].bio; + rdev = conf->mirrors[dev].rdev; + if (bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } else if (bio != NULL && bio->bi_status) { + fail = true; + if (!narrow_write_error(r10_bio, m)) { + md_error(conf->mddev, rdev); + set_bit(R10BIO_Degraded, + &r10_bio->state); + } + rdev_dec_pending(rdev, conf->mddev); + } + bio = r10_bio->devs[m].repl_bio; + rdev = conf->mirrors[dev].replacement; + if (rdev && bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } + } + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; + spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * nr_pending == nr_queued + extra to be true. + */ + wake_up(&conf->wait_barrier); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } + } +} + +static void raid10d(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r10bio *r10_bio; + unsigned long flags; + struct r10conf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + struct blk_plug plug; + + md_check_recovery(mddev); + + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r10_bio = list_first_entry(&tmp, struct r10bio, + retry_list); + list_del(&r10_bio->retry_list); + if (mddev->degraded) + set_bit(R10BIO_Degraded, &r10_bio->state); + + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } + } + + blk_start_plug(&plug); + for (;;) { + + flush_pending_writes(conf); + + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); + break; + } + r10_bio = list_entry(head->prev, struct r10bio, retry_list); + list_del(head->prev); + conf->nr_queued--; + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r10_bio->mddev; + conf = mddev->private; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) + reshape_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_IsSync, &r10_bio->state)) + sync_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) + recovery_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_ReadError, &r10_bio->state)) + handle_read_error(mddev, r10_bio); + else + WARN_ON_ONCE(1); + + cond_resched(); + if (mddev->sb_flags & ~(1<r10buf_pool)); + conf->have_replacement = 0; + for (i = 0; i < conf->geo.raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->have_replacement = 1; + ret = mempool_init(&conf->r10buf_pool, buffs, + r10buf_pool_alloc, r10buf_pool_free, conf); + if (ret) + return ret; + conf->next_resync = 0; + return 0; +} + +static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) +{ + struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); + struct rsync_pages *rp; + struct bio *bio; + int nalloc; + int i; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + for (i = 0; i < nalloc; i++) { + bio = r10bio->devs[i].bio; + rp = bio->bi_private; + bio_reset(bio, NULL, 0); + bio->bi_private = rp; + bio = r10bio->devs[i].repl_bio; + if (bio) { + rp = bio->bi_private; + bio_reset(bio, NULL, 0); + bio->bi_private = rp; + } + } + return r10bio; +} + +/* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, here we define "stripe" as a unit which across + * all member devices one time, so we get chunks by use + * raid_disks / near_copies. Otherwise, if near_copies is + * close to raid_disks, then resync window could increases + * linearly with the increase of raid_disks, which means + * we will suspend a really large IO window while it is not + * necessary. If raid_disks is not divisible by near_copies, + * an extra chunk is needed to ensure the whole "stripe" is + * covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * At least use a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + * + * Resync and recovery are handled very differently. + * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. + * + * For resync, we iterate over virtual addresses, read all copies, + * and update if there are differences. If only one copy is live, + * skip it. + * For recovery, we iterate over physical addresses, read a good + * value for each non-in_sync drive, and over-write. + * + * So, for recovery we may have several outstanding complex requests for a + * given address, one for each out-of-sync device. We model this by allocating + * a number of r10_bio structures, one for each out-of-sync device. + * As we setup these structures, we collect all bio's together into a list + * which we then process collectively to add pages, and then process again + * to pass to submit_bio_noacct. + * + * The r10_bio structures are linked using a borrowed master_bio pointer. + * This link is counted in ->remaining. When the r10_bio that points to NULL + * has its remaining count decremented to 0, the whole complex operation + * is complete. + * + */ + +static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + struct bio *biolist = NULL, *bio; + sector_t max_sector, nr_sectors; + int i; + int max_sync; + sector_t sync_blocks; + sector_t sectors_skipped = 0; + int chunks_skipped = 0; + sector_t chunk_mask = conf->geo.chunk_mask; + int page_idx = 0; + + /* + * Allow skipping a full rebuild for incremental assembly + * of a clean array, like RAID1 does. + */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + mddev->reshape_position == MaxSector && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return mddev->dev_sectors - sector_nr; + } + + if (!mempool_initialized(&conf->r10buf_pool)) + if (init_resync(conf)) + return 0; + + skipped: + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chucks (there can + * be several when recovering multiple devices). + * as we may have started syncing it but not finished. + * We can find the current address in + * mddev->curr_resync, but for recovery, + * we need to convert that to several + * virtual addresses. + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + close_sync(conf); + return 0; + } + + if (mddev->curr_resync < max_sector) { /* aborted */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else for (i = 0; i < conf->geo.raid_disks; i++) { + sector_t sect = + raid10_find_virt(conf, mddev->curr_resync, i); + md_bitmap_end_sync(mddev->bitmap, sect, + &sync_blocks, 1); + } + } else { + /* completed sync */ + if ((!mddev->bitmap || conf->fullsync) + && conf->have_replacement + && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* Completed a full sync so the replacements + * are now fully recovered. + */ + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[i].replacement); + if (rdev) + rdev->recovery_offset = MaxSector; + } + rcu_read_unlock(); + } + conf->fullsync = 0; + } + md_bitmap_close_sync(mddev->bitmap); + close_sync(conf); + *skipped = 1; + return sectors_skipped; + } + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + if (chunks_skipped >= conf->geo.raid_disks) { + /* if there has been nothing to do on any drive, + * then there is nothing to do at all.. + */ + *skipped = 1; + return (max_sector - sector_nr) + sectors_skipped; + } + + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + + /* make sure whole request will fit in a chunk - if chunks + * are meaningful + */ + if (conf->geo.near_copies < conf->geo.raid_disks && + max_sector > (sector_nr | chunk_mask)) + max_sector = (sector_nr | chunk_mask) + 1; + + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + + /* Again, very different code for resync and recovery. + * Both must result in an r10bio with a list of bios that + * have bi_end_io, bi_sector, bi_bdev set, + * and bi_private set to the r10bio. + * For recovery, we may actually create several r10bios + * with 2 bios in each, that correspond to the bios in the main one. + * In this case, the subordinate r10bios link back through a + * borrowed master_bio pointer, and the counter in the master + * includes a ref from each subordinate. + */ + /* First, we decide what to do and set ->bi_end_io + * To end_sync_read if we want to read, and + * end_sync_write if we will want to write. + */ + + max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); + if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* recovery... the complicated one */ + int j; + r10_bio = NULL; + + for (i = 0 ; i < conf->geo.raid_disks; i++) { + int still_degraded; + struct r10bio *rb2; + sector_t sect; + int must_sync; + int any_working; + int need_recover = 0; + struct raid10_info *mirror = &conf->mirrors[i]; + struct md_rdev *mrdev, *mreplace; + + rcu_read_lock(); + mrdev = rcu_dereference(mirror->rdev); + mreplace = rcu_dereference(mirror->replacement); + + if (mrdev != NULL && + !test_bit(Faulty, &mrdev->flags) && + !test_bit(In_sync, &mrdev->flags)) + need_recover = 1; + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; + + if (!need_recover && !mreplace) { + rcu_read_unlock(); + continue; + } + + still_degraded = 0; + /* want to reconstruct this device */ + rb2 = r10_bio; + sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. + */ + rcu_read_unlock(); + continue; + } + /* Unless we are doing a full sync, or a replacement + * we only need to recover the block if it is set in + * the bitmap + */ + must_sync = md_bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); + if (sync_blocks < max_sync) + max_sync = sync_blocks; + if (!must_sync && + mreplace == NULL && + !conf->fullsync) { + /* yep, skip the sync_blocks here, but don't assume + * that there will never be anything to do here + */ + chunks_skipped = -1; + rcu_read_unlock(); + continue; + } + atomic_inc(&mrdev->nr_pending); + if (mreplace) + atomic_inc(&mreplace->nr_pending); + rcu_read_unlock(); + + r10_bio = raid10_alloc_init_r10buf(conf); + r10_bio->state = 0; + raise_barrier(conf, rb2 != NULL); + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = sect; + + raid10_find_phys(conf, r10_bio); + + /* Need to check if the array will still be + * degraded + */ + rcu_read_lock(); + for (j = 0; j < conf->geo.raid_disks; j++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[j].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + still_degraded = 1; + break; + } + } + + must_sync = md_bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); + + any_working = 0; + for (j=0; jcopies;j++) { + int k; + int d = r10_bio->devs[j].devnum; + sector_t from_addr, to_addr; + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[d].rdev); + sector_t sector, first_bad; + int bad_sectors; + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; + /* This is where we read from */ + any_working = 1; + sector = r10_bio->devs[j].addr; + + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector + - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } + bio = r10_bio->devs[0].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_end_io = end_sync_read; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + from_addr = r10_bio->devs[j].addr; + bio->bi_iter.bi_sector = from_addr + + rdev->data_offset; + bio_set_dev(bio, rdev->bdev); + atomic_inc(&rdev->nr_pending); + /* and we write to 'i' (if not in_sync) */ + + for (k=0; kcopies; k++) + if (r10_bio->devs[k].devnum == i) + break; + BUG_ON(k == conf->copies); + to_addr = r10_bio->devs[k].addr; + r10_bio->devs[0].devnum = d; + r10_bio->devs[0].addr = from_addr; + r10_bio->devs[1].devnum = i; + r10_bio->devs[1].addr = to_addr; + + if (need_recover) { + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_end_io = end_sync_write; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_iter.bi_sector = to_addr + + mrdev->data_offset; + bio_set_dev(bio, mrdev->bdev); + atomic_inc(&r10_bio->remaining); + } else + r10_bio->devs[1].bio->bi_end_io = NULL; + + /* and maybe write to replacement */ + bio = r10_bio->devs[1].repl_bio; + if (bio) + bio->bi_end_io = NULL; + /* Note: if replace is not NULL, then bio + * cannot be NULL as r10buf_pool_alloc will + * have allocated it. + */ + if (!mreplace) + break; + bio->bi_next = biolist; + biolist = bio; + bio->bi_end_io = end_sync_write; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_iter.bi_sector = to_addr + + mreplace->data_offset; + bio_set_dev(bio, mreplace->bdev); + atomic_inc(&r10_bio->remaining); + break; + } + rcu_read_unlock(); + if (j == conf->copies) { + /* Cannot recover, so abort the recovery or + * record a bad block */ + if (any_working) { + /* problem is that there are bad blocks + * on other device(s) + */ + int k; + for (k = 0; k < conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + if (!test_bit(In_sync, + &mrdev->flags) + && !rdev_set_badblocks( + mrdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + if (mreplace && + !rdev_set_badblocks( + mreplace, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + } + if (!any_working) { + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", + mdname(mddev)); + mirror->recovery_disabled + = mddev->recovery_disabled; + } + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); + break; + } + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); + if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { + /* Only want this if there is elsewhere to + * read from. 'j' is currently the first + * readable copy. + */ + int targets = 1; + for (; j < conf->copies; j++) { + int d = r10_bio->devs[j].devnum; + if (conf->mirrors[d].rdev && + test_bit(In_sync, + &conf->mirrors[d].rdev->flags)) + targets++; + } + if (targets == 1) + r10_bio->devs[0].bio->bi_opf + &= ~MD_FAILFAST; + } + } + if (biolist == NULL) { + while (r10_bio) { + struct r10bio *rb2 = r10_bio; + r10_bio = (struct r10bio*) rb2->master_bio; + rb2->master_bio = NULL; + put_buf(rb2); + } + goto giveup; + } + } else { + /* resync. Schedule a read for every block at this virt offset */ + int count = 0; + + /* + * Since curr_resync_completed could probably not update in + * time, and we will set cluster_sync_low based on it. + * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for + * safety reason, which ensures curr_resync_completed is + * updated in bitmap_cond_end_sync. + */ + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, mddev->degraded) && + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, + &mddev->recovery)) { + /* We can skip this block */ + *skipped = 1; + return sync_blocks + sectors_skipped; + } + if (sync_blocks < max_sync) + max_sync = sync_blocks; + r10_bio = raid10_alloc_init_r10buf(conf); + r10_bio->state = 0; + + r10_bio->mddev = mddev; + atomic_set(&r10_bio->remaining, 0); + raise_barrier(conf, 0); + conf->next_resync = sector_nr; + + r10_bio->master_bio = NULL; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsSync, &r10_bio->state); + raid10_find_phys(conf, r10_bio); + r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; + + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + sector_t first_bad, sector; + int bad_sectors; + struct md_rdev *rdev; + + if (r10_bio->devs[i].repl_bio) + r10_bio->devs[i].repl_bio->bi_end_io = NULL; + + bio = r10_bio->devs[i].bio; + bio->bi_status = BLK_STS_IOERR; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); + continue; + } + sector = r10_bio->devs[i].addr; + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + rcu_read_unlock(); + continue; + } + } + atomic_inc(&rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + bio->bi_next = biolist; + biolist = bio; + bio->bi_end_io = end_sync_read; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio_set_dev(bio, rdev->bdev); + count++; + + rdev = rcu_dereference(conf->mirrors[d].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); + continue; + } + atomic_inc(&rdev->nr_pending); + + /* Need to set up for writing to the replacement */ + bio = r10_bio->devs[i].repl_bio; + bio->bi_status = BLK_STS_IOERR; + + sector = r10_bio->devs[i].addr; + bio->bi_next = biolist; + biolist = bio; + bio->bi_end_io = end_sync_write; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio_set_dev(bio, rdev->bdev); + count++; + rcu_read_unlock(); + } + + if (count < 2) { + for (i=0; icopies; i++) { + int d = r10_bio->devs[i].devnum; + if (r10_bio->devs[i].bio->bi_end_io) + rdev_dec_pending(conf->mirrors[d].rdev, + mddev); + if (r10_bio->devs[i].repl_bio && + r10_bio->devs[i].repl_bio->bi_end_io) + rdev_dec_pending( + conf->mirrors[d].replacement, + mddev); + } + put_buf(r10_bio); + biolist = NULL; + goto giveup; + } + } + + nr_sectors = 0; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + do { + struct page *page; + int len = PAGE_SIZE; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + for (bio= biolist ; bio ; bio=bio->bi_next) { + struct resync_pages *rp = get_resync_pages(bio); + page = resync_fetch_page(rp, page_idx); + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); + } + nr_sectors += len>>9; + sector_nr += len>>9; + } while (++page_idx < RESYNC_PAGES); + r10_bio->sectors = nr_sectors; + + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need translate it to array address before compare + * with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar as + * sector_nr, so make the translation too. + */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + + while (biolist) { + bio = biolist; + biolist = biolist->bi_next; + + bio->bi_next = NULL; + r10_bio = get_resync_r10bio(bio); + r10_bio->sectors = nr_sectors; + + if (bio->bi_end_io == end_sync_read) { + md_sync_acct_bio(bio, nr_sectors); + bio->bi_status = 0; + submit_bio_noacct(bio); + } + } + + if (sectors_skipped) + /* pretend they weren't skipped, it makes + * no important difference in this case + */ + md_done_sync(mddev, sectors_skipped, 1); + + return sectors_skipped + nr_sectors; + giveup: + /* There is nowhere to write, so all non-sync + * drives must be failed or in resync, all drives + * have a bad block, so try the next chunk... + */ + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); + chunks_skipped ++; + sector_nr = max_sector; + goto skipped; +} + +static sector_t +raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + struct r10conf *conf = mddev->private; + + if (!raid_disks) + raid_disks = min(conf->geo.raid_disks, + conf->prev.raid_disks); + if (!sectors) + sectors = conf->dev_sectors; + + size = sectors >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); + size = size * raid_disks; + sector_div(size, conf->geo.near_copies); + + return size << conf->geo.chunk_shift; +} + +static void calc_sectors(struct r10conf *conf, sector_t size) +{ + /* Calculate the number of sectors-per-device that will + * actually be used, and set conf->dev_sectors and + * conf->stride + */ + + size = size >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); + size = size * conf->geo.raid_disks; + sector_div(size, conf->geo.near_copies); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" */ + size = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. + */ + size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); + + conf->dev_sectors = size << conf->geo.chunk_shift; + + if (conf->geo.far_offset) + conf->geo.stride = 1 << conf->geo.chunk_shift; + else { + sector_div(size, conf->geo.far_copies); + conf->geo.stride = size << conf->geo.chunk_shift; + } +} + +enum geo_type {geo_new, geo_old, geo_start}; +static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) +{ + int nc, fc, fo; + int layout, chunk, disks; + switch (new) { + case geo_old: + layout = mddev->layout; + chunk = mddev->chunk_sectors; + disks = mddev->raid_disks - mddev->delta_disks; + break; + case geo_new: + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks; + break; + default: /* avoid 'may be unused' warnings */ + case geo_start: /* new when starting reshape - raid_disks not + * updated yet. */ + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks + mddev->delta_disks; + break; + } + if (layout >> 19) + return -1; + if (chunk < (PAGE_SIZE >> 9) || + !is_power_of_2(chunk)) + return -2; + nc = layout & 255; + fc = (layout >> 8) & 255; + fo = layout & (1<<16); + geo->raid_disks = disks; + geo->near_copies = nc; + geo->far_copies = fc; + geo->far_offset = fo; + switch (layout >> 17) { + case 0: /* original layout. simple but not always optimal */ + geo->far_set_size = disks; + break; + case 1: /* "improved" layout which was buggy. Hopefully no-one is + * actually using this, but leave code here just in case.*/ + geo->far_set_size = disks/fc; + WARN(geo->far_set_size < fc, + "This RAID10 layout does not provide data safety - please backup and create new array\n"); + break; + case 2: /* "improved" layout fixed to match documentation */ + geo->far_set_size = fc * nc; + break; + default: /* Not a valid layout */ + return -1; + } + geo->chunk_mask = chunk - 1; + geo->chunk_shift = ffz(~chunk); + return nc*fc; +} + +static void raid10_free_conf(struct r10conf *conf) +{ + if (!conf) + return; + + mempool_exit(&conf->r10bio_pool); + kfree(conf->mirrors); + kfree(conf->mirrors_old); + kfree(conf->mirrors_new); + safe_put_page(conf->tmppage); + bioset_exit(&conf->bio_split); + kfree(conf); +} + +static struct r10conf *setup_conf(struct mddev *mddev) +{ + struct r10conf *conf = NULL; + int err = -EINVAL; + struct geom geo; + int copies; + + copies = setup_geo(&geo, mddev, geo_new); + + if (copies == -2) { + pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); + goto out; + } + + if (copies < 2 || copies > mddev->raid_disks) { + pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->new_layout); + goto out; + } + + err = -ENOMEM; + conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); + if (!conf) + goto out; + + /* FIXME calc properly */ + conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), + sizeof(struct raid10_info), + GFP_KERNEL); + if (!conf->mirrors) + goto out; + + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto out; + + conf->geo = geo; + conf->copies = copies; + err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, + rbio_pool_free, conf); + if (err) + goto out; + + err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (err) + goto out; + + calc_sectors(conf, mddev->dev_sectors); + if (mddev->reshape_position == MaxSector) { + conf->prev = conf->geo; + conf->reshape_progress = MaxSector; + } else { + if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { + err = -EINVAL; + goto out; + } + conf->reshape_progress = mddev->reshape_position; + if (conf->prev.far_offset) + conf->prev.stride = 1 << conf->prev.chunk_shift; + else + /* far_copies must be 1 */ + conf->prev.stride = conf->dev_sectors; + } + conf->reshape_safe = conf->reshape_progress; + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); + + seqlock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + atomic_set(&conf->nr_pending, 0); + + err = -ENOMEM; + conf->thread = md_register_thread(raid10d, mddev, "raid10"); + if (!conf->thread) + goto out; + + conf->mddev = mddev; + return conf; + + out: + raid10_free_conf(conf); + return ERR_PTR(err); +} + +static void raid10_set_io_opt(struct r10conf *conf) +{ + int raid_disks = conf->geo.raid_disks; + + if (!(conf->geo.raid_disks % conf->geo.near_copies)) + raid_disks /= conf->geo.near_copies; + blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * + raid_disks); +} + +static int raid10_run(struct mddev *mddev) +{ + struct r10conf *conf; + int i, disk_idx; + struct raid10_info *disk; + struct md_rdev *rdev; + sector_t size; + sector_t min_offset_diff = 0; + int first = 1; + + if (mddev_init_writes_pending(mddev) < 0) + return -ENOMEM; + + if (mddev->private == NULL) { + conf = setup_conf(mddev); + if (IS_ERR(conf)) + return PTR_ERR(conf); + mddev->private = conf; + } + conf = mddev->private; + if (!conf) + goto out; + + mddev->thread = conf->thread; + conf->thread = NULL; + + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered" + " raid10\n"); + goto out_free_conf; + } + } + + if (mddev->queue) { + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + raid10_set_io_opt(conf); + } + + rdev_for_each(rdev, mddev) { + long long diff; + + disk_idx = rdev->raid_disk; + if (disk_idx < 0) + continue; + if (disk_idx >= conf->geo.raid_disks && + disk_idx >= conf->prev.raid_disks) + continue; + disk = conf->mirrors + disk_idx; + + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto out_free_conf; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto out_free_conf; + disk->rdev = rdev; + } + diff = (rdev->new_data_offset - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; + + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + disk->head_position = 0; + first = 0; + } + + /* need to check that every block has at least one working mirror */ + if (!enough(conf, -1)) { + pr_err("md/raid10:%s: not enough operational mirrors.\n", + mdname(mddev)); + goto out_free_conf; + } + + if (conf->reshape_progress != MaxSector) { + /* must ensure that shape change is supported */ + if (conf->geo.far_copies != 1 && + conf->geo.far_offset == 0) + goto out_free_conf; + if (conf->prev.far_copies != 1 && + conf->prev.far_offset == 0) + goto out_free_conf; + } + + mddev->degraded = 0; + for (i = 0; + i < conf->geo.raid_disks + || i < conf->prev.raid_disks; + i++) { + + disk = conf->mirrors + i; + + if (!disk->rdev && disk->replacement) { + /* The replacement is all we have - use it */ + disk->rdev = disk->replacement; + disk->replacement = NULL; + clear_bit(Replacement, &disk->rdev->flags); + } + + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { + disk->head_position = 0; + mddev->degraded++; + if (disk->rdev && + disk->rdev->saved_raid_disk < 0) + conf->fullsync = 1; + } + + if (disk->replacement && + !test_bit(In_sync, &disk->replacement->flags) && + disk->replacement->saved_raid_disk < 0) { + conf->fullsync = 1; + } + + disk->recovery_disabled = mddev->recovery_disabled - 1; + } + + if (mddev->recovery_cp != MaxSector) + pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); + pr_info("md/raid10:%s: active with %d out of %d devices\n", + mdname(mddev), conf->geo.raid_disks - mddev->degraded, + conf->geo.raid_disks); + /* + * Ok, everything is just fine now + */ + mddev->dev_sectors = conf->dev_sectors; + size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + mddev->resync_max_sectors = size; + set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + + if (md_integrity_register(mddev)) + goto out_free_conf; + + if (conf->reshape_progress != MaxSector) { + unsigned long before_length, after_length; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + if (max(before_length, after_length) > min_offset_diff) { + /* This cannot work */ + pr_warn("md/raid10: offset difference not enough to continue reshape\n"); + goto out_free_conf; + } + conf->offset_diff = min_offset_diff; + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) + goto out_free_conf; + } + + return 0; + +out_free_conf: + md_unregister_thread(&mddev->thread); + raid10_free_conf(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static void raid10_free(struct mddev *mddev, void *priv) +{ + raid10_free_conf(priv); +} + +static void raid10_quiesce(struct mddev *mddev, int quiesce) +{ + struct r10conf *conf = mddev->private; + + if (quiesce) + raise_barrier(conf, 0); + else + lower_barrier(conf); +} + +static int raid10_resize(struct mddev *mddev, sector_t sectors) +{ + /* Resize of 'far' arrays is not supported. + * For 'near' and 'offset' arrays we can set the + * number of sectors used to be an appropriate multiple + * of the chunk size. + * For 'offset', this is far_copies*chunksize. + * For 'near' the multiplier is the LCM of + * near_copies and raid_disks. + * So if far_copies > 1 && !far_offset, fail. + * Else find LCM(raid_disks, near_copy)*far_copies and + * multiply by chunk_size. Then round to this number. + * This is mostly done by raid10_size() + */ + struct r10conf *conf = mddev->private; + sector_t oldsize, size; + + if (mddev->reshape_position != MaxSector) + return -EBUSY; + + if (conf->geo.far_copies > 1 && !conf->geo.far_offset) + return -EINVAL; + + oldsize = raid10_size(mddev, 0, 0); + size = raid10_size(mddev, sectors, 0); + if (mddev->external_size && + mddev->array_sectors > size) + return -EINVAL; + if (mddev->bitmap) { + int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, size); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > oldsize) { + mddev->recovery_cp = oldsize; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + calc_sectors(conf, sectors); + mddev->dev_sectors = conf->dev_sectors; + mddev->resync_max_sectors = size; + return 0; +} + +static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) +{ + struct md_rdev *rdev; + struct r10conf *conf; + + if (mddev->degraded > 0) { + pr_warn("md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + sector_div(size, devs); + + /* Set new parameters */ + mddev->new_level = 10; + /* new layout: far_copies = 1, near_copies = 2 */ + mddev->new_layout = (1<<8) + 2; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = mddev->raid_disks; + mddev->raid_disks *= 2; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + mddev->dev_sectors = size; + + conf = setup_conf(mddev); + if (!IS_ERR(conf)) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) { + rdev->new_raid_disk = rdev->raid_disk * 2; + rdev->sectors = size; + } + WRITE_ONCE(conf->barrier, 1); + } + + return conf; +} + +static void *raid10_takeover(struct mddev *mddev) +{ + struct r0conf *raid0_conf; + + /* raid10 can take over: + * raid0 - providing it has only two drives + */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + raid0_conf = mddev->private; + if (raid0_conf->nr_strip_zones > 1) { + pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + return raid10_takeover_raid0(mddev, + raid0_conf->strip_zone->zone_end, + raid0_conf->strip_zone->nb_dev); + } + return ERR_PTR(-EINVAL); +} + +static int raid10_check_reshape(struct mddev *mddev) +{ + /* Called when there is a request to change + * - layout (to ->new_layout) + * - chunk size (to ->new_chunk_sectors) + * - raid_disks (by delta_disks) + * or when trying to restart a reshape that was ongoing. + * + * We need to validate the request and possibly allocate + * space if that might be an issue later. + * + * Currently we reject any reshape of a 'far' mode array, + * allow chunk size to change if new is generally acceptable, + * allow raid_disks to increase, and allow + * a switch between 'near' mode and 'offset' mode. + */ + struct r10conf *conf = mddev->private; + struct geom geo; + + if (conf->geo.far_copies != 1 && !conf->geo.far_offset) + return -EINVAL; + + if (setup_geo(&geo, mddev, geo_start) != conf->copies) + /* mustn't change number of copies */ + return -EINVAL; + if (geo.far_copies > 1 && !geo.far_offset) + /* Cannot switch to 'far' mode */ + return -EINVAL; + + if (mddev->array_sectors & geo.chunk_mask) + /* not factor of array size */ + return -EINVAL; + + if (!enough(conf, -1)) + return -EINVAL; + + kfree(conf->mirrors_new); + conf->mirrors_new = NULL; + if (mddev->delta_disks > 0) { + /* allocate new 'mirrors' list */ + conf->mirrors_new = + kcalloc(mddev->raid_disks + mddev->delta_disks, + sizeof(struct raid10_info), + GFP_KERNEL); + if (!conf->mirrors_new) + return -ENOMEM; + } + return 0; +} + +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int calc_degraded(struct r10conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + /* 'prev' section first */ + for (i = 0; i < conf->prev.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (!test_bit(In_sync, &rdev->flags)) + /* When we can reduce the number of devices in + * an array, this might not contribute to + * 'degraded'. It does now. + */ + degraded++; + } + rcu_read_unlock(); + if (conf->geo.raid_disks == conf->prev.raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (!test_bit(In_sync, &rdev->flags)) { + /* If reshape is increasing the number of devices, + * this section has already been recovered, so + * it doesn't contribute to degraded. + * else it does. + */ + if (conf->geo.raid_disks <= conf->prev.raid_disks) + degraded2++; + } + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int raid10_start_reshape(struct mddev *mddev) +{ + /* A 'reshape' has been requested. This commits + * the various 'new' fields and sets MD_RECOVER_RESHAPE + * This also checks if there are enough spares and adds them + * to the array. + * We currently require enough spares to make the final + * array non-degraded. We also require that the difference + * between old and new data_offset - on each device - is + * enough that we never risk over-writing. + */ + + unsigned long before_length, after_length; + sector_t min_offset_diff = 0; + int first = 1; + struct geom new; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int spares = 0; + int ret; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + if (setup_geo(&new, mddev, geo_start) != conf->copies) + return -EINVAL; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + rdev_for_each(rdev, mddev) { + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) { + long long diff = (rdev->new_data_offset + - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; + first = 0; + } + } + + if (max(before_length, after_length) > min_offset_diff) + return -EINVAL; + + if (spares < mddev->delta_disks) + return -EINVAL; + + conf->offset_diff = min_offset_diff; + spin_lock_irq(&conf->device_lock); + if (conf->mirrors_new) { + memcpy(conf->mirrors_new, conf->mirrors, + sizeof(struct raid10_info)*conf->prev.raid_disks); + smp_mb(); + kfree(conf->mirrors_old); + conf->mirrors_old = conf->mirrors; + conf->mirrors = conf->mirrors_new; + conf->mirrors_new = NULL; + } + setup_geo(&conf->geo, mddev, geo_start); + smp_mb(); + if (mddev->reshape_backwards) { + sector_t size = raid10_size(mddev, 0, 0); + if (size < mddev->array_sectors) { + spin_unlock_irq(&conf->device_lock); + pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", + mdname(mddev)); + return -EINVAL; + } + mddev->resync_max_sectors = size; + conf->reshape_progress = size; + } else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; + spin_unlock_irq(&conf->device_lock); + + if (mddev->delta_disks && mddev->bitmap) { + struct mdp_superblock_1 *sb = NULL; + sector_t oldsize, newsize; + + oldsize = raid10_size(mddev, 0, 0); + newsize = raid10_size(mddev, 0, conf->geo.raid_disks); + + if (!mddev_is_clustered(mddev)) { + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + goto abort; + else + goto out; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk > -1 && + !test_bit(Faulty, &rdev->flags)) + sb = page_address(rdev->sb_page); + } + + /* + * some node is already performing reshape, and no need to + * call md_bitmap_resize again since it should be called when + * receiving BITMAP_RESIZE msg + */ + if ((sb && (le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) + goto out; + + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + goto abort; + + ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + if (ret) { + md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + goto abort; + } + } +out: + if (mddev->delta_disks > 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid10_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk >= + conf->prev.raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); + } + } else if (rdev->raid_disk >= conf->prev.raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + } + } + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post numbers. + */ + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + mddev->raid_disks = conf->geo.raid_disks; + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) { + ret = -EAGAIN; + goto abort; + } + conf->reshape_checkpoint = jiffies; + md_wakeup_thread(mddev->sync_thread); + md_new_event(); + return 0; + +abort: + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + conf->geo = conf->prev; + mddev->raid_disks = conf->geo.raid_disks; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); + conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; + mddev->reshape_position = MaxSector; + spin_unlock_irq(&conf->device_lock); + return ret; +} + +/* Calculate the last device-address that could contain + * any block from the chunk that includes the array-address 's' + * and report the next address. + * i.e. the address returned will be chunk-aligned and after + * any data that is in the chunk containing 's'. + */ +static sector_t last_dev_address(sector_t s, struct geom *geo) +{ + s = (s | geo->chunk_mask) + 1; + s >>= geo->chunk_shift; + s *= geo->near_copies; + s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +/* Calculate the first device-address that could contain + * any block from the chunk that includes the array-address 's'. + * This too will be the start of a chunk + */ +static sector_t first_dev_address(sector_t s, struct geom *geo) +{ + s >>= geo->chunk_shift; + s *= geo->near_copies; + sector_div(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + /* We simply copy at most one chunk (smallest of old and new) + * at a time, possibly less if that exceeds RESYNC_PAGES, + * or we hit a bad block or something. + * This might mean we pause for normal IO in the middle of + * a chunk, but that is not a problem as mddev->reshape_position + * can record any location. + * + * If we will want to write to a location that isn't + * yet recorded as 'safe' (i.e. in metadata on disk) then + * we need to flush all reshape requests and update the metadata. + * + * When reshaping forwards (e.g. to more devices), we interpret + * 'safe' as the earliest block which might not have been copied + * down yet. We divide this by previous stripe size and multiply + * by previous stripe length to get lowest device offset that we + * cannot write to yet. + * We interpret 'sector_nr' as an address that we want to write to. + * From this we use last_device_address() to find where we might + * write to, and first_device_address on the 'safe' position. + * If this 'next' write position is after the 'safe' position, + * we must update the metadata to increase the 'safe' position. + * + * When reshaping backwards, we round in the opposite direction + * and perform the reverse test: next write position must not be + * less than current safe position. + * + * In all this the minimum difference in data offsets + * (conf->offset_diff - always positive) allows a bit of slack, + * so next can be after 'safe', but not by more than offset_diff + * + * We need to prepare all the bios here before we start any IO + * to ensure the size we choose is acceptable to all devices. + * The means one for each copy for write-out and an extra one for + * read-in. + * We store the read-in bio in ->master_bio and the others in + * ->devs[x].bio and ->devs[x].repl_bio. + */ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + sector_t next, safe, last; + int max_sectors; + int nr_sectors; + int s; + struct md_rdev *rdev; + int need_flush = 0; + struct bio *blist; + struct bio *bio, *read_bio; + int sectors_done = 0; + struct page **pages; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->reshape_backwards && + conf->reshape_progress < raid10_size(mddev, 0, 0)) { + sector_nr = (raid10_size(mddev, 0, 0) + - conf->reshape_progress); + } else if (!mddev->reshape_backwards && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify_dirent_safe(mddev->sysfs_completed); + *skipped = 1; + return sector_nr; + } + } + + /* We don't use sector_nr to track where we are up to + * as that doesn't work well for ->reshape_backwards. + * So just use ->reshape_progress. + */ + if (mddev->reshape_backwards) { + /* 'next' is the earliest device address that we might + * write to for this chunk in the new layout + */ + next = first_dev_address(conf->reshape_progress - 1, + &conf->geo); + + /* 'safe' is the last device address that we might read from + * in the old layout after a restart + */ + safe = last_dev_address(conf->reshape_safe - 1, + &conf->prev); + + if (next + conf->offset_diff < safe) + need_flush = 1; + + last = conf->reshape_progress - 1; + sector_nr = last & ~(sector_t)(conf->geo.chunk_mask + & conf->prev.chunk_mask); + if (sector_nr + RESYNC_SECTORS < last) + sector_nr = last + 1 - RESYNC_SECTORS; + } else { + /* 'next' is after the last device address that we + * might write to for this chunk in the new layout + */ + next = last_dev_address(conf->reshape_progress, &conf->geo); + + /* 'safe' is the earliest device address that we might + * read from in the old layout after a restart + */ + safe = first_dev_address(conf->reshape_safe, &conf->prev); + + /* Need to update metadata if 'next' might be beyond 'safe' + * as that would possibly corrupt data + */ + if (next > safe + conf->offset_diff) + need_flush = 1; + + sector_nr = conf->reshape_progress; + last = sector_nr | (conf->geo.chunk_mask + & conf->prev.chunk_mask); + + if (sector_nr + RESYNC_SECTORS <= last) + last = sector_nr + RESYNC_SECTORS - 1; + } + + if (need_flush || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { + /* Need to update reshape_position in metadata */ + wait_barrier(conf, false); + mddev->reshape_position = conf->reshape_progress; + if (mddev->reshape_backwards) + mddev->curr_resync_completed = raid10_size(mddev, 0, 0) + - conf->reshape_progress; + else + mddev->curr_resync_completed = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_flags == 0 || + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + allow_barrier(conf); + return sectors_done; + } + conf->reshape_safe = mddev->reshape_position; + allow_barrier(conf); + } + + raise_barrier(conf, 0); +read_more: + /* Now schedule reads for blocks from sector_nr to last */ + r10_bio = raid10_alloc_init_r10buf(conf); + r10_bio->state = 0; + raise_barrier(conf, 1); + atomic_set(&r10_bio->remaining, 0); + r10_bio->mddev = mddev; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsReshape, &r10_bio->state); + r10_bio->sectors = last - sector_nr + 1; + rdev = read_balance(conf, r10_bio, &max_sectors); + BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); + + if (!rdev) { + /* Cannot read from here, so need to record bad blocks + * on all the target devices. + */ + // FIXME + mempool_free(r10_bio, &conf->r10buf_pool); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return sectors_done; + } + + read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, + GFP_KERNEL, &mddev->bio_set); + read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + + rdev->data_offset); + read_bio->bi_private = r10_bio; + read_bio->bi_end_io = end_reshape_read; + r10_bio->master_bio = read_bio; + r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + + /* + * Broadcast RESYNC message to other nodes, so all nodes would not + * write to the region to avoid conflict. + */ + if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { + struct mdp_superblock_1 *sb = NULL; + int sb_reshape_pos = 0; + + conf->cluster_sync_low = sector_nr; + conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; + sb = page_address(rdev->sb_page); + if (sb) { + sb_reshape_pos = le64_to_cpu(sb->reshape_position); + /* + * Set cluster_sync_low again if next address for array + * reshape is less than cluster_sync_low. Since we can't + * update cluster_sync_low until it has finished reshape. + */ + if (sb_reshape_pos < conf->cluster_sync_low) + conf->cluster_sync_low = sb_reshape_pos; + } + + md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + conf->cluster_sync_high); + } + + /* Now find the locations in the new layout */ + __raid10_find_phys(&conf->geo, r10_bio); + + blist = read_bio; + read_bio->bi_next = NULL; + + rcu_read_lock(); + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev2; + if (s&1) { + rdev2 = rcu_dereference(conf->mirrors[d].replacement); + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev2 = rcu_dereference(conf->mirrors[d].rdev); + b = r10_bio->devs[s/2].bio; + } + if (!rdev2 || test_bit(Faulty, &rdev2->flags)) + continue; + + bio_set_dev(b, rdev2->bdev); + b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + + rdev2->new_data_offset; + b->bi_end_io = end_reshape_write; + bio_set_op_attrs(b, REQ_OP_WRITE, 0); + b->bi_next = blist; + blist = b; + } + + /* Now add as many pages as possible to all of these bios. */ + + nr_sectors = 0; + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; + for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { + struct page *page = pages[s / (PAGE_SIZE >> 9)]; + int len = (max_sectors - s) << 9; + if (len > PAGE_SIZE) + len = PAGE_SIZE; + for (bio = blist; bio ; bio = bio->bi_next) { + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); + } + sector_nr += len >> 9; + nr_sectors += len >> 9; + } + rcu_read_unlock(); + r10_bio->sectors = nr_sectors; + + /* Now submit the read */ + md_sync_acct_bio(read_bio, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + read_bio->bi_next = NULL; + submit_bio_noacct(read_bio); + sectors_done += nr_sectors; + if (sector_nr <= last) + goto read_more; + + lower_barrier(conf); + + /* Now that we have done the whole section we can + * update reshape_progress + */ + if (mddev->reshape_backwards) + conf->reshape_progress -= sectors_done; + else + conf->reshape_progress += sectors_done; + + return sectors_done; +} + +static void end_reshape_request(struct r10bio *r10_bio); +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + /* Reshape read completed. Hopefully we have a block + * to write out. + * If we got a read error then we do sync 1-page reads from + * elsewhere until we find the data - or give up. + */ + struct r10conf *conf = mddev->private; + int s; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + if (handle_reshape_read_error(mddev, r10_bio) < 0) { + /* Reshape has been aborted */ + md_done_sync(mddev, r10_bio->sectors, 0); + return; + } + + /* We definitely have the data in the pages, schedule the + * writes. + */ + atomic_set(&r10_bio->remaining, 1); + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev; + rcu_read_lock(); + if (s&1) { + rdev = rcu_dereference(conf->mirrors[d].replacement); + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev = rcu_dereference(conf->mirrors[d].rdev); + b = r10_bio->devs[s/2].bio; + } + if (!rdev || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); + continue; + } + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + md_sync_acct_bio(b, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + b->bi_next = NULL; + submit_bio_noacct(b); + } + end_reshape_request(r10_bio); +} + +static void end_reshape(struct r10conf *conf) +{ + if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) + return; + + spin_lock_irq(&conf->device_lock); + conf->prev = conf->geo; + md_finish_reshape(conf->mddev); + smp_wmb(); + conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; + spin_unlock_irq(&conf->device_lock); + + if (conf->mddev->queue) + raid10_set_io_opt(conf); + conf->fullsync = 0; +} + +static void raid10_update_reshape_pos(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + sector_t lo, hi; + + md_cluster_ops->resync_info_get(mddev, &lo, &hi); + if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) + || mddev->reshape_position == MaxSector) + conf->reshape_progress = mddev->reshape_position; + else + WARN_ON_ONCE(1); +} + +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio) +{ + /* Use sync reads to get the blocks from somewhere else */ + int sectors = r10_bio->sectors; + struct r10conf *conf = mddev->private; + struct r10bio *r10b; + int slot = 0; + int idx = 0; + struct page **pages; + + r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); + if (!r10b) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + + /* reshape IOs share pages from .devs[0].bio */ + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; + + r10b->sector = r10_bio->sector; + __raid10_find_phys(&conf->prev, r10b); + + while (sectors) { + int s = sectors; + int success = 0; + int first_slot = slot; + + if (s > (PAGE_SIZE >> 9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + while (!success) { + int d = r10b->devs[slot].devnum; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + sector_t addr; + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + goto failed; + + addr = r10b->devs[slot].addr + idx * PAGE_SIZE; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev, + addr, + s << 9, + pages[idx], + REQ_OP_READ, false); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + failed: + slot++; + if (slot >= conf->copies) + slot = 0; + if (slot == first_slot) + break; + } + rcu_read_unlock(); + if (!success) { + /* couldn't read this block, must give up */ + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + kfree(r10b); + return -EIO; + } + sectors -= s; + idx++; + } + kfree(r10b); + return 0; +} + +static void end_reshape_write(struct bio *bio) +{ + struct r10bio *r10_bio = get_resync_r10bio(bio); + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + if (!rdev) { + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + + if (bio->bi_status) { + /* FIXME should record badblock */ + md_error(mddev, rdev); + } + + rdev_dec_pending(rdev, mddev); + end_reshape_request(r10_bio); +} + +static void end_reshape_request(struct r10bio *r10_bio) +{ + if (!atomic_dec_and_test(&r10_bio->remaining)) + return; + md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); + bio_put(r10_bio->master_bio); + put_buf(r10_bio); +} + +static void raid10_finish_reshape(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return; + + if (mddev->delta_disks > 0) { + if (mddev->recovery_cp > mddev->resync_max_sectors) { + mddev->recovery_cp = mddev->resync_max_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->resync_max_sectors = mddev->array_sectors; + } else { + int d; + rcu_read_lock(); + for (d = conf->geo.raid_disks ; + d < conf->geo.raid_disks - mddev->delta_disks; + d++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = rcu_dereference(conf->mirrors[d].replacement); + if (rdev) + clear_bit(In_sync, &rdev->flags); + } + rcu_read_unlock(); + } + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = 1 << conf->geo.chunk_shift; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; +} + +static struct md_personality raid10_personality = +{ + .name = "raid10", + .level = 10, + .owner = THIS_MODULE, + .make_request = raid10_make_request, + .run = raid10_run, + .free = raid10_free, + .status = raid10_status, + .error_handler = raid10_error, + .hot_add_disk = raid10_add_disk, + .hot_remove_disk= raid10_remove_disk, + .spare_active = raid10_spare_active, + .sync_request = raid10_sync_request, + .quiesce = raid10_quiesce, + .size = raid10_size, + .resize = raid10_resize, + .takeover = raid10_takeover, + .check_reshape = raid10_check_reshape, + .start_reshape = raid10_start_reshape, + .finish_reshape = raid10_finish_reshape, + .update_reshape_pos = raid10_update_reshape_pos, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&raid10_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&raid10_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); +MODULE_ALIAS("md-personality-9"); /* RAID10 */ +MODULE_ALIAS("md-raid10"); +MODULE_ALIAS("md-level-10"); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 000000000..8c072ce0b --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RAID10_H +#define _RAID10_H + +/* Note: raid10_info.rdev can be set to NULL asynchronously by + * raid10_remove_disk. + * There are three safe ways to access raid10_info.rdev. + * 1/ when holding mddev->reconfig_mutex + * 2/ when resync/recovery/reshape is known to be happening - i.e. in code + * that is called as part of performing resync/recovery/reshape. + * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer + * and if it is non-NULL, increment rdev->nr_pending before dropping the + * RCU lock. + * When .rdev is set to NULL, the nr_pending count checked again and if it has + * been incremented, the pointer is put back in .rdev. + */ + +struct raid10_info { + struct md_rdev *rdev, *replacement; + sector_t head_position; + int recovery_disabled; /* matches + * mddev->recovery_disabled + * when we shouldn't try + * recovering this device. + */ +}; + +struct r10conf { + struct mddev *mddev; + struct raid10_info *mirrors; + struct raid10_info *mirrors_new, *mirrors_old; + spinlock_t device_lock; + + /* geometry */ + struct geom { + int raid_disks; + int near_copies; /* number of copies laid out + * raid0 style */ + int far_copies; /* number of copies laid out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 + * stripe instead of many + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + int far_set_size; /* The number of devices in a set, + * where a 'set' are devices that + * contain far/offset copies of + * each other. + */ + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + } prev, geo; + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + + sector_t dev_sectors; /* temp copy of + * mddev->dev_sectors */ + sector_t reshape_progress; + sector_t reshape_safe; + unsigned long reshape_checkpoint; + sector_t offset_diff; + + struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; + + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + seqlock_t resync_lock; + atomic_t nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + int array_freeze_pending; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + int have_replacement; /* There is at least one + * replacement device. + */ + wait_queue_head_t wait_barrier; + + mempool_t r10bio_pool; + mempool_t r10buf_pool; + struct page *tmppage; + struct bio_set bio_split; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; +}; + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + unsigned long start_time; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + * We sometimes need an extra bio to write to the replacement. + */ + struct r10dev { + struct bio *bio; + union { + struct bio *repl_bio; /* used for resync and + * writes */ + struct md_rdev *rdev; /* used for reads + * (read_slot >= 0) */ + }; + sector_t addr; + int devnum; + } devs[]; +}; + +/* bits for r10bio.state */ +enum r10bio_state { + R10BIO_Uptodate, + R10BIO_IsSync, + R10BIO_IsRecover, + R10BIO_IsReshape, + R10BIO_Degraded, +/* Set ReadError on bios that experience a read error + * so that raid10d knows what to do with them. + */ + R10BIO_ReadError, +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag. + */ + R10BIO_MadeGood, + R10BIO_WriteError, +/* During a reshape we might be performing IO on the + * 'previous' part of the array, in which case this + * flag is set + */ + R10BIO_Previous, +/* failfast devices did receive failfast requests. */ + R10BIO_FailFast, + R10BIO_Discard, +}; +#endif diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c new file mode 100644 index 000000000..eb66d0bfe --- /dev/null +++ b/drivers/md/raid5-cache.c @@ -0,0 +1,3183 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Shaohua Li + * Copyright (C) 2016 Song Liu + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "raid5.h" +#include "md-bitmap.h" +#include "raid5-log.h" + +/* + * metadata/data stored in disk with 4k size unit (a block) regardless + * underneath hardware sector size. only works with PAGE_SIZE == 4096 + */ +#define BLOCK_SECTORS (8) +#define BLOCK_SECTOR_SHIFT (3) + +/* + * log->max_free_space is min(1/4 disk size, 10G reclaimable space). + * + * In write through mode, the reclaim runs every log->max_free_space. + * This can prevent the recovery scans for too long + */ +#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ +#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) + +/* wake up reclaim thread periodically */ +#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) +/* start flush with these full stripes */ +#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4) +/* reclaim stripes in groups */ +#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) + +/* + * We only need 2 bios per I/O unit to make progress, but ensure we + * have a few more available to not get too tight. + */ +#define R5L_POOL_SIZE 4 + +static char *r5c_journal_mode_str[] = {"write-through", + "write-back"}; +/* + * raid5 cache state machine + * + * With the RAID cache, each stripe works in two phases: + * - caching phase + * - writing-out phase + * + * These two phases are controlled by bit STRIPE_R5C_CACHING: + * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase + * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase + * + * When there is no journal, or the journal is in write-through mode, + * the stripe is always in writing-out phase. + * + * For write-back journal, the stripe is sent to caching phase on write + * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off + * the write-out phase by clearing STRIPE_R5C_CACHING. + * + * Stripes in caching phase do not write the raid disks. Instead, all + * writes are committed from the log device. Therefore, a stripe in + * caching phase handles writes as: + * - write to log device + * - return IO + * + * Stripes in writing-out phase handle writes as: + * - calculate parity + * - write pending data and parity to journal + * - write data and parity to raid disks + * - return IO for pending writes + */ + +struct r5l_log { + struct md_rdev *rdev; + + u32 uuid_checksum; + + sector_t device_size; /* log device size, round to + * BLOCK_SECTORS */ + sector_t max_free_space; /* reclaim run if free space is at + * this size */ + + sector_t last_checkpoint; /* log tail. where recovery scan + * starts from */ + u64 last_cp_seq; /* log tail sequence */ + + sector_t log_start; /* log head. where new data appends */ + u64 seq; /* log head sequence */ + + sector_t next_checkpoint; + + struct mutex io_mutex; + struct r5l_io_unit *current_io; /* current io_unit accepting new data */ + + spinlock_t io_list_lock; + struct list_head running_ios; /* io_units which are still running, + * and have not yet been completely + * written to the log */ + struct list_head io_end_ios; /* io_units which have been completely + * written to the log but not yet written + * to the RAID */ + struct list_head flushing_ios; /* io_units which are waiting for log + * cache flush */ + struct list_head finished_ios; /* io_units which settle down in log disk */ + struct bio flush_bio; + + struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ + + struct kmem_cache *io_kc; + mempool_t io_pool; + struct bio_set bs; + mempool_t meta_pool; + + struct md_thread *reclaim_thread; + unsigned long reclaim_target; /* number of space that need to be + * reclaimed. if it's 0, reclaim spaces + * used by io_units which are in + * IO_UNIT_STRIPE_END state (eg, reclaim + * doesn't wait for specific io_unit + * switching to IO_UNIT_STRIPE_END + * state) */ + wait_queue_head_t iounit_wait; + + struct list_head no_space_stripes; /* pending stripes, log has no space */ + spinlock_t no_space_stripes_lock; + + bool need_cache_flush; + + /* for r5c_cache */ + enum r5c_journal_mode r5c_journal_mode; + + /* all stripes in r5cache, in the order of seq at sh->log_start */ + struct list_head stripe_in_journal_list; + + spinlock_t stripe_in_journal_lock; + atomic_t stripe_in_journal_count; + + /* to submit async io_units, to fulfill ordering of flush */ + struct work_struct deferred_io_work; + /* to disable write back during in degraded mode */ + struct work_struct disable_writeback_work; + + /* to for chunk_aligned_read in writeback mode, details below */ + spinlock_t tree_lock; + struct radix_tree_root big_stripe_tree; +}; + +/* + * Enable chunk_aligned_read() with write back cache. + * + * Each chunk may contain more than one stripe (for example, a 256kB + * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For + * chunk_aligned_read, these stripes are grouped into one "big_stripe". + * For each big_stripe, we count how many stripes of this big_stripe + * are in the write back cache. These data are tracked in a radix tree + * (big_stripe_tree). We use radix_tree item pointer as the counter. + * r5c_tree_index() is used to calculate keys for the radix tree. + * + * chunk_aligned_read() calls r5c_big_stripe_cached() to look up + * big_stripe of each chunk in the tree. If this big_stripe is in the + * tree, chunk_aligned_read() aborts. This look up is protected by + * rcu_read_lock(). + * + * It is necessary to remember whether a stripe is counted in + * big_stripe_tree. Instead of adding new flag, we reuses existing flags: + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these + * two flags are set, the stripe is counted in big_stripe_tree. This + * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to + * r5c_try_caching_write(); and moving clear_bit of + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to + * r5c_finish_stripe_write_out(). + */ + +/* + * radix tree requests lowest 2 bits of data pointer to be 2b'00. + * So it is necessary to left shift the counter by 2 bits before using it + * as data pointer of the tree. + */ +#define R5C_RADIX_COUNT_SHIFT 2 + +/* + * calculate key for big_stripe_tree + * + * sect: align_bi->bi_iter.bi_sector or sh->sector + */ +static inline sector_t r5c_tree_index(struct r5conf *conf, + sector_t sect) +{ + sector_div(sect, conf->chunk_sectors); + return sect; +} + +/* + * an IO range starts from a meta data block and end at the next meta data + * block. The io unit's the meta data block tracks data/parity followed it. io + * unit is written to log disk with normal write, as we always flush log disk + * first and then start move data to raid disks, there is no requirement to + * write io unit with FLUSH/FUA + */ +struct r5l_io_unit { + struct r5l_log *log; + + struct page *meta_page; /* store meta block */ + int meta_offset; /* current offset in meta_page */ + + struct bio *current_bio;/* current_bio accepting new data */ + + atomic_t pending_stripe;/* how many stripes not flushed to raid */ + u64 seq; /* seq number of the metablock */ + sector_t log_start; /* where the io_unit starts */ + sector_t log_end; /* where the io_unit ends */ + struct list_head log_sibling; /* log->running_ios */ + struct list_head stripe_list; /* stripes added to the io_unit */ + + int state; + bool need_split_bio; + struct bio *split_bio; + + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ + /* + * io isn't sent yet, flush/fua request can only be submitted till it's + * the first IO in running_ios list + */ + unsigned int io_deferred:1; + + struct bio_list flush_barriers; /* size == 0 flush bios */ +}; + +/* r5l_io_unit state */ +enum r5l_io_unit_state { + IO_UNIT_RUNNING = 0, /* accepting new IO */ + IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, + * don't accepting new bio */ + IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ + IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ +}; + +bool r5c_is_writeback(struct r5l_log *log) +{ + return (log != NULL && + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); +} + +static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, + sector_t end) +{ + if (end >= start) + return end - start; + else + return end + log->device_size - start; +} + +static bool r5l_has_free_space(struct r5l_log *log, sector_t size) +{ + sector_t used_size; + + used_size = r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + + return log->device_size > used_size + size; +} + +static void __r5l_set_io_unit_state(struct r5l_io_unit *io, + enum r5l_io_unit_state state) +{ + if (WARN_ON(io->state >= state)) + return; + io->state = state; +} + +static void +r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) +{ + struct bio *wbi, *wbi2; + + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_iter.bi_sector < + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); + md_write_end(conf->mddev); + bio_endio(wbi); + wbi = wbi2; + } +} + +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks) +{ + int i; + + for (i = sh->disks; i--; ) { + if (sh->dev[i].written) { + set_bit(R5_UPTODATE, &sh->dev[i].flags); + r5c_return_dev_pending_writes(conf, &sh->dev[i]); + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); + } + } +} + +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); + +/* Check whether we should flush some stripes to free up stripe cache */ +void r5c_check_stripe_cache_usage(struct r5conf *conf) +{ + int total_cached; + + if (!r5c_is_writeback(conf->log)) + return; + + total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + + atomic_read(&conf->r5c_cached_full_stripes); + + /* + * The following condition is true for either of the following: + * - stripe cache pressure high: + * total_cached > 3/4 min_nr_stripes || + * empty_inactive_list_nr > 0 + * - stripe cache pressure moderate: + * total_cached > 1/2 min_nr_stripes + */ + if (total_cached > conf->min_nr_stripes * 1 / 2 || + atomic_read(&conf->empty_inactive_list_nr) > 0) + r5l_wake_reclaim(conf->log, 0); +} + +/* + * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full + * stripes in the cache + */ +void r5c_check_cached_full_stripe(struct r5conf *conf) +{ + if (!r5c_is_writeback(conf->log)) + return; + + /* + * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes + * or a full stripe (chunk size / 4k stripes). + */ + if (atomic_read(&conf->r5c_cached_full_stripes) >= + min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), + conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) + r5l_wake_reclaim(conf->log, 0); +} + +/* + * Total log space (in sectors) needed to flush all data in cache + * + * To avoid deadlock due to log space, it is necessary to reserve log + * space to flush critical stripes (stripes that occupying log space near + * last_checkpoint). This function helps check how much log space is + * required to flush all cached stripes. + * + * To reduce log space requirements, two mechanisms are used to give cache + * flush higher priorities: + * 1. In handle_stripe_dirtying() and schedule_reconstruction(), + * stripes ALREADY in journal can be flushed w/o pending writes; + * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal + * can be delayed (r5l_add_no_space_stripe). + * + * In cache flush, the stripe goes through 1 and then 2. For a stripe that + * already passed 1, flushing it requires at most (conf->max_degraded + 1) + * pages of journal space. For stripes that has not passed 1, flushing it + * requires (conf->raid_disks + 1) pages of journal space. There are at + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space + * required to flush all cached stripes (in pages) is: + * + * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks + 1) + * or + * (stripe_in_journal_count) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks - max_degraded) + */ +static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + + if (!r5c_is_writeback(log)) + return 0; + + return BLOCK_SECTORS * + ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + + (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); +} + +/* + * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL + * + * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of + * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log + * device is less than 2x of reclaim_required_space. + */ +static inline void r5c_update_log_state(struct r5l_log *log) +{ + struct r5conf *conf = log->rdev->mddev->private; + sector_t free_space; + sector_t reclaim_space; + bool wake_reclaim = false; + + if (!r5c_is_writeback(log)) + return; + + free_space = r5l_ring_distance(log, log->log_start, + log->last_checkpoint); + reclaim_space = r5c_log_required_to_flush_cache(conf); + if (free_space < 2 * reclaim_space) + set_bit(R5C_LOG_CRITICAL, &conf->cache_state); + else { + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) + wake_reclaim = true; + clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); + } + if (free_space < 3 * reclaim_space) + set_bit(R5C_LOG_TIGHT, &conf->cache_state); + else + clear_bit(R5C_LOG_TIGHT, &conf->cache_state); + + if (wake_reclaim) + r5l_wake_reclaim(log, 0); +} + +/* + * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. + * This function should only be called in write-back mode. + */ +void r5c_make_stripe_write_out(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + struct r5l_log *log = conf->log; + + BUG_ON(!r5c_is_writeback(log)); + + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + clear_bit(STRIPE_R5C_CACHING, &sh->state); + + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); +} + +static void r5c_handle_data_cached(struct stripe_head *sh) +{ + int i; + + for (i = sh->disks; i--; ) + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + set_bit(R5_InJournal, &sh->dev[i].flags); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + } + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); +} + +/* + * this journal write must contain full parity, + * it may also contain some data pages + */ +static void r5c_handle_parity_cached(struct stripe_head *sh) +{ + int i; + + for (i = sh->disks; i--; ) + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + set_bit(R5_Wantwrite, &sh->dev[i].flags); +} + +/* + * Setting proper flags after writing (or flushing) data and/or parity to the + * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). + */ +static void r5c_finish_cache_stripe(struct stripe_head *sh) +{ + struct r5l_log *log = sh->raid_conf->log; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + /* + * Set R5_InJournal for parity dev[pd_idx]. This means + * all data AND parity in the journal. For RAID 6, it is + * NOT necessary to set the flag for dev[qd_idx], as the + * two parities are written out together. + */ + set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { + r5c_handle_data_cached(sh); + } else { + r5c_handle_parity_cached(sh); + set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + } +} + +static void r5l_io_run_stripes(struct r5l_io_unit *io) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { + list_del_init(&sh->log_list); + + r5c_finish_cache_stripe(sh); + + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static void r5l_log_run_stripes(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + lockdep_assert_held(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + + list_move_tail(&io->log_sibling, &log->finished_ios); + r5l_io_run_stripes(io); + } +} + +static void r5l_move_to_end_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + lockdep_assert_held(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + list_move_tail(&io->log_sibling, &log->io_end_ios); + } +} + +static void __r5l_stripe_write_finished(struct r5l_io_unit *io); +static void r5l_log_endio(struct bio *bio) +{ + struct r5l_io_unit *io = bio->bi_private; + struct r5l_io_unit *io_deferred; + struct r5l_log *log = io->log; + unsigned long flags; + bool has_null_flush; + bool has_flush_payload; + + if (bio->bi_status) + md_error(log->rdev->mddev, log->rdev); + + bio_put(bio); + mempool_free(io->meta_page, &log->meta_pool); + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + + if (log->need_cache_flush && !list_empty(&io->stripe_list)) + r5l_move_to_end_ios(log); + else + r5l_log_run_stripes(log); + if (!list_empty(&log->running_ios)) { + /* + * FLUSH/FUA io_unit is deferred because of ordering, now we + * can dispatch it + */ + io_deferred = list_first_entry(&log->running_ios, + struct r5l_io_unit, log_sibling); + if (io_deferred->io_deferred) + schedule_work(&log->deferred_io_work); + } + + spin_unlock_irqrestore(&log->io_list_lock, flags); + + if (log->need_cache_flush) + md_wakeup_thread(log->rdev->mddev->thread); + + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { + struct bio *bi; + + WARN_ON(bio_list_empty(&io->flush_barriers)); + while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { + bio_endio(bi); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } + } + } + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); +} + +static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) +{ + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + /* + * In case of journal device failures, submit_bio will get error + * and calls endio, then active stripes will continue write + * process. Therefore, it is not necessary to check Faulty bit + * of journal device here. + * + * We can't check split_bio after current_bio is submitted. If + * io->split_bio is null, after current_bio is submitted, current_bio + * might already be completed and the io_unit is freed. We submit + * split_bio first to avoid the issue. + */ + if (io->split_bio) { + if (io->has_flush) + io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); + } + + if (io->has_flush) + io->current_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->current_bio->bi_opf |= REQ_FUA; + submit_bio(io->current_bio); +} + +/* deferred io_unit will be dispatched here */ +static void r5l_submit_io_async(struct work_struct *work) +{ + struct r5l_log *log = container_of(work, struct r5l_log, + deferred_io_work); + struct r5l_io_unit *io = NULL; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + if (!list_empty(&log->running_ios)) { + io = list_first_entry(&log->running_ios, struct r5l_io_unit, + log_sibling); + if (!io->io_deferred) + io = NULL; + else + io->io_deferred = 0; + } + spin_unlock_irqrestore(&log->io_list_lock, flags); + if (io) + r5l_do_submit_io(log, io); +} + +static void r5c_disable_writeback_async(struct work_struct *work) +{ + struct r5l_log *log = container_of(work, struct r5l_log, + disable_writeback_work); + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return; + pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", + mdname(mddev)); + + /* wait superblock change before suspend */ + wait_event(mddev->sb_wait, + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } +} + +static void r5l_submit_current_io(struct r5l_log *log) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_meta_block *block; + unsigned long flags; + u32 crc; + bool do_submit = true; + + if (!io) + return; + + block = page_address(io->meta_page); + block->meta_size = cpu_to_le32(io->meta_offset); + crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + block->checksum = cpu_to_le32(crc); + + log->current_io = NULL; + spin_lock_irqsave(&log->io_list_lock, flags); + if (io->has_flush || io->has_fua) { + if (io != list_first_entry(&log->running_ios, + struct r5l_io_unit, log_sibling)) { + io->io_deferred = 1; + do_submit = false; + } + } + spin_unlock_irqrestore(&log->io_list_lock, flags); + if (do_submit) + r5l_do_submit_io(log, io); +} + +static struct bio *r5l_bio_alloc(struct r5l_log *log) +{ + struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS, + REQ_OP_WRITE, GFP_NOIO, &log->bs); + + bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; + + return bio; +} + +static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) +{ + log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); + + r5c_update_log_state(log); + /* + * If we filled up the log device start from the beginning again, + * which will require a new bio. + * + * Note: for this to work properly the log size needs to me a multiple + * of BLOCK_SECTORS. + */ + if (log->log_start == 0) + io->need_split_bio = true; + + io->log_end = log->log_start; +} + +static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) +{ + struct r5l_io_unit *io; + struct r5l_meta_block *block; + + io = mempool_alloc(&log->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + memset(io, 0, sizeof(*io)); + + io->log = log; + INIT_LIST_HEAD(&io->log_sibling); + INIT_LIST_HEAD(&io->stripe_list); + bio_list_init(&io->flush_barriers); + io->state = IO_UNIT_RUNNING; + + io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO); + block = page_address(io->meta_page); + clear_page(block); + block->magic = cpu_to_le32(R5LOG_MAGIC); + block->version = R5LOG_VERSION; + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct r5l_meta_block); + io->seq = log->seq++; + + io->current_bio = r5l_bio_alloc(log); + io->current_bio->bi_end_io = r5l_log_endio; + io->current_bio->bi_private = io; + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + r5_reserve_log_entry(log, io); + + spin_lock_irq(&log->io_list_lock); + list_add_tail(&io->log_sibling, &log->running_ios); + spin_unlock_irq(&log->io_list_lock); + + return io; +} + +static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) +{ + if (log->current_io && + log->current_io->meta_offset + payload_size > PAGE_SIZE) + r5l_submit_current_io(log); + + if (!log->current_io) { + log->current_io = r5l_new_meta(log); + if (!log->current_io) + return -ENOMEM; + } + + return 0; +} + +static void r5l_append_payload_meta(struct r5l_log *log, u16 type, + sector_t location, + u32 checksum1, u32 checksum2, + bool checksum2_valid) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_payload_data_parity *payload; + + payload = page_address(io->meta_page) + io->meta_offset; + payload->header.type = cpu_to_le16(type); + payload->header.flags = cpu_to_le16(0); + payload->size = cpu_to_le32((1 + !!checksum2_valid) << + (PAGE_SHIFT - 9)); + payload->location = cpu_to_le64(location); + payload->checksum[0] = cpu_to_le32(checksum1); + if (checksum2_valid) + payload->checksum[1] = cpu_to_le32(checksum2); + + io->meta_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * (1 + !!checksum2_valid); +} + +static void r5l_append_payload_page(struct r5l_log *log, struct page *page) +{ + struct r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + BUG_ON(io->split_bio); + io->split_bio = io->current_bio; + io->current_bio = r5l_bio_alloc(log); + bio_chain(io->current_bio, io->split_bio); + io->need_split_bio = false; + } + + if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) + BUG(); + + r5_reserve_log_entry(log, io); +} + +static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct r5l_io_unit *io; + struct r5l_payload_flush *payload; + int meta_size; + + /* + * payload_flush requires extra writes to the journal. + * To avoid handling the extra IO in quiesce, just skip + * flush_payload + */ + if (conf->quiesce) + return; + + mutex_lock(&log->io_mutex); + meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64); + + if (r5l_get_meta(log, meta_size)) { + mutex_unlock(&log->io_mutex); + return; + } + + /* current implementation is one stripe per flush payload */ + io = log->current_io; + payload = page_address(io->meta_page) + io->meta_offset; + payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH); + payload->header.flags = cpu_to_le16(0); + payload->size = cpu_to_le32(sizeof(__le64)); + payload->flush_stripes[0] = cpu_to_le64(sect); + io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } + mutex_unlock(&log->io_mutex); +} + +static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, + int data_pages, int parity_pages) +{ + int i; + int meta_size; + int ret; + struct r5l_io_unit *io; + + meta_size = + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) + * data_pages) + + sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * parity_pages; + + ret = r5l_get_meta(log, meta_size); + if (ret) + return ret; + + io = log->current_io; + + if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) + io->has_flush = 1; + + for (i = 0; i < sh->disks; i++) { + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || + test_bit(R5_InJournal, &sh->dev[i].flags)) + continue; + if (i == sh->pd_idx || i == sh->qd_idx) + continue; + if (test_bit(R5_WantFUA, &sh->dev[i].flags) && + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { + io->has_fua = 1; + /* + * we need to flush journal to make sure recovery can + * reach the data with fua flag + */ + io->has_flush = 1; + } + r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, + raid5_compute_blocknr(sh, i, 0), + sh->dev[i].log_checksum, 0, false); + r5l_append_payload_page(log, sh->dev[i].page); + } + + if (parity_pages == 2) { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + sh->dev[sh->qd_idx].log_checksum, true); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); + } else if (parity_pages == 1) { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + 0, false); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + } else /* Just writing data, not parity, in caching phase */ + BUG_ON(parity_pages != 0); + + list_add_tail(&sh->log_list, &io->stripe_list); + atomic_inc(&io->pending_stripe); + sh->log_io = io; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return 0; + + if (sh->log_start == MaxSector) { + BUG_ON(!list_empty(&sh->r5c)); + sh->log_start = io->log_start; + spin_lock_irq(&log->stripe_in_journal_lock); + list_add_tail(&sh->r5c, + &log->stripe_in_journal_list); + spin_unlock_irq(&log->stripe_in_journal_lock); + atomic_inc(&log->stripe_in_journal_count); + } + return 0; +} + +/* add stripe to no_space_stripes, and then wake up reclaim */ +static inline void r5l_add_no_space_stripe(struct r5l_log *log, + struct stripe_head *sh) +{ + spin_lock(&log->no_space_stripes_lock); + list_add_tail(&sh->log_list, &log->no_space_stripes); + spin_unlock(&log->no_space_stripes_lock); +} + +/* + * running in raid5d, where reclaim could wait for raid5d too (when it flushes + * data from log to raid disks), so we shouldn't wait for reclaim here + */ +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int write_disks = 0; + int data_pages, parity_pages; + int reserve; + int i; + int ret = 0; + bool wake_reclaim = false; + + if (!log) + return -EAGAIN; + /* Don't support stripe batch */ + if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || + test_bit(STRIPE_SYNCING, &sh->state)) { + /* the stripe is written to log, we start writing it to raid */ + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); + return -EAGAIN; + } + + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + + for (i = 0; i < sh->disks; i++) { + void *addr; + + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || + test_bit(R5_InJournal, &sh->dev[i].flags)) + continue; + + write_disks++; + /* checksum is already calculated in last run */ + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + continue; + addr = kmap_atomic(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_atomic(addr); + } + parity_pages = 1 + !!(sh->qd_idx >= 0); + data_pages = write_disks - parity_pages; + + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + /* + * The stripe must enter state machine again to finish the write, so + * don't delay. + */ + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + mutex_lock(&log->io_mutex); + /* meta + data */ + reserve = (1 + write_disks) << (PAGE_SHIFT - 9); + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + if (!r5l_has_free_space(log, reserve)) { + r5l_add_no_space_stripe(log, sh); + wake_reclaim = true; + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, + &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + } + } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ + /* + * log space critical, do not process stripes that are + * not in cache yet (sh->log_start == MaxSector). + */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + sh->log_start == MaxSector) { + r5l_add_no_space_stripe(log, sh); + wake_reclaim = true; + reserve = 0; + } else if (!r5l_has_free_space(log, reserve)) { + if (sh->log_start == log->last_checkpoint) + BUG(); + else + r5l_add_no_space_stripe(log, sh); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, + &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + } + } + + mutex_unlock(&log->io_mutex); + if (wake_reclaim) + r5l_wake_reclaim(log, reserve); + return 0; +} + +void r5l_write_stripe_run(struct r5l_log *log) +{ + if (!log) + return; + mutex_lock(&log->io_mutex); + r5l_submit_current_io(log); + mutex_unlock(&log->io_mutex); +} + +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) +{ + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + /* + * in write through (journal only) + * we flush log disk cache first, then write stripe data to + * raid disks. So if bio is finished, the log disk cache is + * flushed already. The recovery guarantees we can recovery + * the bio from log disk, so we don't need to flush again + */ + if (bio->bi_iter.bi_size == 0) { + bio_endio(bio); + return 0; + } + bio->bi_opf &= ~REQ_PREFLUSH; + } else { + /* write back (with cache) */ + if (bio->bi_iter.bi_size == 0) { + mutex_lock(&log->io_mutex); + r5l_get_meta(log, 0); + bio_list_add(&log->current_io->flush_barriers, bio); + log->current_io->has_flush = 1; + log->current_io->has_null_flush = 1; + atomic_inc(&log->current_io->pending_stripe); + r5l_submit_current_io(log); + mutex_unlock(&log->io_mutex); + return 0; + } + } + return -EAGAIN; +} + +/* This will run after log space is reclaimed */ +static void r5l_run_no_space_stripes(struct r5l_log *log) +{ + struct stripe_head *sh; + + spin_lock(&log->no_space_stripes_lock); + while (!list_empty(&log->no_space_stripes)) { + sh = list_first_entry(&log->no_space_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + spin_unlock(&log->no_space_stripes_lock); +} + +/* + * calculate new last_checkpoint + * for write through mode, returns log->next_checkpoint + * for write back, returns log_start of first sh in stripe_in_journal_list + */ +static sector_t r5c_calculate_new_cp(struct r5conf *conf) +{ + struct stripe_head *sh; + struct r5l_log *log = conf->log; + sector_t new_cp; + unsigned long flags; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return log->next_checkpoint; + + spin_lock_irqsave(&log->stripe_in_journal_lock, flags); + if (list_empty(&conf->log->stripe_in_journal_list)) { + /* all stripes flushed */ + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + return log->next_checkpoint; + } + sh = list_first_entry(&conf->log->stripe_in_journal_list, + struct stripe_head, r5c); + new_cp = sh->log_start; + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + return new_cp; +} + +static sector_t r5l_reclaimable_space(struct r5l_log *log) +{ + struct r5conf *conf = log->rdev->mddev->private; + + return r5l_ring_distance(log, log->last_checkpoint, + r5c_calculate_new_cp(conf)); +} + +static void r5l_run_no_mem_stripe(struct r5l_log *log) +{ + struct stripe_head *sh; + + lockdep_assert_held(&log->io_list_lock); + + if (!list_empty(&log->no_mem_stripes)) { + sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static bool r5l_complete_finished_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + bool found = false; + + lockdep_assert_held(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_STRIPE_END) + break; + + log->next_checkpoint = io->log_start; + + list_del(&io->log_sibling); + mempool_free(io, &log->io_pool); + r5l_run_no_mem_stripe(log); + + found = true; + } + + return found; +} + +static void __r5l_stripe_write_finished(struct r5l_io_unit *io) +{ + struct r5l_log *log = io->log; + struct r5conf *conf = log->rdev->mddev->private; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); + + if (!r5l_complete_finished_ios(log)) { + spin_unlock_irqrestore(&log->io_list_lock, flags); + return; + } + + if (r5l_reclaimable_space(log) > log->max_free_space || + test_bit(R5C_LOG_TIGHT, &conf->cache_state)) + r5l_wake_reclaim(log, 0); + + spin_unlock_irqrestore(&log->io_list_lock, flags); + wake_up(&log->iounit_wait); +} + +void r5l_stripe_write_finished(struct stripe_head *sh) +{ + struct r5l_io_unit *io; + + io = sh->log_io; + sh->log_io = NULL; + + if (io && atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); +} + +static void r5l_log_flush_endio(struct bio *bio) +{ + struct r5l_log *log = container_of(bio, struct r5l_log, + flush_bio); + unsigned long flags; + struct r5l_io_unit *io; + + if (bio->bi_status) + md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); + + spin_lock_irqsave(&log->io_list_lock, flags); + list_for_each_entry(io, &log->flushing_ios, log_sibling) + r5l_io_run_stripes(io); + list_splice_tail_init(&log->flushing_ios, &log->finished_ios); + spin_unlock_irqrestore(&log->io_list_lock, flags); +} + +/* + * Starting dispatch IO to raid. + * io_unit(meta) consists of a log. There is one situation we want to avoid. A + * broken meta in the middle of a log causes recovery can't find meta at the + * head of log. If operations require meta at the head persistent in log, we + * must make sure meta before it persistent in log too. A case is: + * + * stripe data/parity is in log, we start write stripe to raid disks. stripe + * data/parity must be persistent in log before we do the write to raid disks. + * + * The solution is we restrictly maintain io_unit list order. In this case, we + * only write stripes of an io_unit to raid disks till the io_unit is the first + * one whose data/parity is in log. + */ +void r5l_flush_stripe_to_raid(struct r5l_log *log) +{ + bool do_flush; + + if (!log || !log->need_cache_flush) + return; + + spin_lock_irq(&log->io_list_lock); + /* flush bio is running */ + if (!list_empty(&log->flushing_ios)) { + spin_unlock_irq(&log->io_list_lock); + return; + } + list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); + do_flush = !list_empty(&log->flushing_ios); + spin_unlock_irq(&log->io_list_lock); + + if (!do_flush) + return; + bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0, + REQ_OP_WRITE | REQ_PREFLUSH); + log->flush_bio.bi_end_io = r5l_log_flush_endio; + submit_bio(&log->flush_bio); +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp); +static void r5l_write_super_and_discard_space(struct r5l_log *log, + sector_t end) +{ + struct block_device *bdev = log->rdev->bdev; + struct mddev *mddev; + + r5l_write_super(log, end); + + if (!bdev_max_discard_sectors(bdev)) + return; + + mddev = log->rdev->mddev; + /* + * Discard could zero data, so before discard we must make sure + * superblock is updated to new log tail. Updating superblock (either + * directly call md_update_sb() or depend on md thread) must hold + * reconfig mutex. On the other hand, raid5_quiesce is called with + * reconfig_mutex hold. The first step of raid5_quiesce() is waiting + * for all IO finish, hence waiting for reclaim thread, while reclaim + * thread is calling this function and waiting for reconfig mutex. So + * there is a deadlock. We workaround this issue with a trylock. + * FIXME: we could miss discard if we can't take reconfig mutex + */ + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + if (!mddev_trylock(mddev)) + return; + md_update_sb(mddev, 1); + mddev_unlock(mddev); + + /* discard IO error really doesn't matter, ignore it */ + if (log->last_checkpoint < end) { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + end - log->last_checkpoint, GFP_NOIO); + } else { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + log->device_size - log->last_checkpoint, + GFP_NOIO); + blkdev_issue_discard(bdev, log->rdev->data_offset, end, + GFP_NOIO); + } +} + +/* + * r5c_flush_stripe moves stripe from cached list to handle_list. When called, + * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. + * + * must hold conf->device_lock + */ +static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + BUG_ON(list_empty(&sh->lru)); + BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + + /* + * The stripe is not ON_RELEASE_LIST, so it is safe to call + * raid5_release_stripe() while holding conf->device_lock + */ + BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); + lockdep_assert_held(&conf->device_lock); + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + + set_bit(STRIPE_HANDLE, &sh->state); + atomic_inc(&conf->active_stripes); + r5c_make_stripe_write_out(sh); + + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_flushing_partial_stripes); + else + atomic_inc(&conf->r5c_flushing_full_stripes); + raid5_release_stripe(sh); +} + +/* + * if num == 0, flush all full stripes + * if num > 0, flush all full stripes. If less than num full stripes are + * flushed, flush some partial stripes until totally num stripes are + * flushed or there is no more cached stripes. + */ +void r5c_flush_cache(struct r5conf *conf, int num) +{ + int count; + struct stripe_head *sh, *next; + + lockdep_assert_held(&conf->device_lock); + if (!conf->log) + return; + + count = 0; + list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { + r5c_flush_stripe(conf, sh); + count++; + } + + if (count >= num) + return; + list_for_each_entry_safe(sh, next, + &conf->r5c_partial_stripe_list, lru) { + r5c_flush_stripe(conf, sh); + if (++count >= num) + break; + } +} + +static void r5c_do_reclaim(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + struct stripe_head *sh; + int count = 0; + unsigned long flags; + int total_cached; + int stripes_to_flush; + int flushing_partial, flushing_full; + + if (!r5c_is_writeback(log)) + return; + + flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); + flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); + total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + + atomic_read(&conf->r5c_cached_full_stripes) - + flushing_full - flushing_partial; + + if (total_cached > conf->min_nr_stripes * 3 / 4 || + atomic_read(&conf->empty_inactive_list_nr) > 0) + /* + * if stripe cache pressure high, flush all full stripes and + * some partial stripes + */ + stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; + else if (total_cached > conf->min_nr_stripes * 1 / 2 || + atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > + R5C_FULL_STRIPE_FLUSH_BATCH(conf)) + /* + * if stripe cache pressure moderate, or if there is many full + * stripes,flush all full stripes + */ + stripes_to_flush = 0; + else + /* no need to flush */ + stripes_to_flush = -1; + + if (stripes_to_flush >= 0) { + spin_lock_irqsave(&conf->device_lock, flags); + r5c_flush_cache(conf, stripes_to_flush); + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + /* if log space is tight, flush stripes on stripe_in_journal_list */ + if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { + spin_lock_irqsave(&log->stripe_in_journal_lock, flags); + spin_lock(&conf->device_lock); + list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { + /* + * stripes on stripe_in_journal_list could be in any + * state of the stripe_cache state machine. In this + * case, we only want to flush stripe on + * r5c_cached_full/partial_stripes. The following + * condition makes sure the stripe is on one of the + * two lists. + */ + if (!list_empty(&sh->lru) && + !test_bit(STRIPE_HANDLE, &sh->state) && + atomic_read(&sh->count) == 0) { + r5c_flush_stripe(conf, sh); + if (count++ >= R5C_RECLAIM_STRIPE_GROUP) + break; + } + } + spin_unlock(&conf->device_lock); + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + } + + if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) + r5l_run_no_space_stripes(log); + + md_wakeup_thread(conf->mddev->thread); +} + +static void r5l_do_reclaim(struct r5l_log *log) +{ + struct r5conf *conf = log->rdev->mddev->private; + sector_t reclaim_target = xchg(&log->reclaim_target, 0); + sector_t reclaimable; + sector_t next_checkpoint; + bool write_super; + + spin_lock_irq(&log->io_list_lock); + write_super = r5l_reclaimable_space(log) > log->max_free_space || + reclaim_target != 0 || !list_empty(&log->no_space_stripes); + /* + * move proper io_unit to reclaim list. We should not change the order. + * reclaimable/unreclaimable io_unit can be mixed in the list, we + * shouldn't reuse space of an unreclaimable io_unit + */ + while (1) { + reclaimable = r5l_reclaimable_space(log); + if (reclaimable >= reclaim_target || + (list_empty(&log->running_ios) && + list_empty(&log->io_end_ios) && + list_empty(&log->flushing_ios) && + list_empty(&log->finished_ios))) + break; + + md_wakeup_thread(log->rdev->mddev->thread); + wait_event_lock_irq(log->iounit_wait, + r5l_reclaimable_space(log) > reclaimable, + log->io_list_lock); + } + + next_checkpoint = r5c_calculate_new_cp(conf); + spin_unlock_irq(&log->io_list_lock); + + if (reclaimable == 0 || !write_super) + return; + + /* + * write_super will flush cache of each raid disk. We must write super + * here, because the log area might be reused soon and we don't want to + * confuse recovery + */ + r5l_write_super_and_discard_space(log, next_checkpoint); + + mutex_lock(&log->io_mutex); + log->last_checkpoint = next_checkpoint; + r5c_update_log_state(log); + mutex_unlock(&log->io_mutex); + + r5l_run_no_space_stripes(log); +} + +static void r5l_reclaim_thread(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + r5c_do_reclaim(conf); + r5l_do_reclaim(log); +} + +void r5l_wake_reclaim(struct r5l_log *log, sector_t space) +{ + unsigned long target; + unsigned long new = (unsigned long)space; /* overflow in theory */ + + if (!log) + return; + do { + target = log->reclaim_target; + if (new < target) + return; + } while (cmpxchg(&log->reclaim_target, target, new) != target); + md_wakeup_thread(log->reclaim_thread); +} + +void r5l_quiesce(struct r5l_log *log, int quiesce) +{ + struct mddev *mddev; + + if (quiesce) { + /* make sure r5l_write_super_and_discard_space exits */ + mddev = log->rdev->mddev; + wake_up(&mddev->sb_wait); + kthread_park(log->reclaim_thread->tsk); + r5l_wake_reclaim(log, MaxSector); + r5l_do_reclaim(log); + } else + kthread_unpark(log->reclaim_thread->tsk); +} + +bool r5l_log_disk_error(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + + /* don't allow write if journal disk is missing */ + if (!log) + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + return test_bit(Faulty, &log->rdev->flags); +} + +#define R5L_RECOVERY_PAGE_POOL_SIZE 256 + +struct r5l_recovery_ctx { + struct page *meta_page; /* current meta */ + sector_t meta_total_blocks; /* total size of current meta and data */ + sector_t pos; /* recovery position */ + u64 seq; /* recovery position seq */ + int data_parity_stripes; /* number of data_parity stripes */ + int data_only_stripes; /* number of data_only stripes */ + struct list_head cached_list; + + /* + * read ahead page pool (ra_pool) + * in recovery, log is read sequentially. It is not efficient to + * read every page with sync_page_io(). The read ahead page pool + * reads multiple pages with one IO, so further log read can + * just copy data from the pool. + */ + struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; + struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE]; + sector_t pool_offset; /* offset of first page in the pool */ + int total_pages; /* total allocated pages */ + int valid_pages; /* pages with valid data */ +}; + +static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page; + + ctx->valid_pages = 0; + ctx->total_pages = 0; + while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { + page = alloc_page(GFP_KERNEL); + + if (!page) + break; + ctx->ra_pool[ctx->total_pages] = page; + ctx->total_pages += 1; + } + + if (ctx->total_pages == 0) + return -ENOMEM; + + ctx->pool_offset = 0; + return 0; +} + +static void r5l_recovery_free_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + int i; + + for (i = 0; i < ctx->total_pages; ++i) + put_page(ctx->ra_pool[i]); +} + +/* + * fetch ctx->valid_pages pages from offset + * In normal cases, ctx->valid_pages == ctx->total_pages after the call. + * However, if the offset is close to the end of the journal device, + * ctx->valid_pages could be smaller than ctx->total_pages + */ +static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + sector_t offset) +{ + struct bio bio; + int ret; + + bio_init(&bio, log->rdev->bdev, ctx->ra_bvec, + R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ); + bio.bi_iter.bi_sector = log->rdev->data_offset + offset; + + ctx->valid_pages = 0; + ctx->pool_offset = offset; + + while (ctx->valid_pages < ctx->total_pages) { + __bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, + 0); + ctx->valid_pages += 1; + + offset = r5l_ring_add(log, offset, BLOCK_SECTORS); + + if (offset == 0) /* reached end of the device */ + break; + } + + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + return ret; +} + +/* + * try read a page from the read ahead page pool, if the page is not in the + * pool, call r5l_recovery_fetch_ra_pool + */ +static int r5l_recovery_read_page(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + struct page *page, + sector_t offset) +{ + int ret; + + if (offset < ctx->pool_offset || + offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) { + ret = r5l_recovery_fetch_ra_pool(log, ctx, offset); + if (ret) + return ret; + } + + BUG_ON(offset < ctx->pool_offset || + offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS); + + memcpy(page_address(page), + page_address(ctx->ra_pool[(offset - ctx->pool_offset) >> + BLOCK_SECTOR_SHIFT]), + PAGE_SIZE); + return 0; +} + +static int r5l_recovery_read_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page = ctx->meta_page; + struct r5l_meta_block *mb; + u32 crc, stored_crc; + int ret; + + ret = r5l_recovery_read_page(log, ctx, page, ctx->pos); + if (ret != 0) + return ret; + + mb = page_address(page); + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + le64_to_cpu(mb->seq) != ctx->seq || + mb->version != R5LOG_VERSION || + le64_to_cpu(mb->position) != ctx->pos) + return -EINVAL; + + crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != crc) + return -EINVAL; + + if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) + return -EINVAL; + + ctx->meta_total_blocks = BLOCK_SECTORS; + + return 0; +} + +static void +r5l_recovery_create_empty_meta_block(struct r5l_log *log, + struct page *page, + sector_t pos, u64 seq) +{ + struct r5l_meta_block *mb; + + mb = page_address(page); + clear_page(mb); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); +} + +static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) +{ + struct page *page; + struct r5l_meta_block *mb; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + r5l_recovery_create_empty_meta_block(log, page, pos, seq); + mb = page_address(page); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | + REQ_SYNC | REQ_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; +} + +/* + * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite + * to mark valid (potentially not flushed) data in the journal. + * + * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, + * so there should not be any mismatch here. + */ +static void r5l_recovery_load_data(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int dd_idx; + + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &dd_idx, sh); + r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset); + sh->dev[dd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + ctx->meta_total_blocks += BLOCK_SECTORS; + + set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); + set_bit(STRIPE_R5C_CACHING, &sh->state); +} + +static void r5l_recovery_load_parity(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset); + sh->dev[sh->pd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); + + if (sh->qd_idx >= 0) { + r5l_recovery_read_page( + log, ctx, sh->dev[sh->qd_idx].page, + r5l_ring_add(log, log_offset, BLOCK_SECTORS)); + sh->dev[sh->qd_idx].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); + } + clear_bit(STRIPE_R5C_CACHING, &sh->state); +} + +static void r5l_recovery_reset_stripe(struct stripe_head *sh) +{ + int i; + + sh->state = 0; + sh->log_start = MaxSector; + for (i = sh->disks; i--; ) + sh->dev[i].flags = 0; +} + +static void +r5l_recovery_replay_one_stripe(struct r5conf *conf, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx) +{ + struct md_rdev *rdev, *rrdev; + int disk_index; + int data_count = 0; + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) + continue; + if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) + continue; + data_count++; + } + + /* + * stripes that only have parity must have been flushed + * before the crash that we are now recovering from, so + * there is nothing more to recovery. + */ + if (data_count == 0) + goto out; + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) + continue; + + /* in case device is broken */ + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[disk_index].rdev); + if (rdev) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + sync_page_io(rdev, sh->sector, PAGE_SIZE, + sh->dev[disk_index].page, REQ_OP_WRITE, + false); + rdev_dec_pending(rdev, rdev->mddev); + rcu_read_lock(); + } + rrdev = rcu_dereference(conf->disks[disk_index].replacement); + if (rrdev) { + atomic_inc(&rrdev->nr_pending); + rcu_read_unlock(); + sync_page_io(rrdev, sh->sector, PAGE_SIZE, + sh->dev[disk_index].page, REQ_OP_WRITE, + false); + rdev_dec_pending(rrdev, rrdev->mddev); + rcu_read_lock(); + } + rcu_read_unlock(); + } + ctx->data_parity_stripes++; +out: + r5l_recovery_reset_stripe(sh); +} + +static struct stripe_head * +r5c_recovery_alloc_stripe( + struct r5conf *conf, + sector_t stripe_sect, + int noblock) +{ + struct stripe_head *sh; + + sh = raid5_get_active_stripe(conf, NULL, stripe_sect, + noblock ? R5_GAS_NOBLOCK : 0); + if (!sh) + return NULL; /* no more stripe available */ + + r5l_recovery_reset_stripe(sh); + + return sh; +} + +static struct stripe_head * +r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) +{ + struct stripe_head *sh; + + list_for_each_entry(sh, list, lru) + if (sh->sector == sect) + return sh; + return NULL; +} + +static void +r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { + r5l_recovery_reset_stripe(sh); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } +} + +static void +r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, cached_stripe_list, lru) + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } +} + +/* if matches return 0; otherwise return -EINVAL */ +static int +r5l_recovery_verify_data_checksum(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + struct page *page, + sector_t log_offset, __le32 log_checksum) +{ + void *addr; + u32 checksum; + + r5l_recovery_read_page(log, ctx, page, log_offset); + addr = kmap_atomic(page); + checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_atomic(addr); + return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; +} + +/* + * before loading data to stripe cache, we need verify checksum for all data, + * if there is mismatch for any data page, we drop all data in the mata block + */ +static int +r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct r5l_meta_block *mb = page_address(ctx->meta_page); + sector_t mb_offset = sizeof(struct r5l_meta_block); + sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + struct page *page; + struct r5l_payload_data_parity *payload; + struct r5l_payload_flush *payload_flush; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + while (mb_offset < le32_to_cpu(mb->meta_size)) { + payload = (void *)mb + mb_offset; + payload_flush = (void *)mb + mb_offset; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + if (r5l_recovery_verify_data_checksum( + log, ctx, page, log_offset, + payload->checksum[0]) < 0) + goto mismatch; + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + if (r5l_recovery_verify_data_checksum( + log, ctx, page, log_offset, + payload->checksum[0]) < 0) + goto mismatch; + if (conf->max_degraded == 2 && /* q for RAID 6 */ + r5l_recovery_verify_data_checksum( + log, ctx, page, + r5l_ring_add(log, log_offset, + BLOCK_SECTORS), + payload->checksum[1]) < 0) + goto mismatch; + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { + /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ + goto mismatch; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { + mb_offset += sizeof(struct r5l_payload_flush) + + le32_to_cpu(payload_flush->size); + } else { + /* DATA or PARITY payload */ + log_offset = r5l_ring_add(log, log_offset, + le32_to_cpu(payload->size)); + mb_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + } + + } + + put_page(page); + return 0; + +mismatch: + put_page(page); + return -EINVAL; +} + +/* + * Analyze all data/parity pages in one meta block + * Returns: + * 0 for success + * -EINVAL for unknown playload type + * -EAGAIN for checksum mismatch of data page + * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) + */ +static int +r5c_recovery_analyze_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + struct list_head *cached_stripe_list) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct r5l_meta_block *mb; + struct r5l_payload_data_parity *payload; + struct r5l_payload_flush *payload_flush; + int mb_offset; + sector_t log_offset; + sector_t stripe_sect; + struct stripe_head *sh; + int ret; + + /* + * for mismatch in data blocks, we will drop all data in this mb, but + * we will still read next mb for other data with FLUSH flag, as + * io_unit could finish out of order. + */ + ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); + if (ret == -EINVAL) + return -EAGAIN; + else if (ret) + return ret; /* -ENOMEM duo to alloc_page() failed */ + + mb = page_address(ctx->meta_page); + mb_offset = sizeof(struct r5l_meta_block); + log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + while (mb_offset < le32_to_cpu(mb->meta_size)) { + int dd; + + payload = (void *)mb + mb_offset; + payload_flush = (void *)mb + mb_offset; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { + int i, count; + + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); + for (i = 0; i < count; ++i) { + stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); + sh = r5c_recovery_lookup_stripe(cached_stripe_list, + stripe_sect); + if (sh) { + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5l_recovery_reset_stripe(sh); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + } + + mb_offset += sizeof(struct r5l_payload_flush) + + le32_to_cpu(payload_flush->size); + continue; + } + + /* DATA or PARITY payload */ + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? + raid5_compute_sector( + conf, le64_to_cpu(payload->location), 0, &dd, + NULL) + : le64_to_cpu(payload->location); + + sh = r5c_recovery_lookup_stripe(cached_stripe_list, + stripe_sect); + + if (!sh) { + sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); + /* + * cannot get stripe from raid5_get_active_stripe + * try replay some stripes + */ + if (!sh) { + r5c_recovery_replay_stripes( + cached_stripe_list, ctx); + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect, 1); + } + if (!sh) { + int new_size = conf->min_nr_stripes * 2; + pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", + mdname(mddev), + new_size); + ret = raid5_set_cache_size(mddev, new_size); + if (conf->min_nr_stripes <= new_size / 2) { + pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", + mdname(mddev), + ret, + new_size, + conf->min_nr_stripes, + conf->max_nr_stripes); + return -ENOMEM; + } + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect, 0); + } + if (!sh) { + pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", + mdname(mddev)); + return -ENOMEM; + } + list_add_tail(&sh->lru, cached_stripe_list); + } + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && + test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { + r5l_recovery_replay_one_stripe(conf, sh, ctx); + list_move_tail(&sh->lru, cached_stripe_list); + } + r5l_recovery_load_data(log, sh, ctx, payload, + log_offset); + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) + r5l_recovery_load_parity(log, sh, ctx, payload, + log_offset); + else + return -EINVAL; + + log_offset = r5l_ring_add(log, log_offset, + le32_to_cpu(payload->size)); + + mb_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + } + + return 0; +} + +/* + * Load the stripe into cache. The stripe will be written out later by + * the stripe cache state machine. + */ +static void r5c_recovery_load_one_stripe(struct r5l_log *log, + struct stripe_head *sh) +{ + struct r5dev *dev; + int i; + + for (i = sh->disks; i--; ) { + dev = sh->dev + i; + if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { + set_bit(R5_InJournal, &dev->flags); + set_bit(R5_UPTODATE, &dev->flags); + } + } +} + +/* + * Scan through the log for all to-be-flushed data + * + * For stripes with data and parity, namely Data-Parity stripe + * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. + * + * For stripes with only data, namely Data-Only stripe + * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. + * + * For a stripe, if we see data after parity, we should discard all previous + * data and parity for this stripe, as these data are already flushed to + * the array. + * + * At the end of the scan, we return the new journal_tail, which points to + * first data-only stripe on the journal device, or next invalid meta block. + */ +static int r5c_recovery_flush_log(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh; + int ret = 0; + + /* scan through the log */ + while (1) { + if (r5l_recovery_read_meta_block(log, ctx)) + break; + + ret = r5c_recovery_analyze_meta_block(log, ctx, + &ctx->cached_list); + /* + * -EAGAIN means mismatch in data block, in this case, we still + * try scan the next metablock + */ + if (ret && ret != -EAGAIN) + break; /* ret == -EINVAL or -ENOMEM */ + ctx->seq++; + ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); + } + + if (ret == -ENOMEM) { + r5c_recovery_drop_stripes(&ctx->cached_list, ctx); + return ret; + } + + /* replay data-parity stripes */ + r5c_recovery_replay_stripes(&ctx->cached_list, ctx); + + /* load data-only stripes to stripe cache */ + list_for_each_entry(sh, &ctx->cached_list, lru) { + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5c_recovery_load_one_stripe(log, sh); + ctx->data_only_stripes++; + } + + return 0; +} + +/* + * we did a recovery. Now ctx.pos points to an invalid meta block. New + * log will start here. but we can't let superblock point to last valid + * meta block. The log might looks like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If + * superblock points to meta 1, we write a new valid meta 2n. if crash + * happens again, new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is we create a new meta in meta2 with its seq == meta + * 1's seq + 10000 and let superblock points to meta2. The same recovery + * will not think meta 3 is a valid meta, because its seq doesn't match + */ + +/* + * Before recovery, the log looks like the following + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ + * |- log->last_checkpoint + * |- log->last_cp_seq + * + * Now we scan through the log until we see invalid entry + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos + * |- log->last_cp_seq |- ctx->seq + * + * From this point, we need to increase seq number by 10 to avoid + * confusing next recovery. + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+1 + * |- log->last_cp_seq |- ctx->seq+10001 + * + * However, it is not safe to start the state machine yet, because data only + * parities are not yet secured in RAID. To save these data only parities, we + * rewrite them from seq+11. + * + * ----------------------------------------------------------------- + * | valid log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * If failure happens again during this process, the recovery can safe start + * again from log->last_checkpoint. + * + * Once data only stripes are rewritten to journal, we move log_tail + * + * ----------------------------------------------------------------- + * | old log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * Then we can safely start the state machine. If failure happens from this + * point on, the recovery will start from new log->last_checkpoint. + */ +static int +r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh; + struct mddev *mddev = log->rdev->mddev; + struct page *page; + sector_t next_checkpoint = MaxSector; + + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", + mdname(mddev)); + return -ENOMEM; + } + + WARN_ON(list_empty(&ctx->cached_list)); + + list_for_each_entry(sh, &ctx->cached_list, lru) { + struct r5l_meta_block *mb; + int i; + int offset; + sector_t write_pos; + + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5l_recovery_create_empty_meta_block(log, page, + ctx->pos, ctx->seq); + mb = page_address(page); + offset = le32_to_cpu(mb->meta_size); + write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + struct r5l_payload_data_parity *payload; + void *addr; + + if (test_bit(R5_InJournal, &dev->flags)) { + payload = (void *)mb + offset; + payload->header.type = cpu_to_le16( + R5LOG_PAYLOAD_DATA); + payload->size = cpu_to_le32(BLOCK_SECTORS); + payload->location = cpu_to_le64( + raid5_compute_blocknr(sh, i, 0)); + addr = kmap_atomic(dev->page); + payload->checksum[0] = cpu_to_le32( + crc32c_le(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_atomic(addr); + sync_page_io(log->rdev, write_pos, PAGE_SIZE, + dev->page, REQ_OP_WRITE, false); + write_pos = r5l_ring_add(log, write_pos, + BLOCK_SECTORS); + offset += sizeof(__le32) + + sizeof(struct r5l_payload_data_parity); + + } + } + mb->meta_size = cpu_to_le32(offset); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, + REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); + sh->log_start = ctx->pos; + list_add_tail(&sh->r5c, &log->stripe_in_journal_list); + atomic_inc(&log->stripe_in_journal_count); + ctx->pos = write_pos; + ctx->seq += 1; + next_checkpoint = sh->log_start; + } + log->next_checkpoint = next_checkpoint; + __free_page(page); + return 0; +} + +static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct stripe_head *sh, *next; + bool cleared_pending = false; + + if (ctx->data_only_stripes == 0) + return; + + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + cleared_pending = true; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + } + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; + + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + + /* reuse conf->wait_for_quiescent in recovery */ + wait_event(conf->wait_for_quiescent, + atomic_read(&conf->active_stripes) == 0); + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + if (cleared_pending) + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); +} + +static int r5l_recovery_log(struct r5l_log *log) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5l_recovery_ctx *ctx; + int ret; + sector_t pos; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pos = log->last_checkpoint; + ctx->seq = log->last_cp_seq; + INIT_LIST_HEAD(&ctx->cached_list); + ctx->meta_page = alloc_page(GFP_KERNEL); + + if (!ctx->meta_page) { + ret = -ENOMEM; + goto meta_page; + } + + if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) { + ret = -ENOMEM; + goto ra_pool; + } + + ret = r5c_recovery_flush_log(log, ctx); + + if (ret) + goto error; + + pos = ctx->pos; + ctx->seq += 10000; + + if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) + pr_info("md/raid:%s: starting from clean shutdown\n", + mdname(mddev)); + else + pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", + mdname(mddev), ctx->data_only_stripes, + ctx->data_parity_stripes); + + if (ctx->data_only_stripes == 0) { + log->next_checkpoint = ctx->pos; + r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++); + ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) { + pr_err("md/raid:%s: failed to rewrite stripes to journal\n", + mdname(mddev)); + ret = -EIO; + goto error; + } + + log->log_start = ctx->pos; + log->seq = ctx->seq; + log->last_checkpoint = pos; + r5l_write_super(log, pos); + + r5c_recovery_flush_data_only_stripes(log, ctx); + ret = 0; +error: + r5l_recovery_free_ra_pool(log, ctx); +ra_pool: + __free_page(ctx->meta_page); +meta_page: + kfree(ctx); + return ret; +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp) +{ + struct mddev *mddev = log->rdev->mddev; + + log->rdev->journal_tail = cp; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); +} + +static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret; + + ret = mddev_lock(mddev); + if (ret) + return ret; + + conf = mddev->private; + if (!conf || !conf->log) + goto out_unlock; + + switch (conf->log->r5c_journal_mode) { + case R5C_JOURNAL_MODE_WRITE_THROUGH: + ret = snprintf( + page, PAGE_SIZE, "[%s] %s\n", + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); + break; + case R5C_JOURNAL_MODE_WRITE_BACK: + ret = snprintf( + page, PAGE_SIZE, "%s [%s]\n", + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); + break; + default: + ret = 0; + } + +out_unlock: + mddev_unlock(mddev); + return ret; +} + +/* + * Set journal cache mode on @mddev (external API initially needed by dm-raid). + * + * @mode as defined in 'enum r5c_journal_mode'. + * + */ +int r5c_journal_mode_set(struct mddev *mddev, int mode) +{ + struct r5conf *conf; + + if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || + mode > R5C_JOURNAL_MODE_WRITE_BACK) + return -EINVAL; + + conf = mddev->private; + if (!conf || !conf->log) + return -ENODEV; + + if (raid5_calc_degraded(conf) > 0 && + mode == R5C_JOURNAL_MODE_WRITE_BACK) + return -EINVAL; + + mddev_suspend(mddev); + conf->log->r5c_journal_mode = mode; + mddev_resume(mddev); + + pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", + mdname(mddev), mode, r5c_journal_mode_str[mode]); + return 0; +} +EXPORT_SYMBOL(r5c_journal_mode_set); + +static ssize_t r5c_journal_mode_store(struct mddev *mddev, + const char *page, size_t length) +{ + int mode = ARRAY_SIZE(r5c_journal_mode_str); + size_t len = length; + int ret; + + if (len < 2) + return -EINVAL; + + if (page[len - 1] == '\n') + len--; + + while (mode--) + if (strlen(r5c_journal_mode_str[mode]) == len && + !strncmp(page, r5c_journal_mode_str[mode], len)) + break; + ret = mddev_lock(mddev); + if (ret) + return ret; + ret = r5c_journal_mode_set(mddev, mode); + mddev_unlock(mddev); + return ret ?: length; +} + +struct md_sysfs_entry +r5c_journal_mode = __ATTR(journal_mode, 0644, + r5c_journal_mode_show, r5c_journal_mode_store); + +/* + * Try handle write operation in caching phase. This function should only + * be called in write-back mode. + * + * If all outstanding writes can be handled in caching phase, returns 0 + * If writes requires write-out phase, call r5c_make_stripe_write_out() + * and returns -EAGAIN + */ +int r5c_try_caching_write(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + struct r5l_log *log = conf->log; + int i; + struct r5dev *dev; + int to_cache = 0; + void __rcu **pslot; + sector_t tree_index; + int ret; + uintptr_t refcount; + + BUG_ON(!r5c_is_writeback(log)); + + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* + * There are two different scenarios here: + * 1. The stripe has some data cached, and it is sent to + * write-out phase for reclaim + * 2. The stripe is clean, and this is the first write + * + * For 1, return -EAGAIN, so we continue with + * handle_stripe_dirtying(). + * + * For 2, set STRIPE_R5C_CACHING and continue with caching + * write. + */ + + /* case 1: anything injournal or anything in written */ + if (s->injournal > 0 || s->written > 0) + return -EAGAIN; + /* case 2 */ + set_bit(STRIPE_R5C_CACHING, &sh->state); + } + + /* + * When run in degraded mode, array is set to write-through mode. + * This check helps drain pending write safely in the transition to + * write-through mode. + * + * When a stripe is syncing, the write is also handled in write + * through mode. + */ + if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) { + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + + for (i = disks; i--; ) { + dev = &sh->dev[i]; + /* if non-overwrite, use writing-out phase */ + if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_InJournal, &dev->flags)) { + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + + /* if the stripe is not counted in big_stripe_tree, add it now */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + if (pslot) { + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); + } else { + /* + * this radix_tree_insert can fail safely, so no + * need to call radix_tree_preload() + */ + ret = radix_tree_insert( + &log->big_stripe_tree, tree_index, + (void *)(1 << R5C_RADIX_COUNT_SHIFT)); + if (ret) { + spin_unlock(&log->tree_lock); + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + spin_unlock(&log->tree_lock); + + /* + * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is + * counted in the radix tree + */ + set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); + atomic_inc(&conf->r5c_cached_partial_stripes); + } + + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (dev->towrite) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + to_cache++; + } + } + + if (to_cache) { + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + /* + * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() + * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in + * r5c_handle_data_cached() + */ + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + } + + return 0; +} + +/* + * free extra pages (orig_page) we allocated for prexor + */ +void r5c_release_extra_page(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int i; + bool using_disk_info_extra_page; + + using_disk_info_extra_page = + sh->dev[0].orig_page == conf->disks[0].extra_page; + + for (i = sh->disks; i--; ) + if (sh->dev[i].page != sh->dev[i].orig_page) { + struct page *p = sh->dev[i].orig_page; + + sh->dev[i].orig_page = sh->dev[i].page; + clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + + if (!using_disk_info_extra_page) + put_page(p); + } + + if (using_disk_info_extra_page) { + clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); + md_wakeup_thread(conf->mddev->thread); + } +} + +void r5c_use_extra_page(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int i; + struct r5dev *dev; + + for (i = sh->disks; i--; ) { + dev = &sh->dev[i]; + if (dev->orig_page != dev->page) + put_page(dev->orig_page); + dev->orig_page = conf->disks[i].extra_page; + } +} + +/* + * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the + * stripe is committed to RAID disks. + */ +void r5c_finish_stripe_write_out(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s) +{ + struct r5l_log *log = conf->log; + int i; + int do_wakeup = 0; + sector_t tree_index; + void __rcu **pslot; + uintptr_t refcount; + + if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) + return; + + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return; + + for (i = sh->disks; i--; ) { + clear_bit(R5_InJournal, &sh->dev[i].flags); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + do_wakeup = 1; + } + + /* + * analyse_stripe() runs before r5c_finish_stripe_write_out(), + * We updated R5_InJournal, so we also update s->injournal. + */ + s->injournal = 0; + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); + + if (do_wakeup) + wake_up(&conf->wait_for_overlap); + + spin_lock_irq(&log->stripe_in_journal_lock); + list_del_init(&sh->r5c); + spin_unlock_irq(&log->stripe_in_journal_lock); + sh->log_start = MaxSector; + + atomic_dec(&log->stripe_in_journal_count); + r5c_update_log_state(log); + + /* stop counting this stripe in big_stripe_tree */ + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + BUG_ON(pslot == NULL); + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + if (refcount == 1) + radix_tree_delete(&log->big_stripe_tree, tree_index); + else + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); + spin_unlock(&log->tree_lock); + } + + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_flushing_partial_stripes); + atomic_dec(&conf->r5c_cached_partial_stripes); + } + + if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_flushing_full_stripes); + atomic_dec(&conf->r5c_cached_full_stripes); + } + + r5l_append_flush_payload(log, sh->sector); + /* stripe is flused to raid disks, we can do resync now */ + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); +} + +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int pages = 0; + int reserve; + int i; + int ret = 0; + + BUG_ON(!log); + + for (i = 0; i < sh->disks; i++) { + void *addr; + + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + addr = kmap_atomic(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_atomic(addr); + pages++; + } + WARN_ON(pages == 0); + + /* + * The stripe must enter state machine again to call endio, so + * don't delay. + */ + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + mutex_lock(&log->io_mutex); + /* meta + data */ + reserve = (1 + pages) << (PAGE_SHIFT - 9); + + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + sh->log_start == MaxSector) + r5l_add_no_space_stripe(log, sh); + else if (!r5l_has_free_space(log, reserve)) { + if (sh->log_start == log->last_checkpoint) + BUG(); + else + r5l_add_no_space_stripe(log, sh); + } else { + ret = r5l_log_stripe(log, sh, pages, 0); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + } + + mutex_unlock(&log->io_mutex); + return 0; +} + +/* check whether this big stripe is in write back cache. */ +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) +{ + struct r5l_log *log = conf->log; + sector_t tree_index; + void *slot; + + if (!log) + return false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + tree_index = r5c_tree_index(conf, sect); + slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); + return slot != NULL; +} + +static int r5l_load_log(struct r5l_log *log) +{ + struct md_rdev *rdev = log->rdev; + struct page *page; + struct r5l_meta_block *mb; + sector_t cp = log->rdev->journal_tail; + u32 stored_crc, expected_crc; + bool create_super = false; + int ret = 0; + + /* Make sure it's valid */ + if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) + cp = 0; + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) { + ret = -EIO; + goto ioerr; + } + mb = page_address(page); + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + mb->version != R5LOG_VERSION) { + create_super = true; + goto create; + } + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != expected_crc) { + create_super = true; + goto create; + } + if (le64_to_cpu(mb->position) != cp) { + create_super = true; + goto create; + } +create: + if (create_super) { + log->last_cp_seq = get_random_u32(); + cp = 0; + r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); + /* + * Make sure super points to correct address. Log might have + * data very soon. If super hasn't correct log tail address, + * recovery can't find the log + */ + r5l_write_super(log, cp); + } else + log->last_cp_seq = le64_to_cpu(mb->seq); + + log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); + log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; + if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) + log->max_free_space = RECLAIM_MAX_FREE_SPACE; + log->last_checkpoint = cp; + + __free_page(page); + + if (create_super) { + log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); + log->seq = log->last_cp_seq + 1; + log->next_checkpoint = cp; + } else + ret = r5l_recovery_log(log); + + r5c_update_log_state(log); + return ret; +ioerr: + __free_page(page); + return ret; +} + +int r5l_start(struct r5l_log *log) +{ + int ret; + + if (!log) + return 0; + + ret = r5l_load_log(log); + if (ret) { + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + r5l_exit_log(conf); + } + return ret; +} + +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + + if ((raid5_calc_degraded(conf) > 0 || + test_bit(Journal, &rdev->flags)) && + conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + schedule_work(&log->disable_writeback_work); +} + +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) +{ + struct request_queue *q = bdev_get_queue(rdev->bdev); + struct r5l_log *log; + int ret; + + pr_debug("md/raid:%s: using device %pg as journal\n", + mdname(conf->mddev), rdev->bdev); + + if (PAGE_SIZE != 4096) + return -EINVAL; + + /* + * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and + * raid_disks r5l_payload_data_parity. + * + * Write journal and cache does not work for very big array + * (raid_disks > 203) + */ + if (sizeof(struct r5l_meta_block) + + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * + conf->raid_disks) > PAGE_SIZE) { + pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", + mdname(conf->mddev), conf->raid_disks); + return -EINVAL; + } + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return -ENOMEM; + log->rdev = rdev; + + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; + + log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); + + mutex_init(&log->io_mutex); + + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->running_ios); + INIT_LIST_HEAD(&log->io_end_ios); + INIT_LIST_HEAD(&log->flushing_ios); + INIT_LIST_HEAD(&log->finished_ios); + + log->io_kc = KMEM_CACHE(r5l_io_unit, 0); + if (!log->io_kc) + goto io_kc; + + ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc); + if (ret) + goto io_pool; + + ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) + goto io_bs; + + ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0); + if (ret) + goto out_mempool; + + spin_lock_init(&log->tree_lock); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, + log->rdev->mddev, "reclaim"); + if (!log->reclaim_thread) + goto reclaim_thread; + log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + init_waitqueue_head(&log->iounit_wait); + + INIT_LIST_HEAD(&log->no_mem_stripes); + + INIT_LIST_HEAD(&log->no_space_stripes); + spin_lock_init(&log->no_space_stripes_lock); + + INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); + INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + INIT_LIST_HEAD(&log->stripe_in_journal_list); + spin_lock_init(&log->stripe_in_journal_lock); + atomic_set(&log->stripe_in_journal_count, 0); + + conf->log = log; + + set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return 0; + +reclaim_thread: + mempool_exit(&log->meta_pool); +out_mempool: + bioset_exit(&log->bs); +io_bs: + mempool_exit(&log->io_pool); +io_pool: + kmem_cache_destroy(log->io_kc); +io_kc: + kfree(log); + return -EINVAL; +} + +void r5l_exit_log(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + + md_unregister_thread(&log->reclaim_thread); + + /* + * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to + * ensure disable_writeback_work wakes up and exits. + */ + conf->log = NULL; + wake_up(&conf->mddev->sb_wait); + flush_work(&log->disable_writeback_work); + + mempool_exit(&log->meta_pool); + bioset_exit(&log->bs); + mempool_exit(&log->io_pool); + kmem_cache_destroy(log->io_kc); + kfree(log); +} diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h new file mode 100644 index 000000000..c83325026 --- /dev/null +++ b/drivers/md/raid5-log.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RAID5_LOG_H +#define _RAID5_LOG_H + +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +void r5l_exit_log(struct r5conf *conf); +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +void r5l_write_stripe_run(struct r5l_log *log); +void r5l_flush_stripe_to_raid(struct r5l_log *log); +void r5l_stripe_write_finished(struct stripe_head *sh); +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +void r5l_quiesce(struct r5l_log *log, int quiesce); +bool r5l_log_disk_error(struct r5conf *conf); +bool r5c_is_writeback(struct r5l_log *log); +int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +void r5c_release_extra_page(struct stripe_head *sh); +void r5c_use_extra_page(struct stripe_head *sh); +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks); +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +void r5c_make_stripe_write_out(struct stripe_head *sh); +void r5c_flush_cache(struct r5conf *conf, int num); +void r5c_check_stripe_cache_usage(struct r5conf *conf); +void r5c_check_cached_full_stripe(struct r5conf *conf); +extern struct md_sysfs_entry r5c_journal_mode; +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +int r5l_start(struct r5l_log *log); + +struct dma_async_tx_descriptor * +ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx); +int ppl_init_log(struct r5conf *conf); +void ppl_exit_log(struct r5conf *conf); +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +void ppl_write_stripe_run(struct r5conf *conf); +void ppl_stripe_write_finished(struct stripe_head *sh); +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +void ppl_quiesce(struct r5conf *conf, int quiesce); +int ppl_handle_flush_request(struct bio *bio); +extern struct md_sysfs_entry ppl_write_hint; + +static inline bool raid5_has_log(struct r5conf *conf) +{ + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); +} + +static inline bool raid5_has_ppl(struct r5conf *conf) +{ + return test_bit(MD_HAS_PPL, &conf->mddev->flags); +} + +static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + + if (conf->log) { + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* writing out phase */ + if (s->waiting_extra_page) + return 0; + return r5l_write_stripe(conf->log, sh); + } else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { + /* caching phase */ + return r5c_cache_data(conf->log, sh); + } + } else if (raid5_has_ppl(conf)) { + return ppl_write_stripe(conf, sh); + } + + return -EAGAIN; +} + +static inline void log_stripe_write_finished(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + + if (conf->log) + r5l_stripe_write_finished(sh); + else if (raid5_has_ppl(conf)) + ppl_stripe_write_finished(sh); +} + +static inline void log_write_stripe_run(struct r5conf *conf) +{ + if (conf->log) + r5l_write_stripe_run(conf->log); + else if (raid5_has_ppl(conf)) + ppl_write_stripe_run(conf); +} + +static inline void log_flush_stripe_to_raid(struct r5conf *conf) +{ + if (conf->log) + r5l_flush_stripe_to_raid(conf->log); + else if (raid5_has_ppl(conf)) + ppl_write_stripe_run(conf); +} + +static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) +{ + int ret = -ENODEV; + + if (conf->log) + ret = r5l_handle_flush_request(conf->log, bio); + else if (raid5_has_ppl(conf)) + ret = ppl_handle_flush_request(bio); + + return ret; +} + +static inline void log_quiesce(struct r5conf *conf, int quiesce) +{ + if (conf->log) + r5l_quiesce(conf->log, quiesce); + else if (raid5_has_ppl(conf)) + ppl_quiesce(conf, quiesce); +} + +static inline void log_exit(struct r5conf *conf) +{ + if (conf->log) + r5l_exit_log(conf); + else if (raid5_has_ppl(conf)) + ppl_exit_log(conf); +} + +static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev, + bool ppl) +{ + if (journal_dev) + return r5l_init_log(conf, journal_dev); + else if (ppl) + return ppl_init_log(conf); + + return 0; +} + +static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add) +{ + if (raid5_has_ppl(conf)) + return ppl_modify_log(conf, rdev, add); + + return 0; +} + +#endif diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c new file mode 100644 index 000000000..31b9157bc --- /dev/null +++ b/drivers/md/raid5-ppl.c @@ -0,0 +1,1532 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Partial Parity Log for closing the RAID5 write hole + * Copyright (c) 2017, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "raid5.h" +#include "raid5-log.h" + +/* + * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for + * partial parity data. The header contains an array of entries + * (struct ppl_header_entry) which describe the logged write requests. + * Partial parity for the entries comes after the header, written in the same + * sequence as the entries: + * + * Header + * entry0 + * ... + * entryN + * PP data + * PP for entry0 + * ... + * PP for entryN + * + * An entry describes one or more consecutive stripe_heads, up to a full + * stripe. The modifed raid data chunks form an m-by-n matrix, where m is the + * number of stripe_heads in the entry and n is the number of modified data + * disks. Every stripe_head in the entry must write to the same data disks. + * An example of a valid case described by a single entry (writes to the first + * stripe of a 4 disk array, 16k chunk size): + * + * sh->sector dd0 dd1 dd2 ppl + * +-----+-----+-----+ + * 0 | --- | --- | --- | +----+ + * 8 | -W- | -W- | --- | | pp | data_sector = 8 + * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k + * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k + * +-----+-----+-----+ +----+ + * + * data_sector is the first raid sector of the modified data, data_size is the + * total size of modified data and pp_size is the size of partial parity for + * this entry. Entries for full stripe writes contain no partial parity + * (pp_size = 0), they only mark the stripes for which parity should be + * recalculated after an unclean shutdown. Every entry holds a checksum of its + * partial parity, the header also has a checksum of the header itself. + * + * A write request is always logged to the PPL instance stored on the parity + * disk of the corresponding stripe. For each member disk there is one ppl_log + * used to handle logging for this disk, independently from others. They are + * grouped in child_logs array in struct ppl_conf, which is assigned to + * r5conf->log_private. + * + * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. + * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head + * can be appended to the last entry if it meets the conditions for a valid + * entry described above, otherwise a new entry is added. Checksums of entries + * are calculated incrementally as stripes containing partial parity are being + * added. ppl_submit_iounit() calculates the checksum of the header and submits + * a bio containing the header page and partial parity pages (sh->ppl_page) for + * all stripes of the io_unit. When the PPL write completes, the stripes + * associated with the io_unit are released and raid5d starts writing their data + * and parity. When all stripes are written, the io_unit is freed and the next + * can be submitted. + * + * An io_unit is used to gather stripes until it is submitted or becomes full + * (if the maximum number of entries or size of PPL is reached). Another io_unit + * can't be submitted until the previous has completed (PPL and stripe + * data+parity is written). The log->io_list tracks all io_units of a log + * (for a single member disk). New io_units are added to the end of the list + * and the first io_unit is submitted, if it is not submitted already. + * The current io_unit accepting new stripes is always at the end of the list. + * + * If write-back cache is enabled for any of the disks in the array, its data + * must be flushed before next io_unit is submitted. + */ + +#define PPL_SPACE_SIZE (128 * 1024) + +struct ppl_conf { + struct mddev *mddev; + + /* array of child logs, one for each raid disk */ + struct ppl_log *child_logs; + int count; + + int block_size; /* the logical block size used for data_sector + * in ppl_header_entry */ + u32 signature; /* raid array identifier */ + atomic64_t seq; /* current log write sequence number */ + + struct kmem_cache *io_kc; + mempool_t io_pool; + struct bio_set bs; + struct bio_set flush_bs; + + /* used only for recovery */ + int recovered_entries; + int mismatch_count; + + /* stripes to retry if failed to allocate io_unit */ + struct list_head no_mem_stripes; + spinlock_t no_mem_stripes_lock; + + unsigned short write_hint; +}; + +struct ppl_log { + struct ppl_conf *ppl_conf; /* shared between all log instances */ + + struct md_rdev *rdev; /* array member disk associated with + * this log instance */ + struct mutex io_mutex; + struct ppl_io_unit *current_io; /* current io_unit accepting new data + * always at the end of io_list */ + spinlock_t io_list_lock; + struct list_head io_list; /* all io_units of this log */ + + sector_t next_io_sector; + unsigned int entry_space; + bool use_multippl; + bool wb_cache_on; + unsigned long disk_flush_bitmap; +}; + +#define PPL_IO_INLINE_BVECS 32 + +struct ppl_io_unit { + struct ppl_log *log; + + struct page *header_page; /* for ppl_header */ + + unsigned int entries_count; /* number of entries in ppl_header */ + unsigned int pp_size; /* total size current of partial parity */ + + u64 seq; /* sequence number of this log write */ + struct list_head log_sibling; /* log->io_list */ + + struct list_head stripe_list; /* stripes added to the io_unit */ + atomic_t pending_stripes; /* how many stripes not written to raid */ + atomic_t pending_flushes; /* how many disk flushes are in progress */ + + bool submitted; /* true if write to log started */ + + /* inline bio and its biovec for submitting the iounit */ + struct bio bio; + struct bio_vec biovec[PPL_IO_INLINE_BVECS]; +}; + +struct dma_async_tx_descriptor * +ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **srcs = percpu->scribble; + int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + /* + * Partial parity is the XOR of stripe data chunks that are not changed + * during the write request. Depending on available data + * (read-modify-write vs. reconstruct-write case) we calculate it + * differently. + */ + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + /* + * rmw: xor old data and parity from updated disks + * This is calculated earlier by ops_run_prexor5() so just copy + * the parity dev page. + */ + srcs[count++] = sh->dev[pd_idx].page; + } else if (sh->reconstruct_state == reconstruct_state_drain_run) { + /* rcw: xor data from all not updated disks */ + for (i = disks; i--;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_UPTODATE, &dev->flags)) + srcs[count++] = dev->page; + } + } else { + return tx; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, + NULL, sh, (void *) (srcs + sh->disks + 2)); + + if (count == 1) + tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE, + &submit); + else + tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE, + &submit); + + return tx; +} + +static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct kmem_cache *kc = pool_data; + struct ppl_io_unit *io; + + io = kmem_cache_alloc(kc, gfp_mask); + if (!io) + return NULL; + + io->header_page = alloc_page(gfp_mask); + if (!io->header_page) { + kmem_cache_free(kc, io); + return NULL; + } + + return io; +} + +static void ppl_io_pool_free(void *element, void *pool_data) +{ + struct kmem_cache *kc = pool_data; + struct ppl_io_unit *io = element; + + __free_page(io->header_page); + kmem_cache_free(kc, io); +} + +static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, + struct stripe_head *sh) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct ppl_io_unit *io; + struct ppl_header *pplhdr; + struct page *header_page; + + io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT); + if (!io) + return NULL; + + header_page = io->header_page; + memset(io, 0, sizeof(*io)); + io->header_page = header_page; + + io->log = log; + INIT_LIST_HEAD(&io->log_sibling); + INIT_LIST_HEAD(&io->stripe_list); + atomic_set(&io->pending_stripes, 0); + atomic_set(&io->pending_flushes, 0); + bio_init(&io->bio, log->rdev->bdev, io->biovec, PPL_IO_INLINE_BVECS, + REQ_OP_WRITE | REQ_FUA); + + pplhdr = page_address(io->header_page); + clear_page(pplhdr); + memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); + pplhdr->signature = cpu_to_le32(ppl_conf->signature); + + io->seq = atomic64_add_return(1, &ppl_conf->seq); + pplhdr->generation = cpu_to_le64(io->seq); + + return io; +} + +static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) +{ + struct ppl_io_unit *io = log->current_io; + struct ppl_header_entry *e = NULL; + struct ppl_header *pplhdr; + int i; + sector_t data_sector = 0; + int data_disks = 0; + struct r5conf *conf = sh->raid_conf; + + pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); + + /* check if current io_unit is full */ + if (io && (io->pp_size == log->entry_space || + io->entries_count == PPL_HDR_MAX_ENTRIES)) { + pr_debug("%s: add io_unit blocked by seq: %llu\n", + __func__, io->seq); + io = NULL; + } + + /* add a new unit if there is none or the current is full */ + if (!io) { + io = ppl_new_iounit(log, sh); + if (!io) + return -ENOMEM; + spin_lock_irq(&log->io_list_lock); + list_add_tail(&io->log_sibling, &log->io_list); + spin_unlock_irq(&log->io_list_lock); + + log->current_io = io; + } + + for (i = 0; i < sh->disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) { + if (!data_disks || dev->sector < data_sector) + data_sector = dev->sector; + data_disks++; + } + } + BUG_ON(!data_disks); + + pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__, + io->seq, (unsigned long long)data_sector, data_disks); + + pplhdr = page_address(io->header_page); + + if (io->entries_count > 0) { + struct ppl_header_entry *last = + &pplhdr->entries[io->entries_count - 1]; + struct stripe_head *sh_last = list_last_entry( + &io->stripe_list, struct stripe_head, log_list); + u64 data_sector_last = le64_to_cpu(last->data_sector); + u32 data_size_last = le32_to_cpu(last->data_size); + + /* + * Check if we can append the stripe to the last entry. It must + * be just after the last logged stripe and write to the same + * disks. Use bit shift and logarithm to avoid 64-bit division. + */ + if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) && + (data_sector >> ilog2(conf->chunk_sectors) == + data_sector_last >> ilog2(conf->chunk_sectors)) && + ((data_sector - data_sector_last) * data_disks == + data_size_last >> 9)) + e = last; + } + + if (!e) { + e = &pplhdr->entries[io->entries_count++]; + e->data_sector = cpu_to_le64(data_sector); + e->parity_disk = cpu_to_le32(sh->pd_idx); + e->checksum = cpu_to_le32(~0); + } + + le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); + + /* don't write any PP if full stripe write */ + if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { + le32_add_cpu(&e->pp_size, PAGE_SIZE); + io->pp_size += PAGE_SIZE; + e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), + page_address(sh->ppl_page), + PAGE_SIZE)); + } + + list_add_tail(&sh->log_list, &io->stripe_list); + atomic_inc(&io->pending_stripes); + sh->ppl_io = io; + + return 0; +} + +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_io_unit *io = sh->ppl_io; + struct ppl_log *log; + + if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page || + !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || + !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); + return -EAGAIN; + } + + log = &ppl_conf->child_logs[sh->pd_idx]; + + mutex_lock(&log->io_mutex); + + if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { + mutex_unlock(&log->io_mutex); + return -EAGAIN; + } + + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + if (ppl_log_stripe(log, sh)) { + spin_lock_irq(&ppl_conf->no_mem_stripes_lock); + list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes); + spin_unlock_irq(&ppl_conf->no_mem_stripes_lock); + } + + mutex_unlock(&log->io_mutex); + + return 0; +} + +static void ppl_log_endio(struct bio *bio) +{ + struct ppl_io_unit *io = bio->bi_private; + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct stripe_head *sh, *next; + + pr_debug("%s: seq: %llu\n", __func__, io->seq); + + if (bio->bi_status) + md_error(ppl_conf->mddev, log->rdev); + + list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { + list_del_init(&sh->log_list); + + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) +{ + pr_debug("%s: seq: %llu size: %u sector: %llu dev: %pg\n", + __func__, io->seq, bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_bdev); + + submit_bio(bio); +} + +static void ppl_submit_iounit(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct ppl_header *pplhdr = page_address(io->header_page); + struct bio *bio = &io->bio; + struct stripe_head *sh; + int i; + + bio->bi_private = io; + + if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { + ppl_log_endio(bio); + return; + } + + for (i = 0; i < io->entries_count; i++) { + struct ppl_header_entry *e = &pplhdr->entries[i]; + + pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n", + __func__, io->seq, i, le64_to_cpu(e->data_sector), + le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size)); + + e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >> + ilog2(ppl_conf->block_size >> 9)); + e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum)); + } + + pplhdr->entries_count = cpu_to_le32(io->entries_count); + pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + + /* Rewind the buffer if current PPL is larger then remaining space */ + if (log->use_multippl && + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < + (PPL_HEADER_SIZE + io->pp_size) >> 9) + log->next_io_sector = log->rdev->ppl.sector; + + + bio->bi_end_io = ppl_log_endio; + bio->bi_iter.bi_sector = log->next_io_sector; + bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + + pr_debug("%s: log->current_io_sector: %llu\n", __func__, + (unsigned long long)log->next_io_sector); + + if (log->use_multippl) + log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + + WARN_ON(log->disk_flush_bitmap != 0); + + list_for_each_entry(sh, &io->stripe_list, log_list) { + for (i = 0; i < sh->disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + if ((ppl_conf->child_logs[i].wb_cache_on) && + (test_bit(R5_Wantwrite, &dev->flags))) { + set_bit(i, &log->disk_flush_bitmap); + } + } + + /* entries for full stripe writes have no partial parity */ + if (test_bit(STRIPE_FULL_WRITE, &sh->state)) + continue; + + if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { + struct bio *prev = bio; + + bio = bio_alloc_bioset(prev->bi_bdev, BIO_MAX_VECS, + prev->bi_opf, GFP_NOIO, + &ppl_conf->bs); + bio->bi_iter.bi_sector = bio_end_sector(prev); + bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); + + bio_chain(bio, prev); + ppl_submit_iounit_bio(io, prev); + } + } + + ppl_submit_iounit_bio(io, bio); +} + +static void ppl_submit_current_io(struct ppl_log *log) +{ + struct ppl_io_unit *io; + + spin_lock_irq(&log->io_list_lock); + + io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, + log_sibling); + if (io && io->submitted) + io = NULL; + + spin_unlock_irq(&log->io_list_lock); + + if (io) { + io->submitted = true; + + if (io == log->current_io) + log->current_io = NULL; + + ppl_submit_iounit(io); + } +} + +void ppl_write_stripe_run(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_log *log; + int i; + + for (i = 0; i < ppl_conf->count; i++) { + log = &ppl_conf->child_logs[i]; + + mutex_lock(&log->io_mutex); + ppl_submit_current_io(log); + mutex_unlock(&log->io_mutex); + } +} + +static void ppl_io_unit_finished(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; + unsigned long flags; + + pr_debug("%s: seq: %llu\n", __func__, io->seq); + + local_irq_save(flags); + + spin_lock(&log->io_list_lock); + list_del(&io->log_sibling); + spin_unlock(&log->io_list_lock); + + mempool_free(io, &ppl_conf->io_pool); + + spin_lock(&ppl_conf->no_mem_stripes_lock); + if (!list_empty(&ppl_conf->no_mem_stripes)) { + struct stripe_head *sh; + + sh = list_first_entry(&ppl_conf->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + spin_unlock(&ppl_conf->no_mem_stripes_lock); + + local_irq_restore(flags); + + wake_up(&conf->wait_for_quiescent); +} + +static void ppl_flush_endio(struct bio *bio) +{ + struct ppl_io_unit *io = bio->bi_private; + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; + + pr_debug("%s: dev: %pg\n", __func__, bio->bi_bdev); + + if (bio->bi_status) { + struct md_rdev *rdev; + + rcu_read_lock(); + rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio)); + if (rdev) + md_error(rdev->mddev, rdev); + rcu_read_unlock(); + } + + bio_put(bio); + + if (atomic_dec_and_test(&io->pending_flushes)) { + ppl_io_unit_finished(io); + md_wakeup_thread(conf->mddev->thread); + } +} + +static void ppl_do_flush(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; + int raid_disks = conf->raid_disks; + int flushed_disks = 0; + int i; + + atomic_set(&io->pending_flushes, raid_disks); + + for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) { + struct md_rdev *rdev; + struct block_device *bdev = NULL; + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) + bdev = rdev->bdev; + rcu_read_unlock(); + + if (bdev) { + struct bio *bio; + + bio = bio_alloc_bioset(bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &ppl_conf->flush_bs); + bio->bi_private = io; + bio->bi_end_io = ppl_flush_endio; + + pr_debug("%s: dev: %ps\n", __func__, bio->bi_bdev); + + submit_bio(bio); + flushed_disks++; + } + } + + log->disk_flush_bitmap = 0; + + for (i = flushed_disks ; i < raid_disks; i++) { + if (atomic_dec_and_test(&io->pending_flushes)) + ppl_io_unit_finished(io); + } +} + +static inline bool ppl_no_io_unit_submitted(struct r5conf *conf, + struct ppl_log *log) +{ + struct ppl_io_unit *io; + + io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, + log_sibling); + + return !io || !io->submitted; +} + +void ppl_quiesce(struct r5conf *conf, int quiesce) +{ + struct ppl_conf *ppl_conf = conf->log_private; + int i; + + if (quiesce) { + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + + spin_lock_irq(&log->io_list_lock); + wait_event_lock_irq(conf->wait_for_quiescent, + ppl_no_io_unit_submitted(conf, log), + log->io_list_lock); + spin_unlock_irq(&log->io_list_lock); + } + } +} + +int ppl_handle_flush_request(struct bio *bio) +{ + if (bio->bi_iter.bi_size == 0) { + bio_endio(bio); + return 0; + } + bio->bi_opf &= ~REQ_PREFLUSH; + return -EAGAIN; +} + +void ppl_stripe_write_finished(struct stripe_head *sh) +{ + struct ppl_io_unit *io; + + io = sh->ppl_io; + sh->ppl_io = NULL; + + if (io && atomic_dec_and_test(&io->pending_stripes)) { + if (io->log->disk_flush_bitmap) + ppl_do_flush(io); + else + ppl_io_unit_finished(io); + } +} + +static void ppl_xor(int size, struct page *page1, struct page *page2) +{ + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + struct page *xor_srcs[] = { page1, page2 }; + + init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST, + NULL, NULL, NULL, NULL); + tx = async_xor(page1, xor_srcs, 0, 2, size, &submit); + + async_tx_quiesce(&tx); +} + +/* + * PPL recovery strategy: xor partial parity and data from all modified data + * disks within a stripe and write the result as the new stripe parity. If all + * stripe data disks are modified (full stripe write), no partial parity is + * available, so just xor the data disks. + * + * Recovery of a PPL entry shall occur only if all modified data disks are + * available and read from all of them succeeds. + * + * A PPL entry applies to a stripe, partial parity size for an entry is at most + * the size of the chunk. Examples of possible cases for a single entry: + * + * case 0: single data disk write: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | +----+ | (no change) | + * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | + * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | + * | ------ | ------ | ------ | +----+ | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = data_size + * + * case 1: more than one data disk write: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | +----+ | (no change) | + * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | + * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | + * | ------ | ------ | ------ | +----+ | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = data_size / modified_data_disks + * + * case 2: write to all data disks (also full stripe write): + * data0 data1 data2 parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | | (no change) | + * | -data- | -data- | -data- | --------> | xor all data | + * | ------ | ------ | ------ | --------> | (no change) | + * | ------ | ------ | ------ | | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = 0 + * + * The following cases are possible only in other implementations. The recovery + * code can handle them, but they are not generated at runtime because they can + * be reduced to cases 0, 1 and 2: + * + * case 3: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +----+ +--------------------+ + * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp | + * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp | + * | -data- | -data- | -data- | | -- | -> | xor all data | + * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp | + * +--------+--------+--------+ +----+ +--------------------+ + * pp_size = chunk_size + * + * case 4: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +----+ +--------------------+ + * | ------ | -data- | ------ | | pp | | data1 ^ pp | + * | ------ | ------ | ------ | | -- | -> | (no change) | + * | ------ | ------ | ------ | | -- | -> | (no change) | + * | -data- | ------ | ------ | | pp | | data0 ^ pp | + * +--------+--------+--------+ +----+ +--------------------+ + * pp_size = chunk_size + */ +static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, + sector_t ppl_sector) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct mddev *mddev = ppl_conf->mddev; + struct r5conf *conf = mddev->private; + int block_size = ppl_conf->block_size; + struct page *page1; + struct page *page2; + sector_t r_sector_first; + sector_t r_sector_last; + int strip_sectors; + int data_disks; + int i; + int ret = 0; + unsigned int pp_size = le32_to_cpu(e->pp_size); + unsigned int data_size = le32_to_cpu(e->data_size); + + page1 = alloc_page(GFP_KERNEL); + page2 = alloc_page(GFP_KERNEL); + + if (!page1 || !page2) { + ret = -ENOMEM; + goto out; + } + + r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9); + + if ((pp_size >> 9) < conf->chunk_sectors) { + if (pp_size > 0) { + data_disks = data_size / pp_size; + strip_sectors = pp_size >> 9; + } else { + data_disks = conf->raid_disks - conf->max_degraded; + strip_sectors = (data_size >> 9) / data_disks; + } + r_sector_last = r_sector_first + + (data_disks - 1) * conf->chunk_sectors + + strip_sectors; + } else { + data_disks = conf->raid_disks - conf->max_degraded; + strip_sectors = conf->chunk_sectors; + r_sector_last = r_sector_first + (data_size >> 9); + } + + pr_debug("%s: array sector first: %llu last: %llu\n", __func__, + (unsigned long long)r_sector_first, + (unsigned long long)r_sector_last); + + /* if start and end is 4k aligned, use a 4k block */ + if (block_size == 512 && + (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 && + (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0) + block_size = RAID5_STRIPE_SIZE(conf); + + /* iterate through blocks in strip */ + for (i = 0; i < strip_sectors; i += (block_size >> 9)) { + bool update_parity = false; + sector_t parity_sector; + struct md_rdev *parity_rdev; + struct stripe_head sh; + int disk; + int indent = 0; + + pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i); + indent += 2; + + memset(page_address(page1), 0, PAGE_SIZE); + + /* iterate through data member disks */ + for (disk = 0; disk < data_disks; disk++) { + int dd_idx; + struct md_rdev *rdev; + sector_t sector; + sector_t r_sector = r_sector_first + i + + (disk * conf->chunk_sectors); + + pr_debug("%s:%*s data member disk %d start\n", + __func__, indent, "", disk); + indent += 2; + + if (r_sector >= r_sector_last) { + pr_debug("%s:%*s array sector %llu doesn't need parity update\n", + __func__, indent, "", + (unsigned long long)r_sector); + indent -= 2; + continue; + } + + update_parity = true; + + /* map raid sector to member disk */ + sector = raid5_compute_sector(conf, r_sector, 0, + &dd_idx, NULL); + pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n", + __func__, indent, "", + (unsigned long long)r_sector, dd_idx, + (unsigned long long)sector); + + /* Array has not started so rcu dereference is safe */ + rdev = rcu_dereference_protected( + conf->disks[dd_idx].rdev, 1); + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { + pr_debug("%s:%*s data member disk %d missing\n", + __func__, indent, "", dd_idx); + update_parity = false; + break; + } + + pr_debug("%s:%*s reading data member disk %pg sector %llu\n", + __func__, indent, "", rdev->bdev, + (unsigned long long)sector); + if (!sync_page_io(rdev, sector, block_size, page2, + REQ_OP_READ, false)) { + md_error(mddev, rdev); + pr_debug("%s:%*s read failed!\n", __func__, + indent, ""); + ret = -EIO; + goto out; + } + + ppl_xor(block_size, page1, page2); + + indent -= 2; + } + + if (!update_parity) + continue; + + if (pp_size > 0) { + pr_debug("%s:%*s reading pp disk sector %llu\n", + __func__, indent, "", + (unsigned long long)(ppl_sector + i)); + if (!sync_page_io(log->rdev, + ppl_sector - log->rdev->data_offset + i, + block_size, page2, REQ_OP_READ, + false)) { + pr_debug("%s:%*s read failed!\n", __func__, + indent, ""); + md_error(mddev, log->rdev); + ret = -EIO; + goto out; + } + + ppl_xor(block_size, page1, page2); + } + + /* map raid sector to parity disk */ + parity_sector = raid5_compute_sector(conf, r_sector_first + i, + 0, &disk, &sh); + BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); + + /* Array has not started so rcu dereference is safe */ + parity_rdev = rcu_dereference_protected( + conf->disks[sh.pd_idx].rdev, 1); + + BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); + pr_debug("%s:%*s write parity at sector %llu, disk %pg\n", + __func__, indent, "", + (unsigned long long)parity_sector, + parity_rdev->bdev); + if (!sync_page_io(parity_rdev, parity_sector, block_size, + page1, REQ_OP_WRITE, false)) { + pr_debug("%s:%*s parity write error!\n", __func__, + indent, ""); + md_error(mddev, parity_rdev); + ret = -EIO; + goto out; + } + } +out: + if (page1) + __free_page(page1); + if (page2) + __free_page(page2); + return ret; +} + +static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, + sector_t offset) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct md_rdev *rdev = log->rdev; + struct mddev *mddev = rdev->mddev; + sector_t ppl_sector = rdev->ppl.sector + offset + + (PPL_HEADER_SIZE >> 9); + struct page *page; + int i; + int ret = 0; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* iterate through all PPL entries saved */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) { + struct ppl_header_entry *e = &pplhdr->entries[i]; + u32 pp_size = le32_to_cpu(e->pp_size); + sector_t sector = ppl_sector; + int ppl_entry_sectors = pp_size >> 9; + u32 crc, crc_stored; + + pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n", + __func__, rdev->raid_disk, i, + (unsigned long long)ppl_sector, pp_size); + + crc = ~0; + crc_stored = le32_to_cpu(e->checksum); + + /* read parial parity for this entry and calculate its checksum */ + while (pp_size) { + int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size; + + if (!sync_page_io(rdev, sector - rdev->data_offset, + s, page, REQ_OP_READ, false)) { + md_error(mddev, rdev); + ret = -EIO; + goto out; + } + + crc = crc32c_le(crc, page_address(page), s); + + pp_size -= s; + sector += s >> 9; + } + + crc = ~crc; + + if (crc != crc_stored) { + /* + * Don't recover this entry if the checksum does not + * match, but keep going and try to recover other + * entries. + */ + pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n", + __func__, crc_stored, crc); + ppl_conf->mismatch_count++; + } else { + ret = ppl_recover_entry(log, e, ppl_sector); + if (ret) + goto out; + ppl_conf->recovered_entries++; + } + + ppl_sector += ppl_entry_sectors; + } + + /* flush the disk cache after recovery if necessary */ + ret = blkdev_issue_flush(rdev->bdev); +out: + __free_page(page); + return ret; +} + +static int ppl_write_empty_header(struct ppl_log *log) +{ + struct page *page; + struct ppl_header *pplhdr; + struct md_rdev *rdev = log->rdev; + int ret = 0; + + pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__, + rdev->raid_disk, (unsigned long long)rdev->ppl.sector); + + page = alloc_page(GFP_NOIO | __GFP_ZERO); + if (!page) + return -ENOMEM; + + pplhdr = page_address(page); + /* zero out PPL space to avoid collision with old PPLs */ + blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector, + log->rdev->ppl.size, GFP_NOIO, 0); + memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); + pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); + pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); + + if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, + PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC | + REQ_FUA, false)) { + md_error(rdev->mddev, rdev); + ret = -EIO; + } + + __free_page(page); + return ret; +} + +static int ppl_load_distributed(struct ppl_log *log) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct md_rdev *rdev = log->rdev; + struct mddev *mddev = rdev->mddev; + struct page *page, *page2; + struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL; + u32 crc, crc_stored; + u32 signature; + int ret = 0, i; + sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0; + + pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); + /* read PPL headers, find the recent one */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + page2 = alloc_page(GFP_KERNEL); + if (!page2) { + __free_page(page); + return -ENOMEM; + } + + /* searching ppl area for latest ppl */ + while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) { + if (!sync_page_io(rdev, + rdev->ppl.sector - rdev->data_offset + + pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, + false)) { + md_error(mddev, rdev); + ret = -EIO; + /* if not able to read - don't recover any PPL */ + pplhdr = NULL; + break; + } + pplhdr = page_address(page); + + /* check header validity */ + crc_stored = le32_to_cpu(pplhdr->checksum); + pplhdr->checksum = 0; + crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + + if (crc_stored != crc) { + pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", + __func__, crc_stored, crc, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + signature = le32_to_cpu(pplhdr->signature); + + if (mddev->external) { + /* + * For external metadata the header signature is set and + * validated in userspace. + */ + ppl_conf->signature = signature; + } else if (ppl_conf->signature != signature) { + pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n", + __func__, signature, ppl_conf->signature, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) > + le64_to_cpu(pplhdr->generation)) { + /* previous was newest */ + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + prev_pplhdr_offset = pplhdr_offset; + prev_pplhdr = pplhdr; + + swap(page, page2); + + /* calculate next potential ppl offset */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) + pplhdr_offset += + le32_to_cpu(pplhdr->entries[i].pp_size) >> 9; + pplhdr_offset += PPL_HEADER_SIZE >> 9; + } + + /* no valid ppl found */ + if (!pplhdr) + ppl_conf->mismatch_count++; + else + pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n", + __func__, (unsigned long long)pplhdr_offset, + le64_to_cpu(pplhdr->generation)); + + /* attempt to recover from log if we are starting a dirty array */ + if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + ret = ppl_recover(log, pplhdr, pplhdr_offset); + + /* write empty header if we are starting the array */ + if (!ret && !mddev->pers) + ret = ppl_write_empty_header(log); + + __free_page(page); + __free_page(page2); + + pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", + __func__, ret, ppl_conf->mismatch_count, + ppl_conf->recovered_entries); + return ret; +} + +static int ppl_load(struct ppl_conf *ppl_conf) +{ + int ret = 0; + u32 signature = 0; + bool signature_set = false; + int i; + + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + + /* skip missing drive */ + if (!log->rdev) + continue; + + ret = ppl_load_distributed(log); + if (ret) + break; + + /* + * For external metadata we can't check if the signature is + * correct on a single drive, but we can check if it is the same + * on all drives. + */ + if (ppl_conf->mddev->external) { + if (!signature_set) { + signature = ppl_conf->signature; + signature_set = true; + } else if (signature != ppl_conf->signature) { + pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n", + mdname(ppl_conf->mddev)); + ret = -EINVAL; + break; + } + } + } + + pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", + __func__, ret, ppl_conf->mismatch_count, + ppl_conf->recovered_entries); + return ret; +} + +static void __ppl_exit_log(struct ppl_conf *ppl_conf) +{ + clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); + + kfree(ppl_conf->child_logs); + + bioset_exit(&ppl_conf->bs); + bioset_exit(&ppl_conf->flush_bs); + mempool_exit(&ppl_conf->io_pool); + kmem_cache_destroy(ppl_conf->io_kc); + + kfree(ppl_conf); +} + +void ppl_exit_log(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf = conf->log_private; + + if (ppl_conf) { + __ppl_exit_log(ppl_conf); + conf->log_private = NULL; + } +} + +static int ppl_validate_rdev(struct md_rdev *rdev) +{ + int ppl_data_sectors; + int ppl_size_new; + + /* + * The configured PPL size must be enough to store + * the header and (at the very least) partial parity + * for one stripe. Round it down to ensure the data + * space is cleanly divisible by stripe size. + */ + ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); + + if (ppl_data_sectors > 0) + ppl_data_sectors = rounddown(ppl_data_sectors, + RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private)); + + if (ppl_data_sectors <= 0) { + pr_warn("md/raid:%s: PPL space too small on %pg\n", + mdname(rdev->mddev), rdev->bdev); + return -ENOSPC; + } + + ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9); + + if ((rdev->ppl.sector < rdev->data_offset && + rdev->ppl.sector + ppl_size_new > rdev->data_offset) || + (rdev->ppl.sector >= rdev->data_offset && + rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { + pr_warn("md/raid:%s: PPL space overlaps with data on %pg\n", + mdname(rdev->mddev), rdev->bdev); + return -EINVAL; + } + + if (!rdev->mddev->external && + ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || + (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { + pr_warn("md/raid:%s: PPL space overlaps with superblock on %pg\n", + mdname(rdev->mddev), rdev->bdev); + return -EINVAL; + } + + rdev->ppl.size = ppl_size_new; + + return 0; +} + +static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) +{ + struct request_queue *q; + + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + + PPL_HEADER_SIZE) * 2) { + log->use_multippl = true; + set_bit(MD_HAS_MULTIPLE_PPLS, + &log->ppl_conf->mddev->flags); + log->entry_space = PPL_SPACE_SIZE; + } else { + log->use_multippl = false; + log->entry_space = (log->rdev->ppl.size << 9) - + PPL_HEADER_SIZE; + } + log->next_io_sector = rdev->ppl.sector; + + q = bdev_get_queue(rdev->bdev); + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + log->wb_cache_on = true; +} + +int ppl_init_log(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf; + struct mddev *mddev = conf->mddev; + int ret = 0; + int max_disks; + int i; + + pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", + mdname(conf->mddev)); + + if (PAGE_SIZE != 4096) + return -EINVAL; + + if (mddev->level != 5) { + pr_warn("md/raid:%s PPL is not compatible with raid level %d\n", + mdname(mddev), mddev->level); + return -EINVAL; + } + + if (mddev->bitmap_info.file || mddev->bitmap_info.offset) { + pr_warn("md/raid:%s PPL is not compatible with bitmap\n", + mdname(mddev)); + return -EINVAL; + } + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_warn("md/raid:%s PPL is not compatible with journal\n", + mdname(mddev)); + return -EINVAL; + } + + max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) * + BITS_PER_BYTE; + if (conf->raid_disks > max_disks) { + pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n", + mdname(mddev), max_disks); + return -EINVAL; + } + + ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); + if (!ppl_conf) + return -ENOMEM; + + ppl_conf->mddev = mddev; + + ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); + if (!ppl_conf->io_kc) { + ret = -ENOMEM; + goto err; + } + + ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc, + ppl_io_pool_free, ppl_conf->io_kc); + if (ret) + goto err; + + ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS); + if (ret) + goto err; + + ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0); + if (ret) + goto err; + + ppl_conf->count = conf->raid_disks; + ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), + GFP_KERNEL); + if (!ppl_conf->child_logs) { + ret = -ENOMEM; + goto err; + } + + atomic64_set(&ppl_conf->seq, 0); + INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); + spin_lock_init(&ppl_conf->no_mem_stripes_lock); + + if (!mddev->external) { + ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); + ppl_conf->block_size = 512; + } else { + ppl_conf->block_size = queue_logical_block_size(mddev->queue); + } + + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + /* Array has not started so rcu dereference is safe */ + struct md_rdev *rdev = + rcu_dereference_protected(conf->disks[i].rdev, 1); + + mutex_init(&log->io_mutex); + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->io_list); + + log->ppl_conf = ppl_conf; + log->rdev = rdev; + + if (rdev) { + ret = ppl_validate_rdev(rdev); + if (ret) + goto err; + + ppl_init_child_log(log, rdev); + } + } + + /* load and possibly recover the logs from the member disks */ + ret = ppl_load(ppl_conf); + + if (ret) { + goto err; + } else if (!mddev->pers && mddev->recovery_cp == 0 && + ppl_conf->recovered_entries > 0 && + ppl_conf->mismatch_count == 0) { + /* + * If we are starting a dirty array and the recovery succeeds + * without any issues, set the array as clean. + */ + mddev->recovery_cp = MaxSector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + } else if (mddev->pers && ppl_conf->mismatch_count > 0) { + /* no mismatch allowed when enabling PPL for a running array */ + ret = -EINVAL; + goto err; + } + + conf->log_private = ppl_conf; + set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); + + return 0; +err: + __ppl_exit_log(ppl_conf); + return ret; +} + +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_log *log; + int ret = 0; + + if (!rdev) + return -EINVAL; + + pr_debug("%s: disk: %d operation: %s dev: %pg\n", + __func__, rdev->raid_disk, add ? "add" : "remove", + rdev->bdev); + + if (rdev->raid_disk < 0) + return 0; + + if (rdev->raid_disk >= ppl_conf->count) + return -ENODEV; + + log = &ppl_conf->child_logs[rdev->raid_disk]; + + mutex_lock(&log->io_mutex); + if (add) { + ret = ppl_validate_rdev(rdev); + if (!ret) { + log->rdev = rdev; + ret = ppl_write_empty_header(log); + ppl_init_child_log(log, rdev); + } + } else { + log->rdev = NULL; + } + mutex_unlock(&log->io_mutex); + + return ret; +} + +static ssize_t +ppl_write_hint_show(struct mddev *mddev, char *buf) +{ + return sprintf(buf, "%d\n", 0); +} + +static ssize_t +ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + int err = 0; + unsigned short new; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtou16(page, 10, &new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (!raid5_has_ppl(conf) || !conf->log_private) + err = -EINVAL; + + mddev_unlock(mddev); + + return err ?: len; +} + +struct md_sysfs_entry +ppl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR, + ppl_write_hint_show, + ppl_write_hint_store); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 000000000..e4564ca1f --- /dev/null +++ b/drivers/md/raid5.c @@ -0,0 +1,9177 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * raid5.c : Multiple Devices driver for Linux + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar + * Copyright (C) 2002, 2003 H. Peter Anvin + * + * RAID-4/5/6 management functions. + * Thanks to Penguin Computing for making the RAID-6 development possible + * by donating a test server! + */ + +/* + * BITMAP UNPLUGGING: + * + * The sequencing for updating the bitmap reliably is a little + * subtle (and I got it wrong the first time) so it deserves some + * explanation. + * + * We group bitmap updates into batches. Each batch has a number. + * We may write out several batches at once, but that isn't very important. + * conf->seq_write is the number of the last batch successfully written. + * conf->seq_flush is the number of the last batch that was closed to + * new additions. + * When we discover that we will need to write to any block in a stripe + * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq + * the number of the batch it will be in. This is seq_flush+1. + * When we are ready to do a write, if that batch hasn't been written yet, + * we plug the array and queue the stripe for later. + * When an unplug happens, we increment bm_flush, thus closing the current + * batch. + * When we notice that bm_flush > bm_write, we write out all pending updates + * to the bitmap, and advance bm_write to where bm_flush was. + * This may occasionally write a bit out twice, but is sure never to + * miss any bits. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "md.h" +#include "raid5.h" +#include "raid0.h" +#include "md-bitmap.h" +#include "raid5-log.h" + +#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) + +#define cpu_to_group(cpu) cpu_to_node(cpu) +#define ANY_GROUP NUMA_NO_NODE + +#define RAID5_MAX_REQ_STRIPES 256 + +static bool devices_handle_discard_safely = false; +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); +static struct workqueue_struct *raid5_wq; + +static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) +{ + int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; + return &conf->stripe_hashtbl[hash]; +} + +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) +{ + return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) + __acquires(&conf->device_lock) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) + __releases(&conf->device_lock) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) + __acquires(&conf->device_lock) +{ + int i; + spin_lock_irq(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) + __releases(&conf->device_lock) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) + spin_unlock(conf->hash_locks + i); + spin_unlock_irq(conf->hash_locks); +} + +/* Find first data disk in a raid6 stripe */ +static inline int raid6_d0(struct stripe_head *sh) +{ + if (sh->ddf_layout) + /* ddf always start from first device */ + return 0; + /* md starts just after Q block */ + if (sh->qd_idx == sh->disks - 1) + return 0; + else + return sh->qd_idx + 1; +} +static inline int raid6_next_disk(int disk, int raid_disks) +{ + disk++; + return (disk < raid_disks) ? disk : 0; +} + +/* When walking through the disks in a raid5, starting at raid6_d0, + * We need to map each disk to a 'slot', where the data disks are slot + * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk + * is raid_disks-1. This help does that mapping. + */ +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, + int *count, int syndrome_disks) +{ + int slot = *count; + + if (sh->ddf_layout) + (*count)++; + if (idx == sh->pd_idx) + return syndrome_disks; + if (idx == sh->qd_idx) + return syndrome_disks + 1; + if (!sh->ddf_layout) + (*count)++; + return slot; +} + +static void print_raid5_conf (struct r5conf *conf); + +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + +static bool stripe_is_lowprio(struct stripe_head *sh) +{ + return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && + !test_bit(STRIPE_R5C_CACHING, &sh->state); +} + +static void raid5_wakeup_stripe_thread(struct stripe_head *sh) + __must_hold(&sh->raid_conf->device_lock) +{ + struct r5conf *conf = sh->raid_conf; + struct r5worker_group *group; + int thread_cnt; + int i, cpu = sh->cpu; + + if (!cpu_online(cpu)) { + cpu = cpumask_any(cpu_online_mask); + sh->cpu = cpu; + } + + if (list_empty(&sh->lru)) { + struct r5worker_group *group; + group = conf->worker_groups + cpu_to_group(cpu); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, &group->loprio_list); + else + list_add_tail(&sh->lru, &group->handle_list); + group->stripes_cnt++; + sh->group = group; + } + + if (conf->worker_cnt_per_group == 0) { + md_wakeup_thread(conf->mddev->thread); + return; + } + + group = conf->worker_groups + cpu_to_group(sh->cpu); + + group->workers[0].working = true; + /* at least one worker should run to avoid race */ + queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); + + thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; + /* wakeup more workers */ + for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { + if (group->workers[i].working == false) { + group->workers[i].working = true; + queue_work_on(sh->cpu, raid5_wq, + &group->workers[i].work); + thread_cnt--; + } + } +} + +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) +{ + int i; + int injournal = 0; /* number of date pages with R5_InJournal */ + + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + + if (r5c_is_writeback(conf->log)) + for (i = sh->disks; i--; ) + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + injournal++; + /* + * In the following cases, the stripe cannot be released to cached + * lists. Therefore, we make the stripe write out and set + * STRIPE_HANDLE: + * 1. when quiesce in r5c write back; + * 2. when resync is requested fot the stripe. + */ + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || + (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + } + + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + if (conf->worker_cnt_per_group == 0) { + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, + &conf->loprio_list); + else + list_add_tail(&sh->lru, + &conf->handle_list); + } else { + raid5_wakeup_stripe_thread(sh); + return; + } + } + md_wakeup_thread(conf->mddev->thread); + } else { + BUG_ON(stripe_operations_active(sh)); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + atomic_dec(&conf->active_stripes); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + if (!r5c_is_writeback(conf->log)) + list_add_tail(&sh->lru, temp_inactive_list); + else { + WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); + if (injournal == 0) + list_add_tail(&sh->lru, temp_inactive_list); + else if (injournal == conf->raid_disks - conf->max_degraded) { + /* full stripe */ + if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_cached_full_stripes); + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_dec(&conf->r5c_cached_partial_stripes); + list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); + r5c_check_cached_full_stripe(conf); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ + list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); + } + } + } +} + +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) +{ + if (atomic_dec_and_test(&sh->count)) + do_release_stripe(conf, sh, temp_inactive_list); +} + +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) +{ + int size; + bool do_wakeup = false; + unsigned long flags; + + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, raid5_get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; + } + + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (atomic_read(&conf->active_stripes) == 0) + wake_up(&conf->wait_for_quiescent); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } +} + +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) +{ + struct stripe_head *sh, *t; + int count = 0; + struct llist_node *head; + + head = llist_del_all(&conf->released_stripes); + head = llist_reverse_order(head); + llist_for_each_entry_safe(sh, t, head, release_list) { + int hash; + + /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ + smp_mb(); + clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); + /* + * Don't worry the bit is set here, because if the bit is set + * again, the count is always > 1. This is true for + * STRIPE_ON_UNPLUG_LIST bit too. + */ + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); + count++; + } + + return count; +} + +void raid5_release_stripe(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + unsigned long flags; + struct list_head list; + int hash; + bool wakeup; + + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + goto slow_path; + wakeup = llist_add(&sh->release_list, &conf->released_stripes); + if (wakeup) + md_wakeup_thread(conf->mddev->thread); + return; +slow_path: + /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ + if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); + spin_unlock_irqrestore(&conf->device_lock, flags); + release_inactive_stripe_list(conf, &list, hash); + } +} + +static inline void remove_hash(struct stripe_head *sh) +{ + pr_debug("remove_hash(), stripe %llu\n", + (unsigned long long)sh->sector); + + hlist_del_init(&sh->hash); +} + +static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) +{ + struct hlist_head *hp = stripe_hash(conf, sh->sector); + + pr_debug("insert_hash(), stripe %llu\n", + (unsigned long long)sh->sector); + + hlist_add_head(&sh->hash, hp); +} + +/* find an idle stripe, make sure it is unhashed, and return it. */ +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) +{ + struct stripe_head *sh = NULL; + struct list_head *first; + + if (list_empty(conf->inactive_list + hash)) + goto out; + first = (conf->inactive_list + hash)->next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); +out: + return sh; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static void free_stripe_pages(struct stripe_head *sh) +{ + int i; + struct page *p; + + /* Have not allocate page pool */ + if (!sh->pages) + return; + + for (i = 0; i < sh->nr_pages; i++) { + p = sh->pages[i]; + if (p) + put_page(p); + sh->pages[i] = NULL; + } +} + +static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) +{ + int i; + struct page *p; + + for (i = 0; i < sh->nr_pages; i++) { + /* The page have allocated. */ + if (sh->pages[i]) + continue; + + p = alloc_page(gfp); + if (!p) { + free_stripe_pages(sh); + return -ENOMEM; + } + sh->pages[i] = p; + } + return 0; +} + +static int +init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) +{ + int nr_pages, cnt; + + if (sh->pages) + return 0; + + /* Each of the sh->dev[i] need one conf->stripe_size */ + cnt = PAGE_SIZE / conf->stripe_size; + nr_pages = (disks + cnt - 1) / cnt; + + sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!sh->pages) + return -ENOMEM; + sh->nr_pages = nr_pages; + sh->stripes_per_page = cnt; + return 0; +} +#endif + +static void shrink_buffers(struct stripe_head *sh) +{ + int i; + int num = sh->raid_conf->pool_size; + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE + for (i = 0; i < num ; i++) { + struct page *p; + + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); + p = sh->dev[i].page; + if (!p) + continue; + sh->dev[i].page = NULL; + put_page(p); + } +#else + for (i = 0; i < num; i++) + sh->dev[i].page = NULL; + free_stripe_pages(sh); /* Free pages */ +#endif +} + +static int grow_buffers(struct stripe_head *sh, gfp_t gfp) +{ + int i; + int num = sh->raid_conf->pool_size; + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE + for (i = 0; i < num; i++) { + struct page *page; + + if (!(page = alloc_page(gfp))) { + return 1; + } + sh->dev[i].page = page; + sh->dev[i].orig_page = page; + sh->dev[i].offset = 0; + } +#else + if (alloc_stripe_pages(sh, gfp)) + return -ENOMEM; + + for (i = 0; i < num; i++) { + sh->dev[i].page = raid5_get_dev_page(sh, i); + sh->dev[i].orig_page = sh->dev[i].page; + sh->dev[i].offset = raid5_get_page_offset(sh, i); + } +#endif + return 0; +} + +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh); + +static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) +{ + struct r5conf *conf = sh->raid_conf; + int i, seq; + + BUG_ON(atomic_read(&sh->count) != 0); + BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + BUG_ON(stripe_operations_active(sh)); + BUG_ON(sh->batch_head); + + pr_debug("init_stripe called, stripe %llu\n", + (unsigned long long)sector); +retry: + seq = read_seqcount_begin(&conf->gen_lock); + sh->generation = conf->generation - previous; + sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; + sh->sector = sector; + stripe_set_idx(sector, conf, previous, sh); + sh->state = 0; + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->toread || dev->read || dev->towrite || dev->written || + test_bit(R5_LOCKED, &dev->flags)) { + pr_err("sector=%llx i=%d %p %p %p %p %d\n", + (unsigned long long)sh->sector, i, dev->toread, + dev->read, dev->towrite, dev->written, + test_bit(R5_LOCKED, &dev->flags)); + WARN_ON(1); + } + dev->flags = 0; + dev->sector = raid5_compute_blocknr(sh, i, previous); + } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; + sh->overwrite_disks = 0; + insert_hash(conf, sh); + sh->cpu = smp_processor_id(); + set_bit(STRIPE_BATCH_READY, &sh->state); +} + +static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, + short generation) +{ + struct stripe_head *sh; + + pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); + hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) + if (sh->sector == sector && sh->generation == generation) + return sh; + pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); + return NULL; +} + +static struct stripe_head *find_get_stripe(struct r5conf *conf, + sector_t sector, short generation, int hash) +{ + int inc_empty_inactive_list_flag; + struct stripe_head *sh; + + sh = __find_stripe(conf, sector, generation); + if (!sh) + return NULL; + + if (atomic_inc_not_zero(&sh->count)) + return sh; + + /* + * Slow path. The reference count is zero which means the stripe must + * be on a list (sh->lru). Must remove the stripe from the list that + * references it with the device_lock held. + */ + + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; + list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && + inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); + + return sh; +} + +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + * + * Most calls to this function hold &conf->device_lock. Calls + * in raid5_run() do not require the lock as no other threads + * have been started yet. + */ +int raid5_calc_degraded(struct r5conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (conf->raid_disks == conf->previous_raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded2++; + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static bool has_failed(struct r5conf *conf) +{ + int degraded = conf->mddev->degraded; + + if (test_bit(MD_BROKEN, &conf->mddev->flags)) + return true; + + if (conf->mddev->reshape_position != MaxSector) + degraded = raid5_calc_degraded(conf); + + return degraded > conf->max_degraded; +} + +enum stripe_result { + STRIPE_SUCCESS = 0, + STRIPE_RETRY, + STRIPE_SCHEDULE_AND_RETRY, + STRIPE_FAIL, +}; + +struct stripe_request_ctx { + /* a reference to the last stripe_head for batching */ + struct stripe_head *batch_last; + + /* first sector in the request */ + sector_t first_sector; + + /* last sector in the request */ + sector_t last_sector; + + /* + * bitmap to track stripe sectors that have been added to stripes + * add one to account for unaligned requests + */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); + + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; +}; + +/* + * Block until another thread clears R5_INACTIVE_BLOCKED or + * there are fewer than 3/4 the maximum number of active stripes + * and there is an inactive stripe available. + */ +static bool is_inactive_blocked(struct r5conf *conf, int hash) +{ + if (list_empty(conf->inactive_list + hash)) + return false; + + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + return true; + + return (atomic_read(&conf->active_stripes) < + (conf->max_nr_stripes * 3 / 4)); +} + +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + unsigned int flags) +{ + struct stripe_head *sh; + int hash = stripe_hash_locks_hash(conf, sector); + int previous = !!(flags & R5_GAS_PREVIOUS); + + pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); + + spin_lock_irq(conf->hash_locks + hash); + + for (;;) { + if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { + /* + * Must release the reference to batch_last before + * waiting, on quiesce, otherwise the batch_last will + * hold a reference to a stripe and raid5_quiesce() + * will deadlock waiting for active_stripes to go to + * zero. + */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + + wait_event_lock_irq(conf->wait_for_quiescent, + !conf->quiesce, + *(conf->hash_locks + hash)); + } + + sh = find_get_stripe(conf, sector, conf->generation - previous, + hash); + if (sh) + break; + + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { + sh = get_free_stripe(conf, hash); + if (sh) { + r5c_check_stripe_cache_usage(conf); + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + break; + } + + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + } + + if (flags & R5_GAS_NOBLOCK) + break; + + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + + /* release batch_last before wait to avoid risk of deadlock */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + + wait_event_lock_irq(conf->wait_for_stripe, + is_inactive_blocked(conf, hash), + *(conf->hash_locks + hash)); + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + } + + spin_unlock_irq(conf->hash_locks + hash); + return sh; +} + +static bool is_full_stripe_write(struct stripe_head *sh) +{ + BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); + return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); +} + +static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __acquires(&sh1->stripe_lock) + __acquires(&sh2->stripe_lock) +{ + if (sh1 > sh2) { + spin_lock_irq(&sh2->stripe_lock); + spin_lock_nested(&sh1->stripe_lock, 1); + } else { + spin_lock_irq(&sh1->stripe_lock); + spin_lock_nested(&sh2->stripe_lock, 1); + } +} + +static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __releases(&sh1->stripe_lock) + __releases(&sh2->stripe_lock) +{ + spin_unlock(&sh1->stripe_lock); + spin_unlock_irq(&sh2->stripe_lock); +} + +/* Only freshly new full stripe normal write stripe can be added to a batch list */ +static bool stripe_can_batch(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + + if (raid5_has_log(conf) || raid5_has_ppl(conf)) + return false; + return test_bit(STRIPE_BATCH_READY, &sh->state) && + !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && + is_full_stripe_write(sh); +} + +/* we only do back search */ +static void stripe_add_to_batch_list(struct r5conf *conf, + struct stripe_head *sh, struct stripe_head *last_sh) +{ + struct stripe_head *head; + sector_t head_sector, tmp_sec; + int hash; + int dd_idx; + + /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ + tmp_sec = sh->sector; + if (!sector_div(tmp_sec, conf->chunk_sectors)) + return; + head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); + + if (last_sh && head_sector == last_sh->sector) { + head = last_sh; + atomic_inc(&head->count); + } else { + hash = stripe_hash_locks_hash(conf, head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = find_get_stripe(conf, head_sector, conf->generation, + hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; + } + + lock_two_stripes(head, sh); + /* clear_batch_ready clear the flag */ + if (!stripe_can_batch(head) || !stripe_can_batch(sh)) + goto unlock_out; + + if (sh->batch_head) + goto unlock_out; + + dd_idx = 0; + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + dd_idx++; + if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || + bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) + goto unlock_out; + + if (head->batch_head) { + spin_lock(&head->batch_head->batch_lock); + /* This batch list is already running */ + if (!stripe_can_batch(head)) { + spin_unlock(&head->batch_head->batch_lock); + goto unlock_out; + } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; + + /* + * at this point, head's BATCH_READY could be cleared, but we + * can still add the stripe to batch list + */ + list_add(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_head->batch_lock); + } else { + head->batch_head = head; + sh->batch_head = head->batch_head; + spin_lock(&head->batch_lock); + list_add_tail(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_lock); + } + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + + if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { + int seq = sh->bm_seq; + if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && + sh->batch_head->bm_seq > seq) + seq = sh->batch_head->bm_seq; + set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); + sh->batch_head->bm_seq = seq; + } + + atomic_inc(&sh->count); +unlock_out: + unlock_two_stripes(head, sh); +out: + raid5_release_stripe(head); +} + +/* Determine if 'data_offset' or 'new_data_offset' should be used + * in this stripe_head. + */ +static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) +{ + sector_t progress = conf->reshape_progress; + /* Need a memory barrier to make sure we see the value + * of conf->generation, or ->data_offset that was set before + * reshape_progress was updated. + */ + smp_rmb(); + if (progress == MaxSector) + return 0; + if (sh->generation == conf->generation - 1) + return 0; + /* We are in a reshape, and this is a new-generation stripe, + * so use new_data_offset. + */ + return 1; +} + +static void dispatch_bio_list(struct bio_list *tmp) +{ + struct bio *bio; + + while ((bio = bio_list_pop(tmp))) + submit_bio_noacct(bio); +} + +static int cmp_stripe(void *priv, const struct list_head *a, + const struct list_head *b) +{ + const struct r5pending_data *da = list_entry(a, + struct r5pending_data, sibling); + const struct r5pending_data *db = list_entry(b, + struct r5pending_data, sibling); + if (da->sector > db->sector) + return 1; + if (da->sector < db->sector) + return -1; + return 0; +} + +static void dispatch_defer_bios(struct r5conf *conf, int target, + struct bio_list *list) +{ + struct r5pending_data *data; + struct list_head *first, *next = NULL; + int cnt = 0; + + if (conf->pending_data_cnt == 0) + return; + + list_sort(NULL, &conf->pending_list, cmp_stripe); + + first = conf->pending_list.next; + + /* temporarily move the head */ + if (conf->next_pending_data) + list_move_tail(&conf->pending_list, + &conf->next_pending_data->sibling); + + while (!list_empty(&conf->pending_list)) { + data = list_first_entry(&conf->pending_list, + struct r5pending_data, sibling); + if (&data->sibling == first) + first = data->sibling.next; + next = data->sibling.next; + + bio_list_merge(list, &data->bios); + list_move(&data->sibling, &conf->free_list); + cnt++; + if (cnt >= target) + break; + } + conf->pending_data_cnt -= cnt; + BUG_ON(conf->pending_data_cnt < 0 || cnt < target); + + if (next != &conf->pending_list) + conf->next_pending_data = list_entry(next, + struct r5pending_data, sibling); + else + conf->next_pending_data = NULL; + /* list isn't empty */ + if (first != &conf->pending_list) + list_move_tail(&conf->pending_list, first); +} + +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + + if (conf->pending_data_cnt == 0) + return; + + spin_lock(&conf->pending_bios_lock); + dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); + BUG_ON(conf->pending_data_cnt != 0); + spin_unlock(&conf->pending_bios_lock); + + dispatch_bio_list(&tmp); +} + +static void defer_issue_bios(struct r5conf *conf, sector_t sector, + struct bio_list *bios) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + struct r5pending_data *ent; + + spin_lock(&conf->pending_bios_lock); + ent = list_first_entry(&conf->free_list, struct r5pending_data, + sibling); + list_move_tail(&ent->sibling, &conf->pending_list); + ent->sector = sector; + bio_list_init(&ent->bios); + bio_list_merge(&ent->bios, bios); + conf->pending_data_cnt++; + if (conf->pending_data_cnt >= PENDING_IO_MAX) + dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); + + spin_unlock(&conf->pending_bios_lock); + + dispatch_bio_list(&tmp); +} + +static void +raid5_end_read_request(struct bio *bi); +static void +raid5_end_write_request(struct bio *bi); + +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + int i, disks = sh->disks; + struct stripe_head *head_sh = sh; + struct bio_list pending_bios = BIO_EMPTY_LIST; + struct r5dev *dev; + bool should_defer; + + might_sleep(); + + if (log_stripe(sh, s) == 0) + return; + + should_defer = conf->batch_bio_dispatch && conf->group_cnt; + + for (i = disks; i--; ) { + enum req_op op; + blk_opf_t op_flags = 0; + int replace_only = 0; + struct bio *bi, *rbi; + struct md_rdev *rdev, *rrdev = NULL; + + sh = head_sh; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + op = REQ_OP_WRITE; + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + op_flags = REQ_FUA; + if (test_bit(R5_Discard, &sh->dev[i].flags)) + op = REQ_OP_DISCARD; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + op = REQ_OP_READ; + else if (test_and_clear_bit(R5_WantReplace, + &sh->dev[i].flags)) { + op = REQ_OP_WRITE; + replace_only = 1; + } else + continue; + if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) + op_flags |= REQ_SYNC; + +again: + dev = &sh->dev[i]; + bi = &dev->req; + rbi = &dev->rreq; /* For writing to replacement */ + + rcu_read_lock(); + rrdev = rcu_dereference(conf->disks[i].replacement); + smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ + rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev) { + rdev = rrdev; + rrdev = NULL; + } + if (op_is_write(op)) { + if (replace_only) + rdev = NULL; + if (rdev == rrdev) + /* We raced and saw duplicates */ + rrdev = NULL; + } else { + if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) + rdev = rrdev; + rrdev = NULL; + } + + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + if (rrdev && test_bit(Faulty, &rrdev->flags)) + rrdev = NULL; + if (rrdev) + atomic_inc(&rrdev->nr_pending); + rcu_read_unlock(); + + /* We have already checked bad blocks for reads. Now + * need to check for writes. We never accept write errors + * on the replacement, so we don't to check rrdev. + */ + while (op_is_write(op) && rdev && + test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), + &first_bad, &bad_sectors); + if (!bad) + break; + + if (bad < 0) { + set_bit(BlockedBadBlocks, &rdev->flags); + if (!conf->mddev->external && + conf->mddev->sb_flags) { + /* It is very unlikely, but we might + * still need to write out the + * bad block log - better give it + * a chance*/ + md_check_recovery(conf->mddev); + } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. + */ + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, conf->mddev); + } else { + /* Acknowledged bad block - skip the write */ + rdev_dec_pending(rdev, conf->mddev); + rdev = NULL; + } + } + + if (rdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); + bi->bi_end_io = op_is_write(op) + ? raid5_end_write_request + : raid5_end_read_request; + bi->bi_private = sh; + + pr_debug("%s: for %llu schedule op %d on disc %d\n", + __func__, (unsigned long long)sh->sector, + bi->bi_opf, i); + atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); + if (use_new_offset(conf, sh)) + bi->bi_iter.bi_sector = (sh->sector + + rdev->new_data_offset); + else + bi->bi_iter.bi_sector = (sh->sector + + rdev->data_offset); + if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) + bi->bi_opf |= REQ_NOMERGE; + + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + + if (!op_is_write(op) && + test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * issuing read for a page in journal, this + * must be preparing for prexor in rmw; read + * the data into orig_page + */ + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; + else + sh->dev[i].vec.bv_page = sh->dev[i].page; + bi->bi_vcnt = 1; + bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); + bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; + bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (op == REQ_OP_DISCARD) + bi->bi_vcnt = 0; + if (rrdev) + set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); + + if (conf->mddev->gendisk) + trace_block_bio_remap(bi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, bi); + else + submit_bio_noacct(bi); + } + if (rrdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); + BUG_ON(!op_is_write(op)); + rbi->bi_end_io = raid5_end_write_request; + rbi->bi_private = sh; + + pr_debug("%s: for %llu schedule op %d on " + "replacement disc %d\n", + __func__, (unsigned long long)sh->sector, + rbi->bi_opf, i); + atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); + if (use_new_offset(conf, sh)) + rbi->bi_iter.bi_sector = (sh->sector + + rrdev->new_data_offset); + else + rbi->bi_iter.bi_sector = (sh->sector + + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; + rbi->bi_vcnt = 1; + rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); + rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; + rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (op == REQ_OP_DISCARD) + rbi->bi_vcnt = 0; + if (conf->mddev->gendisk) + trace_block_bio_remap(rbi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, rbi); + else + submit_bio_noacct(rbi); + } + if (!rdev && !rrdev) { + if (op_is_write(op)) + set_bit(STRIPE_DEGRADED, &sh->state); + pr_debug("skip op %d on disc %d for sector %llu\n", + bi->bi_opf, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + + if (!head_sh->batch_head) + continue; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + if (sh != head_sh) + goto again; + } + + if (should_defer && !bio_list_empty(&pending_bios)) + defer_issue_bios(conf, head_sh->sector, &pending_bios); +} + +static struct dma_async_tx_descriptor * +async_copy_data(int frombio, struct bio *bio, struct page **page, + unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh, int no_skipcopy) +{ + struct bio_vec bvl; + struct bvec_iter iter; + struct page *bio_page; + int page_offset; + struct async_submit_ctl submit; + enum async_tx_flags flags = 0; + struct r5conf *conf = sh->raid_conf; + + if (bio->bi_iter.bi_sector >= sector) + page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; + else + page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; + + if (frombio) + flags |= ASYNC_TX_FENCE; + init_async_submit(&submit, flags, tx, NULL, NULL, NULL); + + bio_for_each_segment(bvl, bio, iter) { + int len = bvl.bv_len; + int clen; + int b_offset = 0; + + if (page_offset < 0) { + b_offset = -page_offset; + page_offset += b_offset; + len -= b_offset; + } + + if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) + clen = RAID5_STRIPE_SIZE(conf) - page_offset; + else + clen = len; + + if (clen > 0) { + b_offset += bvl.bv_offset; + bio_page = bvl.bv_page; + if (frombio) { + if (conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == RAID5_STRIPE_SIZE(conf) && + !no_skipcopy) + *page = bio_page; + else + tx = async_memcpy(*page, bio_page, page_offset + poff, + b_offset, clen, &submit); + } else + tx = async_memcpy(bio_page, *page, b_offset, + page_offset + poff, clen, &submit); + } + /* chain the operations */ + submit.depend_tx = tx; + + if (clen < len) /* hit end of page */ + break; + page_offset += len; + } + + return tx; +} + +static void ops_complete_biofill(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int i; + struct r5conf *conf = sh->raid_conf; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* clear completed biofills */ + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + /* acknowledge completion of a biofill operation */ + /* and check if we need to reply to a read request, + * new R5_Wantfill requests are held off until + * !STRIPE_BIOFILL_RUN + */ + if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { + struct bio *rbi, *rbi2; + + BUG_ON(!dev->read); + rbi = dev->read; + dev->read = NULL; + while (rbi && rbi->bi_iter.bi_sector < + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + rbi2 = r5_next_bio(conf, rbi, dev->sector); + bio_endio(rbi); + rbi = rbi2; + } + } + } + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); + + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); +} + +static void ops_run_biofill(struct stripe_head *sh) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct async_submit_ctl submit; + int i; + struct r5conf *conf = sh->raid_conf; + + BUG_ON(sh->batch_head); + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_Wantfill, &dev->flags)) { + struct bio *rbi; + spin_lock_irq(&sh->stripe_lock); + dev->read = rbi = dev->toread; + dev->toread = NULL; + spin_unlock_irq(&sh->stripe_lock); + while (rbi && rbi->bi_iter.bi_sector < + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + tx = async_copy_data(0, rbi, &dev->page, + dev->offset, + dev->sector, tx, sh, 0); + rbi = r5_next_bio(conf, rbi, dev->sector); + } + } + } + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); + async_trigger_callback(&submit); +} + +static void mark_target_uptodate(struct stripe_head *sh, int target) +{ + struct r5dev *tgt; + + if (target < 0) + return; + + tgt = &sh->dev[target]; + set_bit(R5_UPTODATE, &tgt->flags); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + clear_bit(R5_Wantcompute, &tgt->flags); +} + +static void ops_complete_compute(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* mark the computed target(s) as uptodate */ + mark_target_uptodate(sh, sh->ops.target); + mark_target_uptodate(sh, sh->ops.target2); + + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); +} + +/* return a pointer to the address conversion region of the scribble buffer */ +static struct page **to_addr_page(struct raid5_percpu *percpu, int i) +{ + return percpu->scribble + i * percpu->scribble_obj_size; +} + +/* return a pointer to the address conversion region of the scribble buffer */ +static addr_conv_t *to_addr_conv(struct stripe_head *sh, + struct raid5_percpu *percpu, int i) +{ + return (void *) (to_addr_page(percpu, i) + sh->disks + 2); +} + +/* + * Return a pointer to record offset address. + */ +static unsigned int * +to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); +} + +static struct dma_async_tx_descriptor * +ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); + int target = sh->ops.target; + struct r5dev *tgt = &sh->dev[target]; + struct page *xor_dest = tgt->page; + unsigned int off_dest = tgt->offset; + int count = 0; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + int i; + + BUG_ON(sh->batch_head); + + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + + for (i = disks; i--; ) { + if (i != target) { + off_srcs[count] = sh->dev[i].offset; + xor_srcs[count++] = sh->dev[i].page; + } + } + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + else + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + + return tx; +} + +/* set_syndrome_sources - populate source buffers for gen_syndrome + * @srcs - (struct page *) array of size sh->disks + * @offs - (unsigned int) array of offset for each page + * @sh - stripe_head to parse + * + * Populates srcs in proper layout order for the stripe and returns the + * 'count' of sources to be used in a call to async_gen_syndrome. The P + * destination buffer is recorded in srcs[count] and the Q destination + * is recorded in srcs[count+1]]. + */ +static int set_syndrome_sources(struct page **srcs, + unsigned int *offs, + struct stripe_head *sh, + int srctype) +{ + int disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); + int d0_idx = raid6_d0(sh); + int count; + int i; + + for (i = 0; i < disks; i++) + srcs[i] = NULL; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + struct r5dev *dev = &sh->dev[i]; + + if (i == sh->qd_idx || i == sh->pd_idx || + (srctype == SYNDROME_SRC_ALL) || + (srctype == SYNDROME_SRC_WANT_DRAIN && + (test_bit(R5_Wantdrain, &dev->flags) || + test_bit(R5_InJournal, &dev->flags))) || + (srctype == SYNDROME_SRC_WRITTEN && + (dev->written || + test_bit(R5_InJournal, &dev->flags)))) { + if (test_bit(R5_InJournal, &dev->flags)) + srcs[slot] = sh->dev[i].orig_page; + else + srcs[slot] = sh->dev[i].page; + /* + * For R5_InJournal, PAGE_SIZE must be 4KB and will + * not shared page. In that case, dev[i].offset + * is 0. + */ + offs[slot] = sh->dev[i].offset; + } + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + return syndrome_disks; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); + int target; + int qd_idx = sh->qd_idx; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + struct r5dev *tgt; + struct page *dest; + unsigned int dest_off; + int i; + int count; + + BUG_ON(sh->batch_head); + if (sh->ops.target < 0) + target = sh->ops.target2; + else if (sh->ops.target2 < 0) + target = sh->ops.target; + else + /* we should only have one valid target */ + BUG(); + BUG_ON(target < 0); + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + + tgt = &sh->dev[target]; + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + dest = tgt->page; + dest_off = tgt->offset; + + atomic_inc(&sh->count); + + if (target == qd_idx) { + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); + blocks[count] = NULL; /* regenerating p is not necessary */ + BUG_ON(blocks[count+1] != dest); /* q should already be set */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + } else { + /* Compute any data- or p-drive using XOR */ + count = 0; + for (i = disks; i-- ; ) { + if (i == target || i == qd_idx) + continue; + offs[count] = sh->dev[i].offset; + blocks[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, ops_complete_compute, sh, + to_addr_conv(sh, percpu, 0)); + tx = async_xor_offs(dest, dest_off, blocks, offs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + } + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + int target = sh->ops.target; + int target2 = sh->ops.target2; + struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt2 = &sh->dev[target2]; + struct dma_async_tx_descriptor *tx; + struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); + struct async_submit_ctl submit; + + BUG_ON(sh->batch_head); + pr_debug("%s: stripe %llu block1: %d block2: %d\n", + __func__, (unsigned long long)sh->sector, target, target2); + BUG_ON(target < 0 || target2 < 0); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); + + /* we need to open-code set_syndrome_sources to handle the + * slot number conversion for 'faila' and 'failb' + */ + for (i = 0; i < disks ; i++) { + offs[i] = 0; + blocks[i] = NULL; + } + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + offs[slot] = sh->dev[i].offset; + blocks[slot] = sh->dev[i].page; + + if (i == target) + faila = slot; + if (i == target2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + pr_debug("%s: stripe: %llu faila: %d failb: %d\n", + __func__, (unsigned long long)sh->sector, faila, failb); + + atomic_inc(&sh->count); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu, 0)); + return async_gen_syndrome(blocks, offs, syndrome_disks+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); + } else { + struct page *dest; + unsigned int dest_off; + int data_target; + int qd_idx = sh->qd_idx; + + /* Missing D+Q: recompute D from P, then recompute Q */ + if (target == qd_idx) + data_target = target2; + else + data_target = target; + + count = 0; + for (i = disks; i-- ; ) { + if (i == data_target || i == qd_idx) + continue; + offs[count] = sh->dev[i].offset; + blocks[count++] = sh->dev[i].page; + } + dest = sh->dev[data_target].page; + dest_off = sh->dev[data_target].offset; + init_async_submit(&submit, + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, NULL, NULL, + to_addr_conv(sh, percpu, 0)); + tx = async_xor_offs(dest, dest_off, blocks, offs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); + + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); + init_async_submit(&submit, ASYNC_TX_FENCE, tx, + ops_complete_compute, sh, + to_addr_conv(sh, percpu, 0)); + return async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); + } + } else { + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu, 0)); + if (failb == syndrome_disks) { + /* We're missing D+P. */ + return async_raid6_datap_recov(syndrome_disks+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, + blocks, offs, &submit); + } else { + /* We're missing D+D. */ + return async_raid6_2data_recov(syndrome_disks+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, failb, + blocks, offs, &submit); + } + } +} + +static void ops_complete_prexor(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + if (r5c_is_writeback(sh->raid_conf->log)) + /* + * raid5-cache write back uses orig_page during prexor. + * After prexor, it is time to free orig_page + */ + r5c_release_extra_page(sh); +} + +static struct dma_async_tx_descriptor * +ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); + int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; + + /* existing parity data subtracted */ + unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; + struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + + BUG_ON(sh->batch_head); + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* Only process blocks that are known to be uptodate */ + if (test_bit(R5_InJournal, &dev->flags)) { + /* + * For this case, PAGE_SIZE must be equal to 4KB and + * page offset is zero. + */ + off_srcs[count] = dev->offset; + xor_srcs[count++] = dev->orig_page; + } else if (test_bit(R5_Wantdrain, &dev->flags)) { + off_srcs[count] = dev->offset; + xor_srcs[count++] = dev->page; + } + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); + int count; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +{ + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks; + int i; + struct stripe_head *head_sh = sh; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev; + struct bio *chosen; + + sh = head_sh; + if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { + struct bio *wbi; + +again: + dev = &sh->dev[i]; + /* + * clear R5_InJournal, so when rewriting a page in + * journal, it is not skipped by r5l_log_stripe() + */ + clear_bit(R5_InJournal, &dev->flags); + spin_lock_irq(&sh->stripe_lock); + chosen = dev->towrite; + dev->towrite = NULL; + sh->overwrite_disks = 0; + BUG_ON(dev->written); + wbi = dev->written = chosen; + spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); + + while (wbi && wbi->bi_iter.bi_sector < + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + if (wbi->bi_opf & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); + if (wbi->bi_opf & REQ_SYNC) + set_bit(R5_SyncIO, &dev->flags); + if (bio_op(wbi) == REQ_OP_DISCARD) + set_bit(R5_Discard, &dev->flags); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->offset, + dev->sector, tx, sh, + r5c_is_writeback(conf->log)); + if (dev->page != dev->orig_page && + !r5c_is_writeback(conf->log)) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } + wbi = r5_next_bio(conf, wbi, dev->sector); + } + + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if (sh == head_sh) + continue; + goto again; + } + } + } + + return tx; +} + +static void ops_complete_reconstruct(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + int i; + bool fua = false, sync = false, discard = false; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); + } + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->written || i == pd_idx || i == qd_idx) { + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { + set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } + if (fua) + set_bit(R5_WantFUA, &dev->flags); + if (sync) + set_bit(R5_SyncIO, &dev->flags); + } + } + + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } + + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); +} + +static void +ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **xor_srcs; + unsigned int *off_srcs; + struct async_submit_ctl submit; + int count, pd_idx = sh->pd_idx, i; + struct page *xor_dest; + unsigned int off_dest; + int prexor = 0; + unsigned long flags; + int j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } +again: + count = 0; + xor_srcs = to_addr_page(percpu, j); + off_srcs = to_addr_offs(sh, percpu); + /* check if prexor is active which means only process blocks + * that are part of a read-modify-write (written) + */ + if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; + off_dest = off_srcs[count] = sh->dev[pd_idx].offset; + xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (head_sh->dev[i].written || + test_bit(R5_InJournal, &head_sh->dev[i].flags)) { + off_srcs[count] = dev->offset; + xor_srcs[count++] = dev->page; + } + } + } else { + xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i != pd_idx) { + off_srcs[count] = dev->offset; + xor_srcs[count++] = dev->page; + } + } + } + + /* 1/ if we prexor'd then the dest is reused as a source + * 2/ if we did not prexor then we are redoing the parity + * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST + * for the synchronous xor case + */ + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; + if (last_stripe) { + flags = ASYNC_TX_ACK | + (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); + + atomic_inc(&head_sh->count); + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, + to_addr_conv(sh, percpu, j)); + } else { + flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; + init_async_submit(&submit, flags, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); + } + + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + else + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } +} + +static void +ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct async_submit_ctl submit; + struct page **blocks; + unsigned int *offs; + int count, i, j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; + int synflags; + unsigned long txflags; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + +again: + blocks = to_addr_page(percpu, j); + offs = to_addr_offs(sh, percpu); + + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + synflags = SYNDROME_SRC_WRITTEN; + txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; + } else { + synflags = SYNDROME_SRC_ALL; + txflags = ASYNC_TX_ACK; + } + + count = set_syndrome_sources(blocks, offs, sh, synflags); + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; + + if (last_stripe) { + atomic_inc(&head_sh->count); + init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, + head_sh, to_addr_conv(sh, percpu, j)); + } else + init_async_submit(&submit, 0, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } +} + +static void ops_complete_check(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + sh->check_state = check_state_check_result; + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); +} + +static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct page *xor_dest; + unsigned int off_dest; + struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + int count; + int i; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + BUG_ON(sh->batch_head); + count = 0; + xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; + off_srcs[count] = off_dest; + xor_srcs[count++] = xor_dest; + for (i = disks; i--; ) { + if (i == pd_idx || i == qd_idx) + continue; + off_srcs[count] = sh->dev[i].offset; + xor_srcs[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, 0, NULL, NULL, NULL, + to_addr_conv(sh, percpu, 0)); + tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), + &sh->ops.zero_sum_result, &submit); + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); + tx = async_trigger_callback(&submit); +} + +static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) +{ + struct page **srcs = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); + struct async_submit_ctl submit; + int count; + + pr_debug("%s: stripe %llu checkp: %d\n", __func__, + (unsigned long long)sh->sector, checkp); + + BUG_ON(sh->batch_head); + count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); + if (!checkp) + srcs[count] = NULL; + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, + sh, to_addr_conv(sh, percpu, 0)); + async_syndrome_val(srcs, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); +} + +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +{ + int overlap_clear = 0, i, disks = sh->disks; + struct dma_async_tx_descriptor *tx = NULL; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; + struct raid5_percpu *percpu; + + local_lock(&conf->percpu->lock); + percpu = this_cpu_ptr(conf->percpu); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; + } + + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + if (level < 6) + tx = ops_run_compute5(sh, percpu); + else { + if (sh->ops.target2 < 0 || sh->ops.target < 0) + tx = ops_run_compute6_1(sh, percpu); + else + tx = ops_run_compute6_2(sh, percpu); + } + /* terminate the chain if reconstruct is not set to be run */ + if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) + async_tx_ack(tx); + } + + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { + if (level < 6) + tx = ops_run_prexor5(sh, percpu, tx); + else + tx = ops_run_prexor6(sh, percpu, tx); + } + + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); + overlap_clear++; + } + + if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (level < 6) + ops_run_reconstruct5(sh, percpu, tx); + else + ops_run_reconstruct6(sh, percpu, tx); + } + + if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (sh->check_state == check_state_run) + ops_run_check_p(sh, percpu); + else if (sh->check_state == check_state_run_q) + ops_run_check_pq(sh, percpu, 0); + else if (sh->check_state == check_state_run_pq) + ops_run_check_pq(sh, percpu, 1); + else + BUG(); + } + + if (overlap_clear && !sh->batch_head) { + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } + } + local_unlock(&conf->percpu->lock); +} + +static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +{ +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + kfree(sh->pages); +#endif + if (sh->ppl_page) + __free_page(sh->ppl_page); + kmem_cache_free(sc, sh); +} + +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, + int disks, struct r5conf *conf) +{ + struct stripe_head *sh; + + sh = kmem_cache_zalloc(sc, gfp); + if (sh) { + spin_lock_init(&sh->stripe_lock); + spin_lock_init(&sh->batch_lock); + INIT_LIST_HEAD(&sh->batch_list); + INIT_LIST_HEAD(&sh->lru); + INIT_LIST_HEAD(&sh->r5c); + INIT_LIST_HEAD(&sh->log_list); + atomic_set(&sh->count, 1); + sh->raid_conf = conf; + sh->log_start = MaxSector; + + if (raid5_has_ppl(conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) { + free_stripe(sc, sh); + return NULL; + } + } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + if (init_stripe_shared_pages(sh, conf, disks)) { + free_stripe(sc, sh); + return NULL; + } +#endif + } + return sh; +} +static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) +{ + struct stripe_head *sh; + + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); + if (!sh) + return 0; + + if (grow_buffers(sh, gfp)) { + shrink_buffers(sh); + free_stripe(conf->slab_cache, sh); + return 0; + } + sh->hash_lock_index = + conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + /* we just created an active stripe so... */ + atomic_inc(&conf->active_stripes); + + raid5_release_stripe(sh); + conf->max_nr_stripes++; + return 1; +} + +static int grow_stripes(struct r5conf *conf, int num) +{ + struct kmem_cache *sc; + size_t namelen = sizeof(conf->cache_name[0]); + int devs = max(conf->raid_disks, conf->previous_raid_disks); + + if (conf->mddev->gendisk) + snprintf(conf->cache_name[0], namelen, + "raid%d-%s", conf->level, mdname(conf->mddev)); + else + snprintf(conf->cache_name[0], namelen, + "raid%d-%p", conf->level, conf->mddev); + snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); + + conf->active_name = 0; + sc = kmem_cache_create(conf->cache_name[conf->active_name], + sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), + 0, 0, NULL); + if (!sc) + return 1; + conf->slab_cache = sc; + conf->pool_size = devs; + while (num--) + if (!grow_one_stripe(conf, GFP_KERNEL)) + return 1; + + return 0; +} + +/** + * scribble_alloc - allocate percpu scribble buffer for required size + * of the scribble region + * @percpu: from for_each_present_cpu() of the caller + * @num: total number of disks in the array + * @cnt: scribble objs count for required size of the scribble region + * + * The scribble buffer size must be enough to contain: + * 1/ a struct page pointer for each device in the array +2 + * 2/ room to convert each entry in (1) to its corresponding dma + * (dma_map_page()) or page (page_address()) address. + * + * Note: the +2 is for the destination buffers of the ddf/raid6 case where we + * calculate over all devices (not just the data blocks), using zeros in place + * of the P and Q blocks. + */ +static int scribble_alloc(struct raid5_percpu *percpu, + int num, int cnt) +{ + size_t obj_size = + sizeof(struct page *) * (num + 2) + + sizeof(addr_conv_t) * (num + 2) + + sizeof(unsigned int) * (num + 2); + void *scribble; + + /* + * If here is in raid array suspend context, it is in memalloc noio + * context as well, there is no potential recursive memory reclaim + * I/Os with the GFP_KERNEL flag. + */ + scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); + if (!scribble) + return -ENOMEM; + + kvfree(percpu->scribble); + + percpu->scribble = scribble; + percpu->scribble_obj_size = obj_size; + return 0; +} + +static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) +{ + unsigned long cpu; + int err = 0; + + /* + * Never shrink. And mddev_suspend() could deadlock if this is called + * from raid5d. In that case, scribble_disks and scribble_sectors + * should equal to new_disks and new_sectors + */ + if (conf->scribble_disks >= new_disks && + conf->scribble_sectors >= new_sectors) + return 0; + mddev_suspend(conf->mddev); + cpus_read_lock(); + + for_each_present_cpu(cpu) { + struct raid5_percpu *percpu; + + percpu = per_cpu_ptr(conf->percpu, cpu); + err = scribble_alloc(percpu, new_disks, + new_sectors / RAID5_STRIPE_SECTORS(conf)); + if (err) + break; + } + + cpus_read_unlock(); + mddev_resume(conf->mddev); + if (!err) { + conf->scribble_disks = new_disks; + conf->scribble_sectors = new_sectors; + } + return err; +} + +static int resize_stripes(struct r5conf *conf, int newsize) +{ + /* Make all the stripes able to hold 'newsize' devices. + * New slots in each stripe get 'page' set to a new page. + * + * This happens in stages: + * 1/ create a new kmem_cache and allocate the required number of + * stripe_heads. + * 2/ gather all the old stripe_heads and transfer the pages across + * to the new stripe_heads. This will have the side effect of + * freezing the array as once all stripe_heads have been collected, + * no IO will be possible. Old stripe heads are freed once their + * pages have been transferred over, and the old kmem_cache is + * freed when all stripes are done. + * 3/ reallocate conf->disks to be suitable bigger. If this fails, + * we simple return a failure status - no need to clean anything up. + * 4/ allocate new pages for the new slots in the new stripe_heads. + * If this fails, we don't bother trying the shrink the + * stripe_heads down again, we just leave them as they are. + * As each stripe_head is processed the new one is released into + * active service. + * + * Once step2 is started, we cannot afford to wait for a write, + * so we use GFP_NOIO allocations. + */ + struct stripe_head *osh, *nsh; + LIST_HEAD(newstripes); + struct disk_info *ndisks; + int err = 0; + struct kmem_cache *sc; + int i; + int hash, cnt; + + md_allow_write(conf->mddev); + + /* Step 1 */ + sc = kmem_cache_create(conf->cache_name[1-conf->active_name], + sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), + 0, 0, NULL); + if (!sc) + return -ENOMEM; + + /* Need to ensure auto-resizing doesn't interfere */ + mutex_lock(&conf->cache_size_mutex); + + for (i = conf->max_nr_stripes; i; i--) { + nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); + if (!nsh) + break; + + list_add(&nsh->lru, &newstripes); + } + if (i) { + /* didn't get enough, give up */ + while (!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del(&nsh->lru); + free_stripe(sc, nsh); + } + kmem_cache_destroy(sc); + mutex_unlock(&conf->cache_size_mutex); + return -ENOMEM; + } + /* Step 2 - Must use GFP_NOIO now. + * OK, we have enough stripes, start collecting inactive + * stripes and copying them over + */ + hash = 0; + cnt = 0; + list_for_each_entry(nsh, &newstripes, lru) { + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < osh->nr_pages; i++) { + nsh->pages[i] = osh->pages[i]; + osh->pages[i] = NULL; + } +#endif + for(i=0; ipool_size; i++) { + nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + nsh->dev[i].offset = osh->dev[i].offset; + } + nsh->hash_lock_index = hash; + free_stripe(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } + } + kmem_cache_destroy(conf->slab_cache); + + /* Step 3. + * At this point, we are holding all the stripes so the array + * is completely stalled, so now is a good time to resize + * conf->disks and the scribble region + */ + ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO); + if (ndisks) { + for (i = 0; i < conf->pool_size; i++) + ndisks[i] = conf->disks[i]; + + for (i = conf->pool_size; i < newsize; i++) { + ndisks[i].extra_page = alloc_page(GFP_NOIO); + if (!ndisks[i].extra_page) + err = -ENOMEM; + } + + if (err) { + for (i = conf->pool_size; i < newsize; i++) + if (ndisks[i].extra_page) + put_page(ndisks[i].extra_page); + kfree(ndisks); + } else { + kfree(conf->disks); + conf->disks = ndisks; + } + } else + err = -ENOMEM; + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + + /* Step 4, return new stripes to service */ + while(!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del_init(&nsh->lru); + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < nsh->nr_pages; i++) { + if (nsh->pages[i]) + continue; + nsh->pages[i] = alloc_page(GFP_NOIO); + if (!nsh->pages[i]) + err = -ENOMEM; + } + + for (i = conf->raid_disks; i < newsize; i++) { + if (nsh->dev[i].page) + continue; + nsh->dev[i].page = raid5_get_dev_page(nsh, i); + nsh->dev[i].orig_page = nsh->dev[i].page; + nsh->dev[i].offset = raid5_get_page_offset(nsh, i); + } +#else + for (i=conf->raid_disks; i < newsize; i++) + if (nsh->dev[i].page == NULL) { + struct page *p = alloc_page(GFP_NOIO); + nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; + nsh->dev[i].offset = 0; + if (!p) + err = -ENOMEM; + } +#endif + raid5_release_stripe(nsh); + } + /* critical section pass, GFP_NOIO no longer needed */ + + if (!err) + conf->pool_size = newsize; + mutex_unlock(&conf->cache_size_mutex); + + return err; +} + +static int drop_one_stripe(struct r5conf *conf) +{ + struct stripe_head *sh; + int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; + + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!sh) + return 0; + BUG_ON(atomic_read(&sh->count)); + shrink_buffers(sh); + free_stripe(conf->slab_cache, sh); + atomic_dec(&conf->active_stripes); + conf->max_nr_stripes--; + return 1; +} + +static void shrink_stripes(struct r5conf *conf) +{ + while (conf->max_nr_stripes && + drop_one_stripe(conf)) + ; + + kmem_cache_destroy(conf->slab_cache); + conf->slab_cache = NULL; +} + +/* + * This helper wraps rcu_dereference_protected() and can be used when + * it is known that the nr_pending of the rdev is elevated. + */ +static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + atomic_read(&rcu_access_pointer(rdev)->nr_pending)); +} + +/* + * This helper wraps rcu_dereference_protected() and should be used + * when it is known that the mddev_lock() is held. This is safe + * seeing raid5_remove_disk() has the same lock held. + */ +static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, + struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + lockdep_is_held(&mddev->reconfig_mutex)); +} + +static void raid5_end_read_request(struct bio * bi) +{ + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks, i; + struct md_rdev *rdev = NULL; + sector_t s; + + for (i=0 ; idev[i].req) + break; + + pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", + (unsigned long long)sh->sector, i, atomic_read(&sh->count), + bi->bi_status); + if (i == disks) { + BUG(); + return; + } + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + /* If replacement finished while this request was outstanding, + * 'replacement' might be NULL already. + * In that case it moved down to 'rdev'. + * rdev is not removed until all requests are finished. + */ + rdev = rdev_pend_deref(conf->disks[i].replacement); + if (!rdev) + rdev = rdev_pend_deref(conf->disks[i].rdev); + + if (use_new_offset(conf, sh)) + s = sh->sector + rdev->new_data_offset; + else + s = sh->sector + rdev->data_offset; + if (!bi->bi_status) { + set_bit(R5_UPTODATE, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + /* Note that this cannot happen on a + * replacement device. We just fail those on + * any error + */ + pr_info_ratelimited( + "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n", + mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), + (unsigned long long)s, + rdev->bdev); + atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * end read for a page in journal, this + * must be preparing for prexor in rmw + */ + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + + if (atomic_read(&rdev->read_errors)) + atomic_set(&rdev->read_errors, 0); + } else { + int retry = 0; + int set_bad = 0; + + clear_bit(R5_UPTODATE, &sh->dev[i].flags); + if (!(bi->bi_status == BLK_STS_PROTECTION)) + atomic_inc(&rdev->read_errors); + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + pr_warn_ratelimited( + "md/raid:%s: read error on replacement device (sector %llu on %pg).\n", + mdname(conf->mddev), + (unsigned long long)s, + rdev->bdev); + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; + pr_warn_ratelimited( + "md/raid:%s: read error not correctable (sector %llu on %pg).\n", + mdname(conf->mddev), + (unsigned long long)s, + rdev->bdev); + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { + /* Oh, no!!! */ + set_bad = 1; + pr_warn_ratelimited( + "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n", + mdname(conf->mddev), + (unsigned long long)s, + rdev->bdev); + } else if (atomic_read(&rdev->read_errors) + > conf->max_nr_stripes) { + if (!test_bit(Faulty, &rdev->flags)) { + pr_warn("md/raid:%s: %d read_errors > %d stripes\n", + mdname(conf->mddev), + atomic_read(&rdev->read_errors), + conf->max_nr_stripes); + pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n", + mdname(conf->mddev), rdev->bdev); + } + } else + retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; + if (retry) + if (sh->qd_idx >= 0 && sh->pd_idx == i) + set_bit(R5_ReadError, &sh->dev[i].flags); + else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + set_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + } else + set_bit(R5_ReadNoMerge, &sh->dev[i].flags); + else { + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) + md_error(conf->mddev, rdev); + } + } + rdev_dec_pending(rdev, conf->mddev); + bio_uninit(bi); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); +} + +static void raid5_end_write_request(struct bio *bi) +{ + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks, i; + struct md_rdev *rdev; + sector_t first_bad; + int bad_sectors; + int replacement = 0; + + for (i = 0 ; i < disks; i++) { + if (bi == &sh->dev[i].req) { + rdev = rdev_pend_deref(conf->disks[i].rdev); + break; + } + if (bi == &sh->dev[i].rreq) { + rdev = rdev_pend_deref(conf->disks[i].replacement); + if (rdev) + replacement = 1; + else + /* rdev was removed and 'replacement' + * replaced it. rdev is not removed + * until all requests are finished. + */ + rdev = rdev_pend_deref(conf->disks[i].rdev); + break; + } + } + pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", + (unsigned long long)sh->sector, i, atomic_read(&sh->count), + bi->bi_status); + if (i == disks) { + BUG(); + return; + } + + if (replacement) { + if (bi->bi_status) + md_error(conf->mddev, rdev); + else if (is_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), + &first_bad, &bad_sectors)) + set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); + } else { + if (bi->bi_status) { + set_bit(STRIPE_DEGRADED, &sh->state); + set_bit(WriteErrorSeen, &rdev->flags); + set_bit(R5_WriteError, &sh->dev[i].flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } else if (is_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), + &first_bad, &bad_sectors)) { + set_bit(R5_MadeGood, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) + /* That was a successful write so make + * sure it looks like we already did + * a re-write. + */ + set_bit(R5_ReWrite, &sh->dev[i].flags); + } + } + rdev_dec_pending(rdev, conf->mddev); + + if (sh->batch_head && bi->bi_status && !replacement) + set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); + + bio_uninit(bi); + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + + if (sh->batch_head && sh != sh->batch_head) + raid5_release_stripe(sh->batch_head); + raid5_release_stripe(sh); +} + +static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + unsigned long flags; + pr_debug("raid456: error called\n"); + + pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n", + mdname(mddev), rdev->bdev); + + spin_lock_irqsave(&conf->device_lock, flags); + set_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + mddev->degraded = raid5_calc_degraded(conf); + + if (has_failed(conf)) { + set_bit(MD_BROKEN, &conf->mddev->flags); + conf->recovery_disabled = mddev->recovery_disabled; + + pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + } else { + pr_crit("md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), conf->raid_disks - mddev->degraded); + } + + spin_unlock_irqrestore(&conf->device_lock, flags); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + + set_bit(Blocked, &rdev->flags); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + r5c_update_on_rdev_error(mddev, rdev); +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. + */ +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) +{ + sector_t stripe, stripe2; + sector_t chunk_number; + unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; + sector_t new_sector; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int raid_disks = previous ? conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; + + /* First compute the information on this sector */ + + /* + * Compute the chunk number and the sector offset inside the chunk + */ + chunk_offset = sector_div(r_sector, sectors_per_chunk); + chunk_number = r_sector; + + /* + * Compute the stripe number + */ + stripe = chunk_number; + *dd_idx = sector_div(stripe, data_disks); + stripe2 = stripe; + /* + * Select the parity disk based on the user selected algorithm. + */ + pd_idx = qd_idx = -1; + switch(conf->level) { + case 4: + pd_idx = data_disks; + break; + case 5: + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd_idx = data_disks - sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_LEFT_SYMMETRIC: + pd_idx = data_disks - sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + break; + default: + BUG(); + } + break; + case 6: + + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; + break; + + case ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but or + * of blocks for computing Q is different. + */ + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + stripe2 += 1; + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + default: + BUG(); + } + break; + } + + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; + } + /* + * Finally, compute the new sector number + */ + new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; + return new_sector; +} + +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) +{ + struct r5conf *conf = sh->raid_conf; + int raid_disks = sh->disks; + int data_disks = raid_disks - conf->max_degraded; + sector_t new_sector = sh->sector, check; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + sector_t stripe; + int chunk_offset; + sector_t chunk_number; + int dummy1, dd_idx = i; + sector_t r_sector; + struct stripe_head sh2; + + chunk_offset = sector_div(new_sector, sectors_per_chunk); + stripe = new_sector; + + if (i == sh->pd_idx) + return 0; + switch(conf->level) { + case 4: break; + case 5: + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; + default: + BUG(); + } + break; + case 6: + if (i == sh->qd_idx) + return 0; /* It is the Q disk */ + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else if (i > sh->pd_idx) + i -= 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else { + /* D D P Q D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 2); + } + break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + /* Like left_symmetric, but P is before Q */ + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else { + /* D D Q P D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + } + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; + default: + BUG(); + } + break; + } + + chunk_number = stripe * data_disks + i; + r_sector = chunk_number * sectors_per_chunk + chunk_offset; + + check = raid5_compute_sector(conf, r_sector, + previous, &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { + pr_warn("md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); + return 0; + } + return r_sector; +} + +/* + * There are cases where we want handle_stripe_dirtying() and + * schedule_reconstruction() to delay towrite to some dev of a stripe. + * + * This function checks whether we want to delay the towrite. Specifically, + * we delay the towrite when: + * + * 1. degraded stripe has a non-overwrite to the missing dev, AND this + * stripe has data in journal (for other devices). + * + * In this case, when reading data for the non-overwrite dev, it is + * necessary to handle complex rmw of write back cache (prexor with + * orig_page, and xor with page). To keep read path simple, we would + * like to flush data in journal to RAID disks first, so complex rmw + * is handled in the write patch (handle_stripe_dirtying). + * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * + * 3. during journal failure + * In journal failure, we try to flush all cached data to raid disks + * based on data in stripe cache. The array is read-only to upper + * layers, so we would skip all pending writes. + * + */ +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) +{ + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + /* case 3 above */ + if (s->log_failed && s->injournal) + return true; + return false; +} + +static void +schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; + + if (rcw) { + /* + * In some cases, handle_stripe_dirtying initially decided to + * run rmw and allocates extra page for prexor. However, rcw is + * cheaper later on. We need to free the extra page now, + * because we won't be able to do that in ops_complete_prexor(). + */ + r5c_release_extra_page(sh); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite && !delay_towrite(conf, dev, s)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; + } + } + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + if (!s->locked) + /* False alarm, nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + + if (s->locked + conf->max_degraded == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + BUG_ON(level == 6 && + (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i == pd_idx || i == qd_idx) + continue; + + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; + } + } + if (!s->locked) + /* False alarm - nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + } + + /* keep the parity disk(s) locked while asynchronous operations + * are in flight + */ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->locked++; + + if (level == 6) { + int qd_idx = sh->qd_idx; + struct r5dev *dev = &sh->dev[qd_idx]; + + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + + if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && + test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && + !test_bit(STRIPE_FULL_WRITE, &sh->state) && + test_bit(R5_Insync, &sh->dev[pd_idx].flags)) + set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); + + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", + __func__, (unsigned long long)sh->sector, + s->locked, s->ops_request); +} + +static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + + pr_debug("checking bi b#%llu to stripe s#%llu\n", + bi->bi_iter.bi_sector, sh->sector); + + /* Don't allow new IO added to stripes in batch list */ + if (sh->batch_head) + return true; + + if (forwrite) + bip = &sh->dev[dd_idx].towrite; + else + bip = &sh->dev[dd_idx].toread; + + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { + if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) + return true; + bip = &(*bip)->bi_next; + } + + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) + return true; + + if (forwrite && raid5_has_ppl(conf)) { + /* + * With PPL only writes to consecutive data chunks within a + * stripe are allowed because for a single stripe_head we can + * only have one PPL entry at a time, which describes one data + * range. Not really an overlap, but wait_for_overlap can be + * used to handle this. + */ + sector_t sector; + sector_t first = 0; + sector_t last = 0; + int count = 0; + int i; + + for (i = 0; i < sh->disks; i++) { + if (i != sh->pd_idx && + (i == dd_idx || sh->dev[i].towrite)) { + sector = sh->dev[i].sector; + if (count == 0 || sector < first) + first = sector; + if (sector > last) + last = sector; + count++; + } + } + + if (first + conf->chunk_sectors * (count - 1) != last) + return true; + } + + return false; +} + +static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + int firstwrite = 0; + + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (!*bip) + firstwrite = 1; + } else { + bip = &sh->dev[dd_idx].toread; + } + + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) + bip = &(*bip)->bi_next; + + if (!forwrite || previous) + clear_bit(STRIPE_BATCH_READY, &sh->state); + + BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); + if (*bip) + bi->bi_next = *bip; + *bip = bi; + bio_inc_remaining(bi); + md_write_inc(conf->mddev, bi); + + if (forwrite) { + /* check if page is covered */ + sector_t sector = sh->dev[dd_idx].sector; + for (bi=sh->dev[dd_idx].towrite; + sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && + bi && bi->bi_iter.bi_sector <= sector; + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { + if (bio_end_sector(bi) >= sector) + sector = bio_end_sector(bi); + } + if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) + if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) + sh->overwrite_disks++; + } + + pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, + sh->dev[dd_idx].sector); + + if (conf->mddev->bitmap && firstwrite) { + /* Cannot hold spinlock over bitmap_startwrite, + * but must ensure this isn't added to a batch until + * we have added to the bitmap and set bm_seq. + * So set STRIPE_BITMAP_PENDING to prevent + * batching. + * If multiple __add_stripe_bio() calls race here they + * much all set STRIPE_BITMAP_PENDING. So only the first one + * to complete "bitmap_startwrite" gets to set + * STRIPE_BIT_DELAY. This is important as once a stripe + * is added to a batch, STRIPE_BIT_DELAY cannot be changed + * any more. + */ + set_bit(STRIPE_BITMAP_PENDING, &sh->state); + spin_unlock_irq(&sh->stripe_lock); + md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); + spin_lock_irq(&sh->stripe_lock); + clear_bit(STRIPE_BITMAP_PENDING, &sh->state); + if (!sh->batch_head) { + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + } +} + +/* + * Each stripe/dev can have one or more bios attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. + */ +static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + spin_lock_irq(&sh->stripe_lock); + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&sh->stripe_lock); + return false; + } + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + spin_unlock_irq(&sh->stripe_lock); + return true; +} + +static void end_reshape(struct r5conf *conf); + +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh) +{ + int sectors_per_chunk = + previous ? conf->prev_chunk_sectors : conf->chunk_sectors; + int dd_idx; + int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? conf->previous_raid_disks : conf->raid_disks; + + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) + *sectors_per_chunk + chunk_offset, + previous, + &dd_idx, sh); +} + +static void +handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks) +{ + int i; + BUG_ON(sh->batch_head); + for (i = disks; i--; ) { + struct bio *bi; + int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + struct md_rdev *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + atomic_inc(&rdev->nr_pending); + else + rdev = NULL; + rcu_read_unlock(); + if (rdev) { + if (!rdev_set_badblocks( + rdev, + sh->sector, + RAID5_STRIPE_SECTORS(conf), 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + } + spin_lock_irq(&sh->stripe_lock); + /* fail all writes first */ + bi = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + sh->overwrite_disks = 0; + spin_unlock_irq(&sh->stripe_lock); + if (bi) + bitmap_end = 1; + + log_stripe_write_finished(sh); + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + while (bi && bi->bi_iter.bi_sector < + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); + + md_write_end(conf->mddev); + bio_io_error(bi); + bi = nextbi; + } + if (bitmap_end) + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0, 0); + bitmap_end = 0; + /* and fail all 'written' */ + bi = sh->dev[i].written; + sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + + if (bi) bitmap_end = 1; + while (bi && bi->bi_iter.bi_sector < + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); + + md_write_end(conf->mddev); + bio_io_error(bi); + bi = bi2; + } + + /* fail any reads if this device is non-operational and + * the data has not reached the cache yet. + */ + if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + s->failed > conf->max_degraded && + (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); + bi = sh->dev[i].toread; + sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + if (bi) + s->to_read--; + while (bi && bi->bi_iter.bi_sector < + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = + r5_next_bio(conf, bi, sh->dev[i].sector); + + bio_io_error(bi); + bi = nextbi; + } + } + if (bitmap_end) + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0, 0); + /* If we were in the middle of a write the parity block might + * still be locked - so just clear all R5_LOCKED flags + */ + clear_bit(R5_LOCKED, &sh->dev[i].flags); + } + s->to_write = 0; + s->written = 0; + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); +} + +static void +handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s) +{ + int abort = 0; + int i; + + BUG_ON(sh->batch_head); + clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); + s->syncing = 0; + s->replacing = 0; + /* There is nothing more to do for sync/check/repair. + * Don't even need to abort as that is handled elsewhere + * if needed, and not always wanted e.g. if there is a known + * bad block here. + * For recover/replace we need to record a bad block on all + * non-sync devices, or abort the recovery + */ + if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { + /* During recovery devices cannot be removed, so + * locking and refcounting of rdevs is not needed + */ + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0)) + abort = 1; + rdev = rcu_dereference(conf->disks[i].replacement); + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0)) + abort = 1; + } + rcu_read_unlock(); + if (abort) + conf->recovery_disabled = + conf->mddev->recovery_disabled; + } + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); +} + +static int want_replace(struct stripe_head *sh, int disk_idx) +{ + struct md_rdev *rdev; + int rv = 0; + + rcu_read_lock(); + rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && (rdev->recovery_offset <= sh->sector + || rdev->mddev->recovery_cp <= sh->sector)) + rv = 1; + rcu_read_unlock(); + return rv; +} + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], + &sh->dev[s->failed_num[1]] }; + int i; + bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); + + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads at not permitted until after short delay + * to gather multiple requests. However if this + * device is no Insync, the block could only be computed + * and there is no need to delay that. + */ + return 0; + + for (i = 0; i < s->failed && i < 2; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + + if (s->failed >= 2 && + (fdev[i]->towrite || + s->failed_num[i] == sh->pd_idx || + s->failed_num[i] == sh->qd_idx) && + !test_bit(R5_UPTODATE, &fdev[i]->flags)) + /* In max degraded raid6, If the failed disk is P, Q, + * or we want to read the failed disk, we need to do + * reconstruct-write. + */ + force_rcw = true; + } + + /* If we are forced to do a reconstruct-write, because parity + * cannot be trusted and we are currently recovering it, there + * is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (!force_rcw && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed && i < 2; i++) { + if (s->failed_num[i] != sh->pd_idx && + s->failed_num[i] != sh->qd_idx && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + + return 0; +} + +/* fetch_block - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill to continue + */ +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + + /* is the data in this block needed, and can we get it? */ + if (need_this_block(sh, s, disk_idx, disks)) { + /* we would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync + */ + BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); + BUG_ON(test_bit(R5_Wantread, &dev->flags)); + BUG_ON(sh->batch_head); + + /* + * In the raid6 case if the only non-uptodate disk is P + * then we already trusted P to compute the other failed + * drives. It is safe to compute rather than re-read P. + * In other cases we only compute blocks from failed + * devices, otherwise check/repair might fail to detect + * a real inconsistency. + */ + + if ((s->uptodate == disks - 1) && + ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || + (s->failed && (disk_idx == s->failed_num[0] || + disk_idx == s->failed_num[1])))) { + /* have disk failed, and we're requested to fetch it; + * do compute it + */ + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no 2nd target */ + s->req_compute = 1; + /* Careful: from this point on 'uptodate' is in the eye + * of raid_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. + */ + s->uptodate++; + return 1; + } else if (s->uptodate == disks-2 && s->failed >= 2) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; + } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->uptodate += 2; + s->req_compute = 1; + return 1; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + disk_idx, s->syncing); + } + } + + return 0; +} + +/* + * handle_stripe_fill - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill(struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int i; + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) { + + /* + * For degraded stripe with data in journal, do not handle + * read requests yet, instead, flush the stripe to raid + * disks first, this avoids handling complex rmw of write + * back cache (prexor with orig_page, and then xor with + * page) in the read path + */ + if (s->to_read && s->injournal && s->failed) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + goto out; + } + + for (i = disks; i--; ) + if (fetch_block(sh, s, i, disks)) + break; + } +out: + set_bit(STRIPE_HANDLE, &sh->state); +} + +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags); +/* handle_stripe_clean_event + * any written block on an uptodate or failed drive can be returned. + * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but + * never LOCKED, so we don't need to test 'failed' directly. + */ +static void handle_stripe_clean_event(struct r5conf *conf, + struct stripe_head *sh, int disks) +{ + int i; + struct r5dev *dev; + int discard_pending = 0; + struct stripe_head *head_sh = sh; + bool do_endio = false; + + for (i = disks; i--; ) + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + pr_debug("Return write for disc %d\n", i); + if (test_and_clear_bit(R5_Discard, &dev->flags)) + clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + } + do_endio = true; + +returnbi: + dev->page = dev->orig_page; + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_iter.bi_sector < + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); + md_write_end(conf->mddev); + bio_endio(wbi); + wbi = wbi2; + } + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if (sh != head_sh) { + dev = &sh->dev[i]; + goto returnbi; + } + } + sh = head_sh; + dev = &sh->dev[i]; + } else if (test_bit(R5_Discard, &dev->flags)) + discard_pending = 1; + } + + log_stripe_write_finished(sh); + + if (!discard_pending && + test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + int hash; + clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + if (sh->qd_idx >= 0) { + clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); + } + /* now that discard is done we can proceed with any sync */ + clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ +unhash: + hash = sh->hash_lock_index; + spin_lock_irq(conf->hash_locks + hash); + remove_hash(sh); + spin_unlock_irq(conf->hash_locks + hash); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + if (sh != head_sh) + goto unhash; + } + sh = head_sh; + + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); + + } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); + + if (head_sh->batch_head && do_endio) + break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); +} + +/* + * For RMW in write back cache, we need extra page in prexor to store the + * old data. This page is stored in dev->orig_page. + * + * This function checks whether we have data for prexor. The exact logic + * is: + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) + */ +static inline bool uptodate_for_rmw(struct r5dev *dev) +{ + return (test_bit(R5_UPTODATE, &dev->flags)) && + (!test_bit(R5_InJournal, &dev->flags) || + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); +} + +static int handle_stripe_dirtying(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int rmw = 0, rcw = 0, i; + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* Check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. + */ + if (conf->rmw_level == PARITY_DISABLE_RMW || + (recovery_cp < MaxSector && sh->sector >= recovery_cp && + s->failed == 0)) { + /* Calculate the real rcw later - for now make it + * look like rcw is cheaper + */ + rcw = 1; rmw = 2; + pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); + } else for (i = disks; i--; ) { + /* would I have to read this buffer for read_modify_write */ + struct r5dev *dev = &sh->dev[i]; + if (((dev->towrite && !delay_towrite(conf, dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && + !test_bit(R5_LOCKED, &dev->flags) && + !(uptodate_for_rmw(dev) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) + rmw++; + else + rmw += 2*disks; /* cannot read it */ + } + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) + rcw++; + else + rcw += 2*disks; + } + } + + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); + set_bit(STRIPE_HANDLE, &sh->state); + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { + /* prefer read-modify-write, but need to get some data */ + if (conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, + "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_InJournal, &dev->flags) && + dev->page == dev->orig_page && + !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { + /* alloc page for prexor */ + struct page *p = alloc_page(GFP_NOIO); + + if (p) { + dev->orig_page = p; + continue; + } + + /* + * alloc_page() failed, try use + * disk_info->extra_page + */ + if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, + &conf->cache_state)) { + r5c_use_extra_page(sh); + break; + } + + /* extra_page in use, add to delayed_list */ + set_bit(STRIPE_DELAYED, &sh->state); + s->waiting_extra_page = 1; + return -EAGAIN; + } + } + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (((dev->towrite && !delay_towrite(conf, dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && + !test_bit(R5_LOCKED, &dev->flags) && + !(uptodate_for_rmw(dev) || + test_bit(R5_Wantcompute, &dev->flags)) && + test_bit(R5_Insync, &dev->flags)) { + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else + set_bit(STRIPE_DELAYED, &sh->state); + } + } + } + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { + /* want reconstruct write, but need to get some data */ + int qread =0; + rcw = 0; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block " + "%d for Reconstruct\n", i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + qread++; + } else + set_bit(STRIPE_DELAYED, &sh->state); + } + } + if (rcw && conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, + rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); + } + + if (rcw > disks && rmw > disks && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + set_bit(STRIPE_DELAYED, &sh->state); + + /* now if nothing is locked, and if we have enough data, + * we can start a write request + */ + /* since handle_stripe can be called at any time we need to handle the + * case where a compute block operation has been submitted and then a + * subsequent call wants to start a write request. raid_run_ops only + * handles the case where compute block and reconstruct are requested + * simultaneously. If this is not the case then new writes need to be + * held off until the compute completes. + */ + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction(sh, s, rcw == 0, 0); + return 0; +} + +static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks) +{ + struct r5dev *dev = NULL; + + BUG_ON(sh->batch_head); + set_bit(STRIPE_HANDLE, &sh->state); + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ + if (s->failed == 0) { + BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + s->uptodate--; + break; + } + dev = &sh->dev[s->failed_num[0]]; + fallthrough; + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + + /* either failed parity check, or recovery is happening */ + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + BUG_ON(s->uptodate != disks); + + set_bit(R5_LOCKED, &dev->flags); + s->locked++; + set_bit(R5_Wantwrite, &dev->flags); + + clear_bit(STRIPE_DEGRADED, &sh->state); + set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + pr_warn_ratelimited("%s: mismatch sector in range " + "%llu-%llu\n", mdname(conf->mddev), + (unsigned long long) sh->sector, + (unsigned long long) sh->sector + + RAID5_STRIPE_SECTORS(conf)); + } else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + sh->ops.target2 = -1; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + pr_err("%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); + } +} + +static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct r5dev *dev; + + BUG_ON(sh->batch_head); + set_bit(STRIPE_HANDLE, &sh->state); + + BUG_ON(s->failed > 2); + + /* Want to check and possibly repair P and Q. + * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are < 2 failures */ + if (s->failed == s->q_failed) { + /* The only possible failed device holds Q, so it + * makes sense to check P (If anything else were failed, + * we would have used P to recreate it). + */ + sh->check_state = check_state_run; + } + if (!s->q_failed && s->failed < 2) { + /* Q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + if (sh->check_state == check_state_run) + sh->check_state = check_state_run_pq; + else + sh->check_state = check_state_run_q; + } + + /* discard potentially stale zero_sum_result */ + sh->ops.zero_sum_result = 0; + + if (sh->check_state == check_state_run) { + /* async_xor_zero_sum destroys the contents of P */ + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->uptodate--; + } + if (sh->check_state >= check_state_run && + sh->check_state <= check_state_run_pq) { + /* async_syndrome_zero_sum preserves P and Q, so + * no need to mark them !uptodate here + */ + set_bit(STRIPE_OP_CHECK, &s->ops_request); + break; + } + + /* we have 2-disk failure */ + BUG_ON(s->failed != 2); + fallthrough; + case check_state_compute_result: + sh->check_state = check_state_idle; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + + /* now write out any block on a failed drive, + * or P or Q if they were recomputed + */ + dev = NULL; + if (s->failed == 2) { + dev = &sh->dev[s->failed_num[1]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (s->failed >= 1) { + dev = &sh->dev[s->failed_num[0]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + dev = &sh->dev[pd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + dev = &sh->dev[qd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), + "%s: disk%td not up to date\n", + mdname(conf->mddev), + dev - (struct r5dev *) &sh->dev)) { + clear_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_Wantwrite, &dev->flags); + s->locked--; + } + clear_bit(STRIPE_DEGRADED, &sh->state); + + set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + case check_state_run_q: + case check_state_run_pq: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) { + /* both parities are correct */ + if (!s->failed) + set_bit(STRIPE_INSYNC, &sh->state); + else { + /* in contrast to the raid5 case we can validate + * parity, but still have a failure to write + * back + */ + sh->check_state = check_state_compute_result; + /* Returning at this point means that we may go + * off and bring p and/or q uptodate again so + * we make sure to check zero_sum_result again + * to verify if p or q need writeback + */ + } + } else { + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + pr_warn_ratelimited("%s: mismatch sector in range " + "%llu-%llu\n", mdname(conf->mddev), + (unsigned long long) sh->sector, + (unsigned long long) sh->sector + + RAID5_STRIPE_SECTORS(conf)); + } else { + int *target = &sh->ops.target; + + sh->ops.target = -1; + sh->ops.target2 = -1; + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[pd_idx].flags); + *target = pd_idx; + target = &sh->ops.target2; + s->uptodate++; + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[qd_idx].flags); + *target = qd_idx; + s->uptodate++; + } + } + } + break; + case check_state_compute_run: + break; + default: + pr_warn("%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); + } +} + +static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) +{ + int i; + + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + struct dma_async_tx_descriptor *tx = NULL; + BUG_ON(sh->batch_head); + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i = 0; i < sh->disks; i++) + if (i != sh->pd_idx && i != sh->qd_idx) { + int dd_idx, j; + struct stripe_head *sh2; + struct async_submit_ctl submit; + + sector_t bn = raid5_compute_blocknr(sh, i, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = raid5_get_active_stripe(conf, NULL, s, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); + if (sh2 == NULL) + /* so far only the early blocks of this stripe + * have been requested. When later blocks + * get requested, we will try again + */ + continue; + if (!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { + /* must have already done this block */ + raid5_release_stripe(sh2); + continue; + } + + /* place all the copies on one channel */ + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); + tx = async_memcpy(sh2->dev[dd_idx].page, + sh->dev[i].page, sh2->dev[dd_idx].offset, + sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), + &submit); + + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); + for (j = 0; j < conf->raid_disks; j++) + if (j != sh2->pd_idx && + j != sh2->qd_idx && + !test_bit(R5_Expanded, &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + raid5_release_stripe(sh2); + + } + /* done submitting copies, wait for them to complete */ + async_tx_quiesce(&tx); +} + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. + * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + */ + +static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks; + struct r5dev *dev; + int i; + int do_recovery = 0; + + memset(s, 0, sizeof(*s)); + + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; + s->failed_num[0] = -1; + s->failed_num[1] = -1; + s->log_failed = r5l_log_disk_error(conf); + + /* Now to look around and see what can be done */ + rcu_read_lock(); + for (i=disks; i--; ) { + struct md_rdev *rdev; + sector_t first_bad; + int bad_sectors; + int is_bad = 0; + + dev = &sh->dev[i]; + + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, + dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read + * + * new wantfill requests are only permitted while + * ops_complete_biofill is guaranteed to be inactive + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) + set_bit(R5_Wantfill, &dev->flags); + + /* now count some things */ + if (test_bit(R5_LOCKED, &dev->flags)) + s->locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) + s->uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) { + s->compute++; + BUG_ON(s->compute > 2); + } + + if (test_bit(R5_Wantfill, &dev->flags)) + s->to_fill++; + else if (dev->toread) + s->to_read++; + if (dev->towrite) { + s->to_write++; + if (!test_bit(R5_OVERWRITE, &dev->flags)) + s->non_overwrite++; + } + if (dev->written) + s->written++; + /* Prefer to use the replacement for reads, but only + * if it is recovered enough and has no bad blocks. + */ + rdev = rcu_dereference(conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && + !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), + &first_bad, &bad_sectors)) + set_bit(R5_ReadRepl, &dev->flags); + else { + if (rdev && !test_bit(Faulty, &rdev->flags)) + set_bit(R5_NeedReplace, &dev->flags); + else + clear_bit(R5_NeedReplace, &dev->flags); + rdev = rcu_dereference(conf->disks[i].rdev); + clear_bit(R5_ReadRepl, &dev->flags); + } + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) { + is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), + &first_bad, &bad_sectors); + if (s->blocked_rdev == NULL + && (test_bit(Blocked, &rdev->flags) + || is_bad < 0)) { + if (is_bad < 0) + set_bit(BlockedBadBlocks, + &rdev->flags); + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } + } + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (is_bad) { + /* also not in-sync */ + if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { + /* treat as in-sync, but with a read error + * which we can now try to correct + */ + set_bit(R5_Insync, &dev->flags); + set_bit(R5_ReadError, &dev->flags); + } + } else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) + /* in sync if before recovery_offset */ + set_bit(R5_Insync, &dev->flags); + else if (test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Expanded, &dev->flags)) + /* If we've reshaped into here, we assume it is Insync. + * We will shortly update recovery_offset to make + * it official. + */ + set_bit(R5_Insync, &dev->flags); + + if (test_bit(R5_WriteError, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 == rdev) + clear_bit(R5_Insync, &dev->flags); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_WriteError, &dev->flags); + } + if (test_bit(R5_MadeGood, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGood, &dev->flags); + } + if (test_bit(R5_MadeGoodRepl, &dev->flags)) { + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].replacement); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGoodRepl, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { + if (s->failed < 2) + s->failed_num[s->failed] = i; + s->failed++; + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + else if (!rdev) { + rdev = rcu_dereference( + conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } + } + + if (test_bit(R5_InJournal, &dev->flags)) + s->injournal++; + if (test_bit(R5_InJournal, &dev->flags) && dev->written) + s->just_cached++; + } + if (test_bit(STRIPE_SYNCING, &sh->state)) { + /* If there is a failed device being replaced, + * we must be recovering. + * else if we are after recovery_cp, we must be syncing + * else if MD_RECOVERY_REQUESTED is set, we also are syncing. + * else we can only be replacing + * sync and recovery both need to read all devices, and so + * use the same flag. + */ + if (do_recovery || + sh->sector >= conf->mddev->recovery_cp || + test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) + s->syncing = 1; + else + s->replacing = 1; + } + rcu_read_unlock(); +} + +/* + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or + * a head which can now be handled. + */ +static int clear_batch_ready(struct stripe_head *sh) +{ + struct stripe_head *tmp; + if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) + return (sh->batch_head && sh->batch_head != sh); + spin_lock(&sh->stripe_lock); + if (!sh->batch_head) { + spin_unlock(&sh->stripe_lock); + return 0; + } + + /* + * this stripe could be added to a batch list before we check + * BATCH_READY, skips it + */ + if (sh->batch_head != sh) { + spin_unlock(&sh->stripe_lock); + return 1; + } + spin_lock(&sh->batch_lock); + list_for_each_entry(tmp, &sh->batch_list, batch_list) + clear_bit(STRIPE_BATCH_READY, &tmp->state); + spin_unlock(&sh->batch_lock); + spin_unlock(&sh->stripe_lock); + + /* + * BATCH_READY is cleared, no new stripes can be added. + * batch_list can be accessed without lock + */ + return 0; +} + +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags) +{ + struct stripe_head *sh, *next; + int i; + int do_wakeup = 0; + + list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { + + list_del_init(&sh->batch_list); + + WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | + (1 << STRIPE_SYNCING) | + (1 << STRIPE_REPLACED) | + (1 << STRIPE_DELAYED) | + (1 << STRIPE_BIT_DELAY) | + (1 << STRIPE_FULL_WRITE) | + (1 << STRIPE_BIOFILL_RUN) | + (1 << STRIPE_COMPUTE_RUN) | + (1 << STRIPE_DISCARD) | + (1 << STRIPE_BATCH_READY) | + (1 << STRIPE_BATCH_ERR) | + (1 << STRIPE_BITMAP_PENDING)), + "stripe state: %lx\n", sh->state); + WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | + (1 << STRIPE_REPLACED)), + "head stripe state: %lx\n", head_sh->state); + + set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), + head_sh->state & (1 << STRIPE_INSYNC)); + + sh->check_state = head_sh->check_state; + sh->reconstruct_state = head_sh->reconstruct_state; + spin_lock_irq(&sh->stripe_lock); + sh->batch_head = NULL; + spin_unlock_irq(&sh->stripe_lock); + for (i = 0; i < sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + do_wakeup = 1; + sh->dev[i].flags = head_sh->dev[i].flags & + (~((1 << R5_WriteError) | (1 << R5_Overlap))); + } + if (handle_flags == 0 || + sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + spin_lock_irq(&head_sh->stripe_lock); + head_sh->batch_head = NULL; + spin_unlock_irq(&head_sh->stripe_lock); + for (i = 0; i < head_sh->disks; i++) + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) + do_wakeup = 1; + if (head_sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &head_sh->state); + + if (do_wakeup) + wake_up(&head_sh->raid_conf->wait_for_overlap); +} + +static void handle_stripe(struct stripe_head *sh) +{ + struct stripe_head_state s; + struct r5conf *conf = sh->raid_conf; + int i; + int prexor; + int disks = sh->disks; + struct r5dev *pdev, *qdev; + + clear_bit(STRIPE_HANDLE, &sh->state); + + /* + * handle_stripe should not continue handle the batched stripe, only + * the head of batch list or lone stripe can continue. Otherwise we + * could see break_stripe_batch_list warns about the STRIPE_ACTIVE + * is set for the batched stripe. + */ + if (clear_batch_ready(sh)) + return; + + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { + /* already being handled, ensure it gets handled + * again when current action finishes */ + set_bit(STRIPE_HANDLE, &sh->state); + return; + } + + if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) + break_stripe_batch_list(sh, 0); + + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { + spin_lock(&sh->stripe_lock); + /* + * Cannot process 'sync' concurrently with 'discard'. + * Flush data in r5cache before 'sync'. + */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && + !test_bit(STRIPE_DISCARD, &sh->state) && + test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + clear_bit(STRIPE_REPLACED, &sh->state); + } + spin_unlock(&sh->stripe_lock); + } + clear_bit(STRIPE_DELAYED, &sh->state); + + pr_debug("handling stripe %llu, state=%#lx cnt=%d, " + "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", + (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, + sh->check_state, sh->reconstruct_state); + + analyse_stripe(sh, &s); + + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + goto finish; + + if (s.handle_bad_blocks || + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; + } + + if (unlikely(s.blocked_rdev)) { + if (s.syncing || s.expanding || s.expanded || + s.replacing || s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(s.blocked_rdev, conf->mddev); + s.blocked_rdev = NULL; + } + + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } + + pr_debug("locked=%d uptodate=%d to_read=%d" + " to_write=%d failed=%d failed_num=%d,%d\n", + s.locked, s.uptodate, s.to_read, s.to_write, s.failed, + s.failed_num[0], s.failed_num[1]); + /* + * check if the array has lost more than max_degraded devices and, + * if so, some requests might need to be failed. + * + * When journal device failed (log_failed), we will only process + * the stripe if there is data need write to raid disks + */ + if (s.failed > conf->max_degraded || + (s.log_failed && s.injournal == 0)) { + sh->check_state = 0; + sh->reconstruct_state = 0; + break_stripe_batch_list(sh, 0); + if (s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks); + if (s.syncing + s.replacing) + handle_failed_sync(conf, sh, &s); + } + + /* Now we check to see if any write operations have recently + * completed + */ + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; + + /* All the 'written' buffers and the parity block are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); + BUG_ON(sh->qd_idx >= 0 && + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || i == sh->qd_idx || + dev->written || test_bit(R5_InJournal, + &dev->flags))) { + pr_debug("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (prexor) + continue; + if (s.failed > 1) + continue; + if (!test_bit(R5_Insync, &dev->flags) || + ((i == sh->pd_idx || i == sh->qd_idx) && + s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + s.dec_preread_active = 1; + } + + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks); + + if (s.just_cached) + r5c_handle_cached_data_endio(conf, sh, disks); + log_stripe_write_finished(sh); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. + */ + if (s.to_read || s.non_overwrite + || (s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + + /* + * When the stripe finishes full journal write cycle (write to journal + * and raid disk), this is the clean up procedure so it is ready for + * next operation. + */ + r5c_finish_stripe_write_out(conf, sh, &s); + + /* + * Now to consider new write requests, cache write back and what else, + * if anything should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + * 3/ A r5c cache log write is in flight. + */ + + if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { + if (!r5c_is_writeback(conf->log)) { + if (s.to_write) + handle_stripe_dirtying(conf, sh, &s, disks); + } else { /* write back cache */ + int ret = 0; + + /* First, try handle writes in caching phase */ + if (s.to_write) + ret = r5c_try_caching_write(conf, sh, &s, + disks); + /* + * If caching phase failed: ret == -EAGAIN + * OR + * stripe under reclaim: !caching && injournal + * + * fall back to handle_stripe_dirtying() + */ + if (ret == -EAGAIN || + /* stripe under reclaim: !caching && injournal */ + (!test_bit(STRIPE_R5C_CACHING, &sh->state) && + s.injournal > 0)) { + ret = handle_stripe_dirtying(conf, sh, &s, + disks); + if (ret == -EAGAIN) + goto finish; + } + } + } + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough + * data is available. The parity check is held off while parity + * dependent operations are in flight. + */ + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) { + if (conf->level == 6) + handle_parity_checks6(conf, sh, &s, disks); + else + handle_parity_checks5(conf, sh, &s, disks); + } + + if ((s.replacing || s.syncing) && s.locked == 0 + && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) + && !test_bit(STRIPE_REPLACED, &sh->state)) { + /* Write out to replacement devices where possible */ + for (i = 0; i < conf->raid_disks; i++) + if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); + set_bit(R5_WantReplace, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + if (s.replacing) + set_bit(STRIPE_INSYNC, &sh->state); + set_bit(STRIPE_REPLACED, &sh->state); + } + if ((s.syncing || s.replacing) && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); + } + + /* If the failed drives are just a ReadError, then we might need + * to progress the repair/check process + */ + if (s.failed <= conf->max_degraded && !conf->mddev->ro) + for (i = 0; i < s.failed; i++) { + struct r5dev *dev = &sh->dev[s.failed_num[i]]; + if (test_bit(R5_ReadError, &dev->flags) + && !test_bit(R5_LOCKED, &dev->flags) + && test_bit(R5_UPTODATE, &dev->flags) + ) { + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + } else + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; + } + } + + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh_src + = raid5_get_active_stripe(conf, NULL, sh->sector, + R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | + R5_GAS_NOQUIESCE); + if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { + /* sh cannot be written until sh_src has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh_src->state)) + atomic_inc(&conf->preread_active_stripes); + raid5_release_stripe(sh_src); + goto finish; + } + if (sh_src) + raid5_release_stripe(sh_src); + + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + stripe_set_idx(sh->sector, conf, 0, sh); + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + } + + if (s.expanding && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) + handle_stripe_expansion(conf, sh); + +finish: + /* wait for this device to become unblocked */ + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here. + */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } + + if (s.handle_bad_blocks) + for (i = disks; i--; ) { + struct md_rdev *rdev; + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_WriteError, &dev->flags)) { + /* We own a safe reference to the rdev */ + rdev = rdev_pend_deref(conf->disks[i].rdev); + if (!rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { + rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev_clear_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { + rdev = rdev_pend_deref(conf->disks[i].replacement); + if (!rdev) + /* rdev have been moved down */ + rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev_clear_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); + rdev_dec_pending(rdev, conf->mddev); + } + } + + if (s.ops_request) + raid_run_ops(sh, s.ops_request); + + ops_run_io(sh, &s); + + if (s.dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a flush, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + + clear_bit_unlock(STRIPE_ACTIVE, &sh->state); +} + +static void raid5_activate_delayed(struct r5conf *conf) + __must_hold(&conf->device_lock) +{ + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { + while (!list_empty(&conf->delayed_list)) { + struct list_head *l = conf->delayed_list.next; + struct stripe_head *sh; + sh = list_entry(l, struct stripe_head, lru); + list_del_init(l); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->hold_list); + raid5_wakeup_stripe_thread(sh); + } + } +} + +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) +{ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; + list_del_init(&sh->lru); + atomic_inc(&sh->count); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); + } +} + +static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) +{ + struct r5conf *conf = mddev->private; + sector_t sector = bio->bi_iter.bi_sector; + unsigned int chunk_sectors; + unsigned int bio_sectors = bio_sectors(bio); + + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); + return chunk_sectors >= + ((sector & (chunk_sectors - 1)) + bio_sectors); +} + +/* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) +{ + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + bi->bi_next = conf->retry_read_aligned_list; + conf->retry_read_aligned_list = bi; + + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(conf->mddev->thread); +} + +static struct bio *remove_bio_from_retry(struct r5conf *conf, + unsigned int *offset) +{ + struct bio *bi; + + bi = conf->retry_read_aligned; + if (bi) { + *offset = conf->retry_read_offset; + conf->retry_read_aligned = NULL; + return bi; + } + bi = conf->retry_read_aligned_list; + if(bi) { + conf->retry_read_aligned_list = bi->bi_next; + bi->bi_next = NULL; + *offset = 0; + } + + return bi; +} + +/* + * The "raid5_align_endio" should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +static void raid5_align_endio(struct bio *bi) +{ + struct md_io_acct *md_io_acct = bi->bi_private; + struct bio *raid_bi = md_io_acct->orig_bio; + struct mddev *mddev; + struct r5conf *conf; + struct md_rdev *rdev; + blk_status_t error = bi->bi_status; + unsigned long start_time = md_io_acct->start_time; + + bio_put(bi); + + rdev = (void*)raid_bi->bi_next; + raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; + + rdev_dec_pending(rdev, conf->mddev); + + if (!error) { + if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) + bio_end_io_acct(raid_bi, start_time); + bio_endio(raid_bi); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_quiescent); + return; + } + + pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); + + add_bio_to_retry(raid_bi, conf); +} + +static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) +{ + struct r5conf *conf = mddev->private; + struct bio *align_bio; + struct md_rdev *rdev; + sector_t sector, end_sector, first_bad; + int bad_sectors, dd_idx; + struct md_io_acct *md_io_acct; + bool did_inc; + + if (!in_chunk_boundary(mddev, raid_bio)) { + pr_debug("%s: non aligned\n", __func__); + return 0; + } + + sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, + &dd_idx, NULL); + end_sector = sector + bio_sectors(raid_bio); + + rcu_read_lock(); + if (r5c_big_stripe_cached(conf, sector)) + goto out_rcu_unlock; + + rdev = rcu_dereference(conf->disks[dd_idx].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev->recovery_offset < end_sector) { + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (!rdev) + goto out_rcu_unlock; + if (test_bit(Faulty, &rdev->flags) || + !(test_bit(In_sync, &rdev->flags) || + rdev->recovery_offset >= end_sector)) + goto out_rcu_unlock; + } + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, + &bad_sectors)) { + rdev_dec_pending(rdev, mddev); + return 0; + } + + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, + &mddev->io_acct_set); + md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + raid_bio->bi_next = (void *)rdev; + if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) + md_io_acct->start_time = bio_start_io_acct(raid_bio); + md_io_acct->orig_bio = raid_bio; + + align_bio->bi_end_io = raid5_align_endio; + align_bio->bi_private = md_io_acct; + align_bio->bi_iter.bi_sector = sector; + + /* No reshape active, so we can trust rdev->data_offset */ + align_bio->bi_iter.bi_sector += rdev->data_offset; + + did_inc = false; + if (conf->quiesce == 0) { + atomic_inc(&conf->active_aligned_reads); + did_inc = true; + } + /* need a memory barrier to detect the race with raid5_quiesce() */ + if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) { + /* quiesce is in progress, so we need to undo io activation and wait + * for it to finish + */ + if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_quiescent); + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + } + + if (mddev->gendisk) + trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); + submit_bio_noacct(align_bio); + return 1; + +out_rcu_unlock: + rcu_read_unlock(); + return 0; +} + +static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) +{ + struct bio *split; + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + + if (sectors < bio_sectors(raid_bio)) { + struct r5conf *conf = mddev->private; + split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); + bio_chain(split, raid_bio); + submit_bio_noacct(raid_bio); + raid_bio = split; + } + + if (!raid5_read_one_chunk(mddev, raid_bio)) + return raid_bio; + + return NULL; +} + +/* __get_priority_stripe - get the next stripe to process + * + * Full stripe writes are allowed to pass preread active stripes up until + * the bypass_threshold is exceeded. In general the bypass_count + * increments when the handle_list is handled before the hold_list; however, it + * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a + * stripe with in flight i/o. The bypass_count will be reset when the + * head of the hold_list has changed, i.e. the head was promoted to the + * handle_list. + */ +static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) + __must_hold(&conf->device_lock) +{ + struct stripe_head *sh, *tmp; + struct list_head *handle_list = NULL; + struct r5worker_group *wg; + bool second_try = !r5c_is_writeback(conf->log) && + !r5l_log_disk_error(conf); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || + r5l_log_disk_error(conf); + +again: + wg = NULL; + sh = NULL; + if (conf->worker_cnt_per_group == 0) { + handle_list = try_loprio ? &conf->loprio_list : + &conf->handle_list; + } else if (group != ANY_GROUP) { + handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : + &conf->worker_groups[group].handle_list; + wg = &conf->worker_groups[group]; + } else { + int i; + for (i = 0; i < conf->group_cnt; i++) { + handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : + &conf->worker_groups[i].handle_list; + wg = &conf->worker_groups[i]; + if (!list_empty(handle_list)) + break; + } + } + + pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", + __func__, + list_empty(handle_list) ? "empty" : "busy", + list_empty(&conf->hold_list) ? "empty" : "busy", + atomic_read(&conf->pending_full_writes), conf->bypass_count); + + if (!list_empty(handle_list)) { + sh = list_entry(handle_list->next, typeof(*sh), lru); + + if (list_empty(&conf->hold_list)) + conf->bypass_count = 0; + else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { + if (conf->hold_list.next == conf->last_hold) + conf->bypass_count++; + else { + conf->last_hold = conf->hold_list.next; + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + } else if (!list_empty(&conf->hold_list) && + ((conf->bypass_threshold && + conf->bypass_count > conf->bypass_threshold) || + atomic_read(&conf->pending_full_writes) == 0)) { + + list_for_each_entry(tmp, &conf->hold_list, lru) { + if (conf->worker_cnt_per_group == 0 || + group == ANY_GROUP || + !cpu_online(tmp->cpu) || + cpu_to_group(tmp->cpu) == group) { + sh = tmp; + break; + } + } + + if (sh) { + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + wg = NULL; + } + + if (!sh) { + if (second_try) + return NULL; + second_try = true; + try_loprio = !try_loprio; + goto again; + } + + if (wg) { + wg->stripes_cnt--; + sh->group = NULL; + } + list_del_init(&sh->lru); + BUG_ON(atomic_inc_return(&sh->count) != 1); + return sh; +} + +struct raid5_plug_cb { + struct blk_plug_cb cb; + struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; +}; + +static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) +{ + struct raid5_plug_cb *cb = container_of( + blk_cb, struct raid5_plug_cb, cb); + struct stripe_head *sh; + struct mddev *mddev = cb->cb.data; + struct r5conf *conf = mddev->private; + int cnt = 0; + int hash; + + if (cb->list.next && !list_empty(&cb->list)) { + spin_lock_irq(&conf->device_lock); + while (!list_empty(&cb->list)) { + sh = list_first_entry(&cb->list, struct stripe_head, lru); + list_del_init(&sh->lru); + /* + * avoid race release_stripe_plug() sees + * STRIPE_ON_UNPLUG_LIST clear but the stripe + * is still in our list + */ + smp_mb__before_atomic(); + clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + /* + * STRIPE_ON_RELEASE_LIST could be set here. In that + * case, the count is always > 1 here + */ + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); + cnt++; + } + spin_unlock_irq(&conf->device_lock); + } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + if (mddev->queue) + trace_block_unplug(mddev->queue, cnt, !from_schedule); + kfree(cb); +} + +static void release_stripe_plug(struct mddev *mddev, + struct stripe_head *sh) +{ + struct blk_plug_cb *blk_cb = blk_check_plugged( + raid5_unplug, mddev, + sizeof(struct raid5_plug_cb)); + struct raid5_plug_cb *cb; + + if (!blk_cb) { + raid5_release_stripe(sh); + return; + } + + cb = container_of(blk_cb, struct raid5_plug_cb, cb); + + if (cb->list.next == NULL) { + int i; + INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } + + if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) + list_add_tail(&sh->lru, &cb->list); + else + raid5_release_stripe(sh); +} + +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int stripe_sectors; + + /* We need to handle this when io_uring supports discard/trim */ + if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT)) + return; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + last_sector = bio_end_sector(bi); + + bi->bi_next = NULL; + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += RAID5_STRIPE_SECTORS(conf)) { + DEFINE_WAIT(w); + int d; + again: + sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + raid5_release_stripe(sh); + schedule(); + goto again; + } + clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + raid5_release_stripe(sh); + schedule(); + goto again; + } + } + set_bit(STRIPE_DISCARD, &sh->state); + finish_wait(&conf->wait_for_overlap, &w); + sh->overwrite_disks = 0; + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + bio_inc_remaining(bi); + md_write_inc(mddev, bi); + sh->overwrite_disks++; + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + md_bitmap_startwrite(mddev->bitmap, + sh->sector, + RAID5_STRIPE_SECTORS(conf), + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + bio_endio(bi); +} + +static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, + sector_t reshape_sector) +{ + return mddev->reshape_backwards ? sector < reshape_sector : + sector >= reshape_sector; +} + +static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, + sector_t max, sector_t reshape_sector) +{ + return mddev->reshape_backwards ? max < reshape_sector : + min >= reshape_sector; +} + +static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, + struct stripe_head *sh) +{ + sector_t max_sector = 0, min_sector = MaxSector; + bool ret = false; + int dd_idx; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + min_sector = min(min_sector, sh->dev[dd_idx].sector); + max_sector = max(max_sector, sh->dev[dd_idx].sector); + } + + spin_lock_irq(&conf->device_lock); + + if (!range_ahead_of_reshape(mddev, min_sector, max_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + ret = true; + + spin_unlock_irq(&conf->device_lock); + + return ret; +} + +static int add_all_stripe_bios(struct r5conf *conf, + struct stripe_request_ctx *ctx, struct stripe_head *sh, + struct bio *bi, int forwrite, int previous) +{ + int dd_idx; + int ret = 1; + + spin_lock_irq(&sh->stripe_lock); + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &dev->flags); + ret = 0; + continue; + } + } + + if (!ret) + goto out; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + clear_bit((dev->sector - ctx->first_sector) >> + RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); + } + +out: + spin_unlock_irq(&sh->stripe_lock); + return ret; +} + +static enum stripe_result make_stripe_request(struct mddev *mddev, + struct r5conf *conf, struct stripe_request_ctx *ctx, + sector_t logical_sector, struct bio *bi) +{ + const int rw = bio_data_dir(bi); + enum stripe_result ret; + struct stripe_head *sh; + sector_t new_sector; + int previous = 0, flags = 0; + int seq, dd_idx; + + seq = read_seqcount_begin(&conf->gen_lock); + + if (unlikely(conf->reshape_progress != MaxSector)) { + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { + previous = 1; + } else { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { + spin_unlock_irq(&conf->device_lock); + return STRIPE_SCHEDULE_AND_RETRY; + } + } + spin_unlock_irq(&conf->device_lock); + } + + new_sector = raid5_compute_sector(conf, logical_sector, previous, + &dd_idx, NULL); + pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, + new_sector, logical_sector); + + if (previous) + flags |= R5_GAS_PREVIOUS; + if (bi->bi_opf & REQ_RAHEAD) + flags |= R5_GAS_NOBLOCK; + sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + return STRIPE_FAIL; + } + + if (unlikely(previous) && + stripe_ahead_of_reshape(mddev, conf, sh)) { + /* + * Expansion moved on while waiting for a stripe. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + ret = STRIPE_RETRY; + goto out_release; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. + */ + md_wakeup_thread(mddev->thread); + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (stripe_can_batch(sh)) { + stripe_add_to_batch_list(conf, sh, ctx->batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); + atomic_inc(&sh->count); + ctx->batch_last = sh; + } + + if (ctx->do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + ctx->do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); + return STRIPE_SUCCESS; + +out_release: + raid5_release_stripe(sh); + return ret; +} + +/* + * If the bio covers multiple data disks, find sector within the bio that has + * the lowest chunk offset in the first chunk. + */ +static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, + struct bio *bi) +{ + int sectors_per_chunk = conf->chunk_sectors; + int raid_disks = conf->raid_disks; + int dd_idx; + struct stripe_head sh; + unsigned int chunk_offset; + sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + sector_t sector; + + /* We pass in fake stripe_head to get back parity disk numbers */ + sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh); + chunk_offset = sector_div(sector, sectors_per_chunk); + if (sectors_per_chunk - chunk_offset >= bio_sectors(bi)) + return r_sector; + /* + * Bio crosses to the next data disk. Check whether it's in the same + * chunk. + */ + dd_idx++; + while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx) + dd_idx++; + if (dd_idx >= raid_disks) + return r_sector; + return r_sector + sectors_per_chunk - chunk_offset; +} + +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct r5conf *conf = mddev->private; + sector_t logical_sector; + struct stripe_request_ctx ctx = {}; + const int rw = bio_data_dir(bi); + enum stripe_result res; + int s, stripe_cnt; + + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { + int ret = log_handle_flush_request(conf, bi); + + if (ret == 0) + return true; + if (ret == -ENODEV) { + if (md_flush_request(mddev, bi)) + return true; + } + /* ret == -EAGAIN, fallback */ + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; + } + + if (!md_write_start(mddev, bi)) + return false; + /* + * If array is degraded, better not do chunk aligned read because + * later we might have to read it again in order to reconstruct + * data on failed drives. + */ + if (rw == READ && mddev->degraded == 0 && + mddev->reshape_position == MaxSector) { + bi = chunk_aligned_read(mddev, bi); + if (!bi) + return true; + } + + if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { + make_discard_request(mddev, bi); + md_write_end(mddev); + return true; + } + + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + ctx.first_sector = logical_sector; + ctx.last_sector = bio_end_sector(bi); + bi->bi_next = NULL; + + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf)); + bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + + pr_debug("raid456: %s, logical %llu to %llu\n", __func__, + bi->bi_iter.bi_sector, ctx.last_sector); + + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ + if ((bi->bi_opf & REQ_NOWAIT) && + (conf->reshape_progress != MaxSector) && + !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && + ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { + bio_wouldblock_error(bi); + if (rw == WRITE) + md_write_end(mddev); + return true; + } + md_account_bio(mddev, &bi); + + /* + * Lets start with the stripe with the lowest chunk offset in the first + * chunk. That has the best chances of creating IOs adjacent to + * previous IOs in case of sequential IO and thus creates the most + * sequential IO pattern. We don't bother with the optimization when + * reshaping as the performance benefit is not worth the complexity. + */ + if (likely(conf->reshape_progress == MaxSector)) + logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); + s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); + + add_wait_queue(&conf->wait_for_overlap, &wait); + while (1) { + res = make_stripe_request(mddev, conf, &ctx, logical_sector, + bi); + if (res == STRIPE_FAIL) + break; + + if (res == STRIPE_RETRY) + continue; + + if (res == STRIPE_SCHEDULE_AND_RETRY) { + /* + * Must release the reference to batch_last before + * scheduling and waiting for work to be done, + * otherwise the batch_last stripe head could prevent + * raid5_activate_delayed() from making progress + * and thus deadlocking. + */ + if (ctx.batch_last) { + raid5_release_stripe(ctx.batch_last); + ctx.batch_last = NULL; + } + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + continue; + } + + s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); + if (s == stripe_cnt) + break; + + logical_sector = ctx.first_sector + + (s << RAID5_STRIPE_SHIFT(conf)); + } + remove_wait_queue(&conf->wait_for_overlap, &wait); + + if (ctx.batch_last) + raid5_release_stripe(ctx.batch_last); + + if (rw == WRITE) + md_write_end(mddev); + bio_endio(bi); + return true; +} + +static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) +{ + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. + */ + struct r5conf *conf = mddev->private; + struct stripe_head *sh; + struct md_rdev *rdev; + sector_t first_sector, last_sector; + int raid_disks = conf->previous_raid_disks; + int data_disks = raid_disks - conf->max_degraded; + int new_data_disks = conf->raid_disks - conf->max_degraded; + int i; + int dd_idx; + sector_t writepos, readpos, safepos; + sector_t stripe_addr; + int reshape_sectors; + struct list_head stripes; + sector_t retn; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->reshape_backwards && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; + } else if (!mddev->reshape_backwards && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; + sector_div(sector_nr, new_data_disks); + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify_dirent_safe(mddev->sysfs_completed); + *skipped = 1; + retn = sector_nr; + goto finish; + } + } + + /* We need to process a full chunk at a time. + * If old and new chunk sizes differ, we need to process the + * largest of these + */ + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); + + /* We update the metadata at least every 10 seconds, or when + * the data about to be copied would over-write the source of + * the data at the front of the range. i.e. one new_stripe + * along from reshape_progress new_maps to after where + * reshape_safe old_maps to + */ + writepos = conf->reshape_progress; + sector_div(writepos, new_data_disks); + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); + safepos = conf->reshape_safe; + sector_div(safepos, data_disks); + if (mddev->reshape_backwards) { + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; + readpos += reshape_sectors; + safepos += reshape_sectors; + } else { + writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. + */ + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); + } + + /* Having calculated the 'writepos' possibly use it + * to set 'stripe_addr' which is where we will write to. + */ + if (mddev->reshape_backwards) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If there is a min_offset_diff, these are adjusted either by + * increasing the safepos/readpos if diff is negative, or + * increasing writepos if diff is positive. + * If 'readpos' is then behind 'writepos', there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ + if (conf->min_offset_diff < 0) { + safepos += -conf->min_offset_diff; + readpos += -conf->min_offset_diff; + } else + writepos += conf->min_offset_diff; + + if ((mddev->reshape_backwards + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + + conf->reshape_checkpoint = jiffies; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_flags == 0 || + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; + spin_lock_irq(&conf->device_lock); + conf->reshape_safe = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + sysfs_notify_dirent_safe(mddev->sysfs_completed); + } + + INIT_LIST_HEAD(&stripes); + for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { + int j; + int skipped_disk = 0; + sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, + R5_GAS_NOQUIESCE); + set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + if (conf->level == 6 && + j == sh->qd_idx) + continue; + s = raid5_compute_blocknr(sh, j, 0); + if (s < raid5_size(mddev, 0, 0)) { + skipped_disk = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped_disk) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + list_add(&sh->lru, &stripes); + } + spin_lock_irq(&conf->device_lock); + if (mddev->reshape_backwards) + conf->reshape_progress -= reshape_sectors * new_data_disks; + else + conf->reshape_progress += reshape_sectors * new_data_disks; + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripe are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes. + */ + first_sector = + raid5_compute_sector(conf, stripe_addr*(new_data_disks), + 1, &dd_idx, NULL); + last_sector = + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) + * new_data_disks - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; + while (first_sector <= last_sector) { + sh = raid5_get_active_stripe(conf, NULL, first_sector, + R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + first_sector += RAID5_STRIPE_SECTORS(conf); + } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += reshape_sectors; + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 + >= mddev->resync_max - mddev->curr_resync_completed) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; + spin_lock_irq(&conf->device_lock); + conf->reshape_safe = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + sysfs_notify_dirent_safe(mddev->sysfs_completed); + } +ret: + return retn; +} + +static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + struct r5conf *conf = mddev->private; + struct stripe_head *sh; + sector_t max_sector = mddev->dev_sectors; + sector_t sync_blocks; + int still_degraded = 0; + int i; + + if (sector_nr >= max_sector) { + /* just being told to finish up .. nothing much to do */ + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } + + if (mddev->curr_resync < max_sector) /* aborted */ + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + md_bitmap_close_sync(mddev->bitmap); + + return 0; + } + + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + /* No need to check resync_max as we never do more than one + * stripe, and as resync_max will always be on a chunk boundary, + * if the check in md_do_sync didn't fire, there is no chance + * of overstepping resync_max here + */ + + /* if there is too many failed drives and we are trying + * to resync, then assert that we are finished, because there is + * nothing we can do. + */ + if (mddev->degraded >= conf->max_degraded && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + sector_t rv = mddev->dev_sectors - sector_nr; + *skipped = 1; + return rv; + } + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !conf->fullsync && + !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { + /* we can skip this block, and probably more */ + do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); + *skipped = 1; + /* keep things rounded to whole stripes */ + return sync_blocks * RAID5_STRIPE_SECTORS(conf); + } + + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); + + sh = raid5_get_active_stripe(conf, NULL, sector_nr, + R5_GAS_NOBLOCK); + if (sh == NULL) { + sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); + /* make sure we don't swamp the stripe cache if someone else + * is trying to get access + */ + schedule_timeout_uninterruptible(1); + } + /* Need to check if array will still be degraded after recovery/resync + * Note in case of > 1 drive failures it's possible we're rebuilding + * one drive while leaving another faulty drive in array. + */ + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) + still_degraded = 1; + } + rcu_read_unlock(); + + md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + + set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + + raid5_release_stripe(sh); + + return RAID5_STRIPE_SECTORS(conf); +} + +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, + unsigned int offset) +{ + /* We may not be able to submit a whole bio at once as there + * may not be enough stripe_heads available. + * We cannot pre-allocate enough stripe_heads as we may need + * more than exist in the cache (if we allow ever large chunks). + * So we do one stripe head at a time and record in + * ->bi_hw_segments how many have been done. + * + * We *know* that this entire raid_bio is in one chunk, so + * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. + */ + struct stripe_head *sh; + int dd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int handled = 0; + + logical_sector = raid_bio->bi_iter.bi_sector & + ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); + last_sector = bio_end_sector(raid_bio); + + for (; logical_sector < last_sector; + logical_sector += RAID5_STRIPE_SECTORS(conf), + sector += RAID5_STRIPE_SECTORS(conf), + scnt++) { + + if (scnt < offset) + /* already done this stripe */ + continue; + + sh = raid5_get_active_stripe(conf, NULL, sector, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); + if (!sh) { + /* failed to get a stripe - must wait */ + conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; + return handled; + } + + if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { + raid5_release_stripe(sh); + conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; + return handled; + } + + set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); + handle_stripe(sh); + raid5_release_stripe(sh); + handled++; + } + + bio_endio(raid_bio); + + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_quiescent); + return handled; +} + +static int handle_active_stripes(struct r5conf *conf, int group, + struct r5worker *worker, + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) +{ + struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; + int i, batch_size = 0, hash; + bool release_inactive = false; + + while (batch_size < MAX_STRIPE_BATCH && + (sh = __get_priority_stripe(conf, group)) != NULL) + batch[batch_size++] = sh; + + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) { + spin_unlock_irq(&conf->device_lock); + log_flush_stripe_to_raid(conf); + spin_lock_irq(&conf->device_lock); + return batch_size; + } + release_inactive = true; + } + spin_unlock_irq(&conf->device_lock); + + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + r5l_flush_stripe_to_raid(conf->log); + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + + for (i = 0; i < batch_size; i++) + handle_stripe(batch[i]); + log_write_stripe_run(conf); + + cond_resched(); + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } + return batch_size; +} + +static void raid5_do_work(struct work_struct *work) +{ + struct r5worker *worker = container_of(work, struct r5worker, work); + struct r5worker_group *group = worker->group; + struct r5conf *conf = group->conf; + struct mddev *mddev = conf->mddev; + int group_id = group - conf->worker_groups; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5worker active\n"); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + int batch_size, released; + + released = release_stripe_list(conf, worker->temp_inactive_list); + + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); + worker->working = false; + if (!batch_size && !released) + break; + handled += batch_size; + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + + flush_deferred_bios(conf); + + r5l_flush_stripe_to_raid(conf->log); + + async_tx_issue_pending_all(); + blk_finish_plug(&plug); + + pr_debug("--- raid5worker inactive\n"); +} + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. + */ +static void raid5d(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5d active\n"); + + md_check_recovery(mddev); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + struct bio *bio; + int batch_size, released; + unsigned int offset; + + released = release_stripe_list(conf, conf->temp_inactive_list); + if (released) + clear_bit(R5_DID_ALLOC, &conf->cache_state); + + if ( + !list_empty(&conf->bitmap_list)) { + /* Now is a good time to flush some bitmap updates */ + conf->seq_flush++; + spin_unlock_irq(&conf->device_lock); + md_bitmap_unplug(mddev->bitmap); + spin_lock_irq(&conf->device_lock); + conf->seq_write = conf->seq_flush; + activate_bit_delay(conf, conf->temp_inactive_list); + } + raid5_activate_delayed(conf); + + while ((bio = remove_bio_from_retry(conf, &offset))) { + int ok; + spin_unlock_irq(&conf->device_lock); + ok = retry_aligned_read(conf, bio, offset); + spin_lock_irq(&conf->device_lock); + if (!ok) + break; + handled++; + } + + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); + if (!batch_size && !released) + break; + handled += batch_size; + + if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { + spin_unlock_irq(&conf->device_lock); + md_check_recovery(mddev); + spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; + } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && + mutex_trylock(&conf->cache_size_mutex)) { + grow_one_stripe(conf, __GFP_NOWARN); + /* Set flag even if allocation failed. This helps + * slow down allocation requests when mem is short + */ + set_bit(R5_DID_ALLOC, &conf->cache_state); + mutex_unlock(&conf->cache_size_mutex); + } + + flush_deferred_bios(conf); + + r5l_flush_stripe_to_raid(conf->log); + + async_tx_issue_pending_all(); + blk_finish_plug(&plug); + + pr_debug("--- raid5d inactive\n"); +} + +static ssize_t +raid5_show_stripe_cache_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%d\n", conf->min_nr_stripes); + spin_unlock(&mddev->lock); + return ret; +} + +int +raid5_set_cache_size(struct mddev *mddev, int size) +{ + int result = 0; + struct r5conf *conf = mddev->private; + + if (size <= 16 || size > 32768) + return -EINVAL; + + conf->min_nr_stripes = size; + mutex_lock(&conf->cache_size_mutex); + while (size < conf->max_nr_stripes && + drop_one_stripe(conf)) + ; + mutex_unlock(&conf->cache_size_mutex); + + md_allow_write(mddev); + + mutex_lock(&conf->cache_size_mutex); + while (size > conf->max_nr_stripes) + if (!grow_one_stripe(conf, GFP_KERNEL)) { + conf->min_nr_stripes = conf->max_nr_stripes; + result = -ENOMEM; + break; + } + mutex_unlock(&conf->cache_size_mutex); + + return result; +} +EXPORT_SYMBOL(raid5_set_cache_size); + +static ssize_t +raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, + raid5_show_stripe_cache_size, + raid5_store_stripe_cache_size); + +static ssize_t +raid5_show_rmw_level(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->rmw_level); + else + return 0; +} + +static ssize_t +raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + + if (!conf) + return -ENODEV; + + if (len >= PAGE_SIZE) + return -EINVAL; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && + new != PARITY_ENABLE_RMW && + new != PARITY_PREFER_RMW) + return -EINVAL; + + conf->rmw_level = new; + return len; +} + +static struct md_sysfs_entry +raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, + raid5_show_rmw_level, + raid5_store_rmw_level); + +static ssize_t +raid5_show_stripe_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); + spin_unlock(&mddev->lock); + return ret; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static ssize_t +raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + int size; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + /* + * The value should not be bigger than PAGE_SIZE. It requires to + * be multiple of DEFAULT_STRIPE_SIZE and the value should be power + * of two. + */ + if (new % DEFAULT_STRIPE_SIZE != 0 || + new > PAGE_SIZE || new == 0 || + new != roundup_pow_of_two(new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + goto out_unlock; + } + + if (new == conf->stripe_size) + goto out_unlock; + + pr_debug("md/raid: change stripe_size from %lu to %lu\n", + conf->stripe_size, new); + + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) { + err = -EBUSY; + goto out_unlock; + } + + mddev_suspend(mddev); + mutex_lock(&conf->cache_size_mutex); + size = conf->max_nr_stripes; + + shrink_stripes(conf); + + conf->stripe_size = new; + conf->stripe_shift = ilog2(new) - 9; + conf->stripe_sectors = new >> 9; + if (grow_stripes(conf, size)) { + pr_warn("md/raid:%s: couldn't allocate buffers\n", + mdname(mddev)); + err = -ENOMEM; + } + mutex_unlock(&conf->cache_size_mutex); + mddev_resume(mddev); + +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0644, + raid5_show_stripe_size, + raid5_store_stripe_size); +#else +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0444, + raid5_show_stripe_size, + NULL); +#endif + +static ssize_t +raid5_show_preread_threshold(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; +} + +static ssize_t +raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->min_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, + S_IRUGO | S_IWUSR, + raid5_show_preread_threshold, + raid5_store_preread_threshold); + +static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + struct request_queue *q = mddev->queue; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + +static ssize_t +stripe_cache_active_show(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); + else + return 0; +} + +static struct md_sysfs_entry +raid5_stripecache_active = __ATTR_RO(stripe_cache_active); + +static ssize_t +raid5_show_group_thread_cnt(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; +} + +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + struct r5worker_group **worker_groups); +static ssize_t +raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned int new; + int err; + struct r5worker_group *new_groups, *old_groups; + int group_cnt; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); + + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); + + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = new; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); + + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); + } + mddev_unlock(mddev); + + return err ?: len; +} + +static struct md_sysfs_entry +raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, + raid5_show_group_thread_cnt, + raid5_store_group_thread_cnt); + +static struct attribute *raid5_attrs[] = { + &raid5_stripecache_size.attr, + &raid5_stripecache_active.attr, + &raid5_preread_bypass_threshold.attr, + &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, + &raid5_rmw_level.attr, + &raid5_stripe_size.attr, + &r5c_journal_mode.attr, + &ppl_write_hint.attr, + NULL, +}; +static const struct attribute_group raid5_attrs_group = { + .name = NULL, + .attrs = raid5_attrs, +}; + +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, + struct r5worker_group **worker_groups) +{ + int i, j, k; + ssize_t size; + struct r5worker *workers; + + if (cnt == 0) { + *group_cnt = 0; + *worker_groups = NULL; + return 0; + } + *group_cnt = num_possible_nodes(); + size = sizeof(struct r5worker) * cnt; + workers = kcalloc(size, *group_cnt, GFP_NOIO); + *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group), + GFP_NOIO); + if (!*worker_groups || !workers) { + kfree(workers); + kfree(*worker_groups); + return -ENOMEM; + } + + for (i = 0; i < *group_cnt; i++) { + struct r5worker_group *group; + + group = &(*worker_groups)[i]; + INIT_LIST_HEAD(&group->handle_list); + INIT_LIST_HEAD(&group->loprio_list); + group->conf = conf; + group->workers = workers + i * cnt; + + for (j = 0; j < cnt; j++) { + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); + } + } + + return 0; +} + +static void free_thread_groups(struct r5conf *conf) +{ + if (conf->worker_groups) + kfree(conf->worker_groups[0].workers); + kfree(conf->worker_groups); + conf->worker_groups = NULL; +} + +static sector_t +raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct r5conf *conf = mddev->private; + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) + /* size is defined by the smallest of previous and new size */ + raid_disks = min(conf->raid_disks, conf->previous_raid_disks); + + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); + return sectors * (raid_disks - conf->max_degraded); +} + +static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + safe_put_page(percpu->spare_page); + percpu->spare_page = NULL; + kvfree(percpu->scribble); + percpu->scribble = NULL; +} + +static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + if (conf->level == 6 && !percpu->spare_page) { + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->spare_page) + return -ENOMEM; + } + + if (scribble_alloc(percpu, + max(conf->raid_disks, + conf->previous_raid_disks), + max(conf->chunk_sectors, + conf->prev_chunk_sectors) + / RAID5_STRIPE_SECTORS(conf))) { + free_scratch_buffer(conf, percpu); + return -ENOMEM; + } + + local_lock_init(&percpu->lock); + return 0; +} + +static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); + + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + return 0; +} + +static void raid5_free_percpu(struct r5conf *conf) +{ + if (!conf->percpu) + return; + + cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); + free_percpu(conf->percpu); +} + +static void free_conf(struct r5conf *conf) +{ + int i; + + log_exit(conf); + + unregister_shrinker(&conf->shrinker); + free_thread_groups(conf); + shrink_stripes(conf); + raid5_free_percpu(conf); + for (i = 0; i < conf->pool_size; i++) + if (conf->disks[i].extra_page) + put_page(conf->disks[i].extra_page); + kfree(conf->disks); + bioset_exit(&conf->bio_split); + kfree(conf->stripe_hashtbl); + kfree(conf->pending_data); + kfree(conf); +} + +static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); + struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); + + if (alloc_scratch_buffer(conf, percpu)) { + pr_warn("%s: failed memory allocation for cpu%u\n", + __func__, cpu); + return -ENOMEM; + } + return 0; +} + +static int raid5_alloc_percpu(struct r5conf *conf) +{ + int err = 0; + + conf->percpu = alloc_percpu(struct raid5_percpu); + if (!conf->percpu) + return -ENOMEM; + + err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); + if (!err) { + conf->scribble_disks = max(conf->raid_disks, + conf->previous_raid_disks); + conf->scribble_sectors = max(conf->chunk_sectors, + conf->prev_chunk_sectors); + } + return err; +} + +static unsigned long raid5_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + unsigned long ret = SHRINK_STOP; + + if (mutex_trylock(&conf->cache_size_mutex)) { + ret= 0; + while (ret < sc->nr_to_scan && + conf->max_nr_stripes > conf->min_nr_stripes) { + if (drop_one_stripe(conf) == 0) { + ret = SHRINK_STOP; + break; + } + ret++; + } + mutex_unlock(&conf->cache_size_mutex); + } + return ret; +} + +static unsigned long raid5_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + + if (conf->max_nr_stripes < conf->min_nr_stripes) + /* unlikely, but not impossible */ + return 0; + return conf->max_nr_stripes - conf->min_nr_stripes; +} + +static struct r5conf *setup_conf(struct mddev *mddev) +{ + struct r5conf *conf; + int raid_disk, memory, max_disks; + struct md_rdev *rdev; + struct disk_info *disk; + char pers_name[6]; + int i; + int group_cnt; + struct r5worker_group *new_group; + int ret = -ENOMEM; + + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { + pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); + } + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + pr_warn("md/raid:%s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); + } + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); + } + + if (!mddev->new_chunk_sectors || + (mddev->new_chunk_sectors << 9) % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk_sectors)) { + pr_warn("md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); + return ERR_PTR(-EINVAL); + } + + conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); + if (conf == NULL) + goto abort; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + conf->stripe_size = DEFAULT_STRIPE_SIZE; + conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; + conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; +#endif + INIT_LIST_HEAD(&conf->free_list); + INIT_LIST_HEAD(&conf->pending_list); + conf->pending_data = kcalloc(PENDING_IO_MAX, + sizeof(struct r5pending_data), + GFP_KERNEL); + if (!conf->pending_data) + goto abort; + for (i = 0; i < PENDING_IO_MAX; i++) + list_add(&conf->pending_data[i].sibling, &conf->free_list); + /* Don't enable multi-threading by default*/ + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = 0; + conf->worker_groups = new_group; + } else + goto abort; + spin_lock_init(&conf->device_lock); + seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); + mutex_init(&conf->cache_size_mutex); + + init_waitqueue_head(&conf->wait_for_quiescent); + init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_overlap); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->loprio_list); + INIT_LIST_HEAD(&conf->hold_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); + init_llist_head(&conf->released_stripes); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (bdev_nonrot(rdev->bdev)) { + conf->batch_bio_dispatch = false; + break; + } + } + + conf->bypass_threshold = BYPASS_THRESHOLD; + conf->recovery_disabled = mddev->recovery_disabled - 1; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + max_disks = max(conf->raid_disks, conf->previous_raid_disks); + + conf->disks = kcalloc(max_disks, sizeof(struct disk_info), + GFP_KERNEL); + + if (!conf->disks) + goto abort; + + for (i = 0; i < max_disks; i++) { + conf->disks[i].extra_page = alloc_page(GFP_KERNEL); + if (!conf->disks[i].extra_page) + goto abort; + } + + ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (ret) + goto abort; + conf->mddev = mddev; + + ret = -ENOMEM; + conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!conf->stripe_hashtbl) + goto abort; + + /* We init hash_locks[0] separately to that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. + */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + + atomic_set(&conf->r5c_cached_full_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_full_stripe_list); + atomic_set(&conf->r5c_cached_partial_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); + + conf->level = mddev->new_level; + conf->chunk_sectors = mddev->new_chunk_sectors; + ret = raid5_alloc_percpu(conf); + if (ret) + goto abort; + + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); + + ret = -EIO; + rdev_for_each(rdev, mddev) { + raid_disk = rdev->raid_disk; + if (raid_disk >= max_disks + || raid_disk < 0 || test_bit(Journal, &rdev->flags)) + continue; + disk = conf->disks + raid_disk; + + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto abort; + RCU_INIT_POINTER(disk->replacement, rdev); + } else { + if (disk->rdev) + goto abort; + RCU_INIT_POINTER(disk->rdev, rdev); + } + + if (test_bit(In_sync, &rdev->flags)) { + pr_info("md/raid:%s: device %pg operational as raid disk %d\n", + mdname(mddev), rdev->bdev, raid_disk); + } else if (rdev->saved_raid_disk != raid_disk) + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; + } + + conf->level = mddev->new_level; + if (conf->level == 6) { + conf->max_degraded = 2; + if (raid6_call.xor_syndrome) + conf->rmw_level = PARITY_ENABLE_RMW; + else + conf->rmw_level = PARITY_DISABLE_RMW; + } else { + conf->max_degraded = 1; + conf->rmw_level = PARITY_ENABLE_RMW; + } + conf->algorithm = mddev->new_layout; + conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) { + conf->prev_chunk_sectors = mddev->chunk_sectors; + conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; + } + + conf->min_nr_stripes = NR_STRIPES; + if (mddev->reshape_position != MaxSector) { + int stripes = max_t(int, + ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); + conf->min_nr_stripes = max(NR_STRIPES, stripes); + if (conf->min_nr_stripes != NR_STRIPES) + pr_info("md/raid:%s: force stripe size %d for reshape\n", + mdname(mddev), conf->min_nr_stripes); + } + memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, conf->min_nr_stripes)) { + pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); + ret = -ENOMEM; + goto abort; + } else + pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); + /* + * Losing a stripe head costs more than the time to refill it, + * it reduces the queue depth and so can hurt throughput. + * So set it rather large, scaled by number of devices. + */ + conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; + conf->shrinker.scan_objects = raid5_cache_scan; + conf->shrinker.count_objects = raid5_cache_count; + conf->shrinker.batch = 128; + conf->shrinker.flags = 0; + ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); + if (ret) { + pr_warn("md/raid:%s: couldn't register shrinker.\n", + mdname(mddev)); + goto abort; + } + + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); + if (!conf->thread) { + pr_warn("md/raid:%s: couldn't allocate thread.\n", + mdname(mddev)); + ret = -ENOMEM; + goto abort; + } + + return conf; + + abort: + if (conf) + free_conf(conf); + return ERR_PTR(ret); +} + +static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) +{ + switch (algo) { + case ALGORITHM_PARITY_0: + if (raid_disk < max_degraded) + return 1; + break; + case ALGORITHM_PARITY_N: + if (raid_disk >= raid_disks - max_degraded) + return 1; + break; + case ALGORITHM_PARITY_0_6: + if (raid_disk == 0 || + raid_disk == raid_disks - 1) + return 1; + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (raid_disk == raid_disks - 1) + return 1; + } + return 0; +} + +static void raid5_set_io_opt(struct r5conf *conf) +{ + blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * + (conf->raid_disks - conf->max_degraded)); +} + +static int raid5_run(struct mddev *mddev) +{ + struct r5conf *conf; + int working_disks = 0; + int dirty_parity_disks = 0; + struct md_rdev *rdev; + struct md_rdev *journal_dev = NULL; + sector_t reshape_offset = 0; + int i, ret = 0; + long long min_offset_diff = 0; + int first = 1; + + if (acct_bioset_init(mddev)) { + pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); + return -ENOMEM; + } + + if (mddev_init_writes_pending(mddev) < 0) { + ret = -ENOMEM; + goto exit_acct_set; + } + + if (mddev->recovery_cp != MaxSector) + pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); + + rdev_for_each(rdev, mddev) { + long long diff; + + if (test_bit(Journal, &rdev->flags)) { + journal_dev = rdev; + continue; + } + if (rdev->raid_disk < 0) + continue; + diff = (rdev->new_data_offset - rdev->data_offset); + if (first) { + min_offset_diff = diff; + first = 0; + } else if (mddev->reshape_backwards && + diff < min_offset_diff) + min_offset_diff = diff; + else if (!mddev->reshape_backwards && + diff > min_offset_diff) + min_offset_diff = diff; + } + + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Difficulties arise if the stripe we would write to + * next is at or after the stripe we would read from next. + * For a reshape that changes the number of devices, this + * is only possible for a very short time, and mdadm makes + * sure that time appears to have past before assembling + * the array. So we fail if that time hasn't passed. + * For a reshape that keeps the number of devices the same + * mdadm must be monitoring the reshape can keeping the + * critical areas read-only and backed up. It will start + * the array in read-only mode, so we check for that. + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 6 ? 2 : 1); + int chunk_sectors; + int new_data_disks; + + if (journal_dev) { + pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + + if (mddev->new_level != mddev->level) { + pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. + */ + here_new = mddev->reshape_position; + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { + pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + reshape_offset = here_new * chunk_sectors; + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space is monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if (abs(min_offset_diff) >= mddev->chunk_sectors && + abs(min_offset_diff) >= mddev->new_chunk_sectors) + /* not really in-place - so OK */; + else if (mddev->ro == 0) { + pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + } else if (mddev->reshape_backwards + ? (here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { + /* Reading from the same stripe as writing to - bad */ + pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", + mdname(mddev)); + ret = -EINVAL; + goto exit_acct_set; + } + pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); + BUG_ON(mddev->delta_disks != 0); + } + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(MD_HAS_PPL, &mddev->flags)) { + pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", + mdname(mddev)); + clear_bit(MD_HAS_PPL, &mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); + } + + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) { + ret = PTR_ERR(conf); + goto exit_acct_set; + } + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + if (!journal_dev) { + pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } else if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + } + + conf->min_offset_diff = min_offset_diff; + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; + i++) { + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); + if (!rdev && conf->disks[i].replacement) { + /* The replacement is all we have yet */ + rdev = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); + conf->disks[i].replacement = NULL; + clear_bit(Replacement, &rdev->flags); + rcu_assign_pointer(conf->disks[i].rdev, rdev); + } + if (!rdev) + continue; + if (rcu_access_pointer(conf->disks[i].replacement) && + conf->reshape_progress != MaxSector) { + /* replacements and reshape simply do not mix. */ + pr_warn("md: cannot handle concurrent replacement and reshape.\n"); + goto abort; + } + if (test_bit(In_sync, &rdev->flags)) { + working_disks++; + continue; + } + /* This disc is not fully in-sync. However if it + * just stored parity (beyond the recovery_offset), + * when we don't need to be concerned about the + * array being dirty. + * When reshape goes 'backwards', we never have + * partially completed devices, so we only need + * to worry about reshape going forwards. + */ + /* Hack because v0.91 doesn't store recovery_offset properly. */ + if (mddev->major_version == 0 && + mddev->minor_version > 90) + rdev->recovery_offset = reshape_offset; + + if (rdev->recovery_offset < reshape_offset) { + /* We need to check old and new layout */ + if (!only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)) + continue; + } + if (!only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded)) + continue; + dirty_parity_disks++; + } + + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. + */ + mddev->degraded = raid5_calc_degraded(conf); + + if (has_failed(conf)) { + pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + goto abort; + } + + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + + if (mddev->degraded > dirty_parity_disks && + mddev->recovery_cp != MaxSector) { + if (test_bit(MD_HAS_PPL, &mddev->flags)) + pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", + mdname(mddev)); + else if (mddev->ok_start_degraded) + pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", + mdname(mddev)); + else { + pr_crit("md/raid:%s: cannot start dirty degraded array.\n", + mdname(mddev)); + goto abort; + } + } + + pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); + + print_raid5_conf(conf); + + if (conf->reshape_progress != MaxSector) { + conf->reshape_safe = conf->reshape_progress; + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) + goto abort; + } + + /* Ok, everything is just fine now */ + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + pr_warn("raid5: failed to create sysfs attributes for %s\n", + mdname(mddev)); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + + if (mddev->queue) { + int chunk_size; + /* read-ahead size must cover two whole stripes, which + * is 2 * (datadisks) * chunksize where 'n' is the + * number of raid devices + */ + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + raid5_set_io_opt(conf); + mddev->queue->limits.raid_partial_stripes_expensive = 1; + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + stripe = roundup_pow_of_two(stripe); + mddev->queue->limits.discard_granularity = stripe; + + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->new_data_offset << 9); + } + + /* + * zeroing is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that + * only safe devices are in use by setting a module parameter. + * A better idea might be to turn DISCARD into WRITE_ZEROES + * requests, as that is required to be safe. + */ + if (!devices_handle_discard_safely || + mddev->queue->limits.max_discard_sectors < (stripe >> 9) || + mddev->queue->limits.discard_granularity < stripe) + blk_queue_max_discard_sectors(mddev->queue, 0); + + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + blk_queue_max_hw_sectors(mddev->queue, + RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); + + /* No restrictions on the number of segments in the request */ + blk_queue_max_segments(mddev->queue, USHRT_MAX); + } + + if (log_init(conf, journal_dev, raid5_has_ppl(conf))) + goto abort; + + return 0; +abort: + md_unregister_thread(&mddev->thread); + print_raid5_conf(conf); + free_conf(conf); + mddev->private = NULL; + pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); + ret = -EIO; +exit_acct_set: + acct_bioset_exit(mddev); + return ret; +} + +static void raid5_free(struct mddev *mddev, void *priv) +{ + struct r5conf *conf = priv; + + free_conf(conf); + acct_bioset_exit(mddev); + mddev->to_remove = &raid5_attrs_group; +} + +static void raid5_status(struct seq_file *seq, struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + int i; + + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, + conf->chunk_sectors / 2, mddev->layout); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_printf (seq, "]"); +} + +static void print_raid5_conf (struct r5conf *conf) +{ + struct md_rdev *rdev; + int i; + + pr_debug("RAID conf printout:\n"); + if (!conf) { + pr_debug("(conf==NULL)\n"); + return; + } + pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); + + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev) + pr_debug(" disk %d, o:%d, dev:%pg\n", + i, !test_bit(Faulty, &rdev->flags), + rdev->bdev); + } + rcu_read_unlock(); +} + +static int raid5_spare_active(struct mddev *mddev) +{ + int i; + struct r5conf *conf = mddev->private; + struct md_rdev *rdev, *replacement; + int count = 0; + unsigned long flags; + + for (i = 0; i < conf->raid_disks; i++) { + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); + replacement = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); + if (replacement + && replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &replacement->flags) + && !test_and_set_bit(In_sync, &replacement->flags)) { + /* Replacement has just become active. */ + if (!rdev + || !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + sysfs_notify_dirent_safe(replacement->sysfs_state); + } else if (rdev + && rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + count++; + sysfs_notify_dirent_safe(rdev->sysfs_state); + } + } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = raid5_calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + print_raid5_conf(conf); + return count; +} + +static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct md_rdev __rcu **rdevp; + struct disk_info *p; + struct md_rdev *tmp; + + print_raid5_conf(conf); + if (test_bit(Journal, &rdev->flags) && conf->log) { + /* + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. + * neilb: there is no locking about new writes here, + * so this cannot be safe. + */ + if (atomic_read(&conf->active_stripes) || + atomic_read(&conf->r5c_cached_full_stripes) || + atomic_read(&conf->r5c_cached_partial_stripes)) { + return -EBUSY; + } + log_exit(conf); + return 0; + } + if (unlikely(number >= conf->pool_size)) + return 0; + p = conf->disks + number; + if (rdev == rcu_access_pointer(p->rdev)) + rdevp = &p->rdev; + else if (rdev == rcu_access_pointer(p->replacement)) + rdevp = &p->replacement; + else + return 0; + + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + !has_failed(conf) && + (!rcu_access_pointer(p->replacement) || + rcu_access_pointer(p->replacement) == rdev) && + number < conf->raid_disks) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + lockdep_assert_held(&mddev->reconfig_mutex); + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + rcu_assign_pointer(*rdevp, rdev); + } + } + if (!err) { + err = log_modify(conf, rdev, false); + if (err) + goto abort; + } + + tmp = rcu_access_pointer(p->replacement); + if (tmp) { + /* We must have just cleared 'rdev' */ + rcu_assign_pointer(p->rdev, tmp); + clear_bit(Replacement, &tmp->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither - if they are careful + */ + rcu_assign_pointer(p->replacement, NULL); + + if (!err) + err = log_modify(conf, tmp, true); + } + + clear_bit(WantReplacement, &rdev->flags); +abort: + + print_raid5_conf(conf); + return err; +} + +static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + int ret, err = -EEXIST; + int disk; + struct disk_info *p; + struct md_rdev *tmp; + int first = 0; + int last = conf->raid_disks - 1; + + if (test_bit(Journal, &rdev->flags)) { + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + ret = log_init(conf, rdev, false); + if (ret) + return ret; + + ret = r5l_start(conf->log); + if (ret) + return ret; + + return 0; + } + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + + if (rdev->saved_raid_disk < 0 && has_failed(conf)) + /* no point adding a device */ + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= first && + rdev->saved_raid_disk <= last && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; + if (p->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + + err = log_modify(conf, rdev, true); + + goto out; + } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; + tmp = rdev_mdlock_deref(mddev, p->rdev); + if (test_bit(WantReplacement, &tmp->flags) && + p->replacement == NULL) { + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = disk; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } + } +out: + print_raid5_conf(conf); + return err; +} + +static int raid5_resize(struct mddev *mddev, sector_t sectors) +{ + /* no resync is happening, and there is enough space + * on all devices, so we can resize. + * We need to make sure resync covers any new space. + * If the array is shrinking we should possibly wait until + * any io in the removed space completes, but it hardly seems + * worth it. + */ + sector_t newsize; + struct r5conf *conf = mddev->private; + + if (raid5_has_log(conf) || raid5_has_ppl(conf)) + return -EINVAL; + sectors &= ~((sector_t)conf->chunk_sectors - 1); + newsize = raid5_size(mddev, sectors, mddev->raid_disks); + if (mddev->external_size && + mddev->array_sectors > newsize) + return -EINVAL; + if (mddev->bitmap) { + int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = sectors; + return 0; +} + +static int check_stripe_cache(struct mddev *mddev) +{ + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + struct r5conf *conf = mddev->private; + if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 + > conf->min_nr_stripes || + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 + > conf->min_nr_stripes) { + pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / RAID5_STRIPE_SIZE(conf))*4); + return 0; + } + return 1; +} + +static int check_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + if (raid5_has_log(conf) || raid5_has_ppl(conf)) + return -EINVAL; + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk_sectors == mddev->chunk_sectors) + return 0; /* nothing to do */ + if (has_failed(conf)) + return -EINVAL; + if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } + + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + if (mddev->new_chunk_sectors > mddev->chunk_sectors || + mddev->delta_disks > 0) + if (resize_chunks(conf, + conf->previous_raid_disks + + max(0, mddev->delta_disks), + max(mddev->new_chunk_sectors, + mddev->chunk_sectors) + ) < 0) + return -ENOMEM; + + if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) + return 0; /* never bother to shrink */ + return resize_stripes(conf, (conf->previous_raid_disks + + mddev->delta_disks)); +} + +static int raid5_start_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + struct md_rdev *rdev; + int spares = 0; + unsigned long flags; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + if (has_failed(conf)) + return -EINVAL; + + rdev_for_each(rdev, mddev) { + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) + spares++; + } + + if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) + /* Not enough devices even to make a degraded array + * of that size + */ + return -EINVAL; + + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. + */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + pr_warn("md/raid:%s: array size must be reduced before number of disks\n", + mdname(mddev)); + return -EINVAL; + } + + atomic_set(&conf->reshape_stripes, 0); + spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); + conf->previous_raid_disks = conf->raid_disks; + conf->raid_disks += mddev->delta_disks; + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; + conf->generation++; + /* Code that selects data_offset needs to see the generation update + * if reshape_progress has been set - so a memory barrier needed. + */ + smp_mb(); + if (mddev->reshape_backwards) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; + write_seqcount_end(&conf->gen_lock); + spin_unlock_irq(&conf->device_lock); + + /* Now make sure any requests that proceeded on the assumption + * the reshape wasn't running - like Discard or Read - have + * completed. + */ + mddev_suspend(mddev); + mddev_resume(mddev); + + /* Add some new drives, as many as will fit. + * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. + */ + if (mddev->delta_disks >= 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk + >= conf->previous_raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); + } + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + } + + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post number of devices. + */ + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = raid5_calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + } + mddev->raid_disks = conf->raid_disks; + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) { + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); + mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); + conf->generation --; + conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); + spin_unlock_irq(&conf->device_lock); + return -EAGAIN; + } + conf->reshape_checkpoint = jiffies; + md_wakeup_thread(mddev->sync_thread); + md_new_event(); + return 0; +} + +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ +static void end_reshape(struct r5conf *conf) +{ + + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; + + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + md_finish_reshape(conf->mddev); + smp_wmb(); + conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + + if (conf->mddev->queue) + raid5_set_io_opt(conf); + } +} + +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. + */ +static void raid5_finish_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + struct md_rdev *rdev; + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + if (mddev->delta_disks <= 0) { + int d; + spin_lock_irq(&conf->device_lock); + mddev->degraded = raid5_calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) { + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].rdev); + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].replacement); + if (rdev) + clear_bit(In_sync, &rdev->flags); + } + } + mddev->layout = conf->algorithm; + mddev->chunk_sectors = conf->chunk_sectors; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } +} + +static void raid5_quiesce(struct mddev *mddev, int quiesce) +{ + struct r5conf *conf = mddev->private; + + if (quiesce) { + /* stop all writes */ + lock_all_device_hash_locks_irq(conf); + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + r5c_flush_cache(conf, INT_MAX); + /* need a memory barrier to make sure read_one_chunk() sees + * quiesce started and reverts to slow (locked) path. + */ + smp_store_release(&conf->quiesce, 2); + wait_event_cmd(conf->wait_for_quiescent, + atomic_read(&conf->active_stripes) == 0 && + atomic_read(&conf->active_aligned_reads) == 0, + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); + conf->quiesce = 1; + unlock_all_device_hash_locks_irq(conf); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); + } else { + /* re-enable writes */ + lock_all_device_hash_locks_irq(conf); + conf->quiesce = 0; + wake_up(&conf->wait_for_quiescent); + wake_up(&conf->wait_for_overlap); + unlock_all_device_hash_locks_irq(conf); + } + log_quiesce(conf, quiesce); +} + +static void *raid45_takeover_raid0(struct mddev *mddev, int level) +{ + struct r0conf *raid0_conf = mddev->private; + sector_t sectors; + + /* for raid0 takeover only one zone is supported */ + if (raid0_conf->nr_strip_zones > 1) { + pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + sectors = raid0_conf->strip_zone[0].zone_end; + sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); + mddev->dev_sectors = sectors; + mddev->new_level = level; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + +static void *raid5_takeover_raid1(struct mddev *mddev) +{ + int chunksect; + void *ret; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? */ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk_sectors = chunksect; + + ret = setup_conf(mddev); + if (!IS_ERR(ret)) + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); + return ret; +} + +static void *raid5_takeover_raid6(struct mddev *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + +static int raid5_check_reshape(struct mddev *mddev) +{ + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. + */ + struct r5conf *conf = mddev->private; + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE>>9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks == 2) { + /* can make the change immediately */ + if (mddev->new_layout >= 0) { + conf->algorithm = mddev->new_layout; + mddev->layout = mddev->new_layout; + } + if (new_chunk > 0) { + conf->chunk_sectors = new_chunk ; + mddev->chunk_sectors = new_chunk; + } + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + } + return check_reshape(mddev); +} + +static int raid6_check_reshape(struct mddev *mddev) +{ + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE >> 9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + return check_reshape(mddev); +} + +static void *raid5_takeover(struct mddev *mddev) +{ + /* raid5 can take over: + * raid0 - if there is only one strip zone - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. + * raid6 - Providing it is a *_6 layout + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); + + return ERR_PTR(-EINVAL); +} + +static void *raid4_takeover(struct mddev *mddev) +{ + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid5_personality; + +static void *raid6_takeover(struct mddev *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. + */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + +static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) +{ + struct r5conf *conf; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) { + mddev_unlock(mddev); + return -ENODEV; + } + + if (strncmp(buf, "ppl", 3) == 0) { + /* ppl only works with RAID 5 */ + if (!raid5_has_ppl(conf) && conf->level == 5) { + err = log_init(conf, NULL, true); + if (!err) { + err = resize_stripes(conf, conf->pool_size); + if (err) { + mddev_suspend(mddev); + log_exit(conf); + mddev_resume(mddev); + } + } + } else + err = -EINVAL; + } else if (strncmp(buf, "resync", 6) == 0) { + if (raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); + mddev_resume(mddev); + err = resize_stripes(conf, conf->pool_size); + } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && + r5l_log_disk_error(conf)) { + bool journal_dev_exists = false; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (test_bit(Journal, &rdev->flags)) { + journal_dev_exists = true; + break; + } + + if (!journal_dev_exists) { + mddev_suspend(mddev); + clear_bit(MD_HAS_JOURNAL, &mddev->flags); + mddev_resume(mddev); + } else /* need remove journal device first */ + err = -EBUSY; + } else + err = -EINVAL; + } else { + err = -EINVAL; + } + + if (!err) + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + + return err; +} + +static int raid5_start(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + return r5l_start(conf->log); +} + +static struct md_personality raid6_personality = +{ + .name = "raid6", + .level = 6, + .owner = THIS_MODULE, + .make_request = raid5_make_request, + .run = raid5_run, + .start = raid5_start, + .free = raid5_free, + .status = raid5_status, + .error_handler = raid5_error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = raid5_sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid6_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid6_takeover, + .change_consistency_policy = raid5_change_consistency_policy, +}; +static struct md_personality raid5_personality = +{ + .name = "raid5", + .level = 5, + .owner = THIS_MODULE, + .make_request = raid5_make_request, + .run = raid5_run, + .start = raid5_start, + .free = raid5_free, + .status = raid5_status, + .error_handler = raid5_error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = raid5_sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid5_takeover, + .change_consistency_policy = raid5_change_consistency_policy, +}; + +static struct md_personality raid4_personality = +{ + .name = "raid4", + .level = 4, + .owner = THIS_MODULE, + .make_request = raid5_make_request, + .run = raid5_run, + .start = raid5_start, + .free = raid5_free, + .status = raid5_status, + .error_handler = raid5_error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = raid5_sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid4_takeover, + .change_consistency_policy = raid5_change_consistency_policy, +}; + +static int __init raid5_init(void) +{ + int ret; + + raid5_wq = alloc_workqueue("raid5wq", + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); + if (!raid5_wq) + return -ENOMEM; + + ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, + "md/raid5:prepare", + raid456_cpu_up_prepare, + raid456_cpu_dead); + if (ret) { + destroy_workqueue(raid5_wq); + return ret; + } + register_md_personality(&raid6_personality); + register_md_personality(&raid5_personality); + register_md_personality(&raid4_personality); + return 0; +} + +static void raid5_exit(void) +{ + unregister_md_personality(&raid6_personality); + unregister_md_personality(&raid5_personality); + unregister_md_personality(&raid4_personality); + cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); + destroy_workqueue(raid5_wq); +} + +module_init(raid5_init); +module_exit(raid5_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); +MODULE_ALIAS("md-personality-4"); /* RAID5 */ +MODULE_ALIAS("md-raid5"); +MODULE_ALIAS("md-raid4"); +MODULE_ALIAS("md-level-5"); +MODULE_ALIAS("md-level-4"); +MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-raid6"); +MODULE_ALIAS("md-level-6"); + +/* This used to be two separate modules, they were: */ +MODULE_ALIAS("raid5"); +MODULE_ALIAS("raid6"); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 000000000..e873938a6 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,826 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include +#include + +/* + * + * Each stripe contains one buffer per device. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under the protection of the + * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and + * these are not protected by STRIPE_ACTIVE. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request. + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distinguish these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list, + * write list and written list are protected by the device_lock. + * The device_lock is only for list manipulations and will only be + * held for a very short time. It can be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. + * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * The stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of + * STRIPE_ACTIVE. + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* xor parity check */ + check_state_run_q, /* q-parity check */ + check_state_run_pq, /* pq dual parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +#define DEFAULT_STRIPE_SIZE 4096 +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct llist_node release_list; + struct r5conf *raid_conf; + short generation; /* increments with every + * reshape */ + sector_t sector; /* sector of this row */ + short pd_idx; /* parity disk index */ + short qd_idx; /* 'Q' disk index for raid6 */ + short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + int overwrite_disks; /* total overwrite disks in stripe, + * this is only checked when stripe + * has STRIPE_BATCH_READY + */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + spinlock_t stripe_lock; + int cpu; + struct r5worker_group *group; + + struct stripe_head *batch_head; /* protected by stripe lock */ + spinlock_t batch_lock; /* only header's lock is useful */ + struct list_head batch_list; /* protected by head's batch lock*/ + + union { + struct r5l_io_unit *log_io; + struct ppl_io_unit *ppl_io; + }; + + struct list_head log_list; + sector_t log_start; /* first meta block on the journal */ + struct list_head r5c; /* for r5c_cache->stripe_in_journal */ + + struct page *ppl_page; /* partial parity of this stripe */ + /** + * struct stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + * @target2 - 2nd compute target in the raid6 case + * @zero_sum_result - P and Q verification flags + * @request - async service request flags for raid_run_ops + */ + struct stripe_operations { + int target, target2; + enum sum_check_flags zero_sum_result; + } ops; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + /* These pages will be used by bios in dev[i] */ + struct page **pages; + int nr_pages; /* page array size */ + int stripes_per_page; +#endif + struct r5dev { + /* rreq and rvec are used for the replacement device when + * writing data to both devices. + */ + struct bio req, rreq; + struct bio_vec vec, rvec; + struct page *page, *orig_page; + unsigned int offset; /* offset of the page */ + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + u32 log_checksum; + unsigned short write_hint; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. + */ +struct stripe_head_state { + /* 'syncing' means that we need to read all devices, either + * to check/correct parity, or to reconstruct a missing device. + * 'replacing' means we are replacing one or more drives and + * the source is valid at this point so we don't need to + * read all devices, just the replacement targets. + */ + int syncing, expanding, expanded, replacing; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int injournal, just_cached; + int failed_num[2]; + int p_failed, q_failed; + int dec_preread_active; + unsigned long ops_request; + + struct md_rdev *blocked_rdev; + int handle_bad_blocks; + int log_failed; + int waiting_extra_page; +}; + +/* Flags for struct r5dev.flags */ +enum r5dev_flags { + R5_UPTODATE, /* page contains current data */ + R5_LOCKED, /* IO has been submitted on "req" */ + R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ + R5_OVERWRITE, /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ + R5_Insync, /* rdev && rdev->in_sync at start */ + R5_Wantread, /* want to schedule a read */ + R5_Wantwrite, + R5_Overlap, /* There is a pending overlapping request + * on this block */ + R5_ReadNoMerge, /* prevent bio from merging in block-layer */ + R5_ReadError, /* seen a read error here recently */ + R5_ReWrite, /* have tried to over-write the readerror */ + + R5_Expanded, /* This block now has post-expand data */ + R5_Wantcompute, /* compute_block in progress treat as + * uptodate + */ + R5_Wantfill, /* dev->toread contains a bio that needs + * filling + */ + R5_Wantdrain, /* dev->towrite needs to be drained */ + R5_WantFUA, /* Write should be FUA */ + R5_SyncIO, /* The IO is sync */ + R5_WriteError, /* got a write error - need to record it */ + R5_MadeGood, /* A bad block has been fixed by writing to it */ + R5_ReadRepl, /* Will/did read from replacement rather than orig */ + R5_MadeGoodRepl,/* A bad block on the replacement device has been + * fixed by writing to it */ + R5_NeedReplace, /* This device has a replacement which is not + * up-to-date at this stripe. */ + R5_WantReplace, /* We need to update the replacement, we have read + * data in, and now is a good time to write it out. + */ + R5_Discard, /* Discard the stripe */ + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ + R5_InJournal, /* data being written is in the journal device. + * if R5_InJournal is set for parity pd_idx, all the + * data and parity being written are in the journal + * device + */ + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into + * dev->orig_page for prexor. When this flag is + * set, orig_page contains latest data in the + * raid disk. + */ +}; + +/* + * Stripe state + */ +enum { + STRIPE_ACTIVE, + STRIPE_HANDLE, + STRIPE_SYNC_REQUESTED, + STRIPE_SYNCING, + STRIPE_INSYNC, + STRIPE_REPLACED, + STRIPE_PREREAD_ACTIVE, + STRIPE_DELAYED, + STRIPE_DEGRADED, + STRIPE_BIT_DELAY, + STRIPE_EXPANDING, + STRIPE_EXPAND_SOURCE, + STRIPE_EXPAND_READY, + STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ + STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ + STRIPE_BIOFILL_RUN, + STRIPE_COMPUTE_RUN, + STRIPE_ON_UNPLUG_LIST, + STRIPE_DISCARD, + STRIPE_ON_RELEASE_LIST, + STRIPE_BATCH_READY, + STRIPE_BATCH_ERR, + STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add + * to batch yet. + */ + STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) + * this bit is used in two scenarios: + * + * 1. write-out phase + * set in first entry of r5l_write_stripe + * clear in second entry of r5l_write_stripe + * used to bypass logic in handle_stripe + * + * 2. caching phase + * set in r5c_try_caching_write() + * clear when journal write is done + * used to initiate r5c_cache_data() + * also used to bypass logic in handle_stripe + */ + STRIPE_R5C_CACHING, /* the stripe is in caching phase + * see more detail in the raid5-cache.c + */ + STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_partial_stripe_list) + */ + STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_full_stripe_list) + */ + STRIPE_R5C_PREFLUSH, /* need to flush journal device */ +}; + +#define STRIPE_EXPAND_SYNC_FLAGS \ + ((1 << STRIPE_EXPAND_SOURCE) |\ + (1 << STRIPE_EXPAND_READY) |\ + (1 << STRIPE_EXPANDING) |\ + (1 << STRIPE_SYNC_REQUESTED)) +/* + * Operation request flags + */ +enum { + STRIPE_OP_BIOFILL, + STRIPE_OP_COMPUTE_BLK, + STRIPE_OP_PREXOR, + STRIPE_OP_BIODRAIN, + STRIPE_OP_RECONSTRUCT, + STRIPE_OP_CHECK, + STRIPE_OP_PARTIAL_PARITY, +}; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, + PARITY_PREFER_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, +}; +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leaves nothing locked. + */ + +/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk. + * There are three safe ways to access disk_info.rdev. + * 1/ when holding mddev->reconfig_mutex + * 2/ when resync/recovery/reshape is known to be happening - i.e. in code that + * is called as part of performing resync/recovery/reshape. + * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer + * and if it is non-NULL, increment rdev->nr_pending before dropping the RCU + * lock. + * When .rdev is set to NULL, the nr_pending count checked again and if + * it has been incremented, the pointer is put back in .rdev. + */ + +struct disk_info { + struct md_rdev __rcu *rdev; + struct md_rdev __rcu *replacement; + struct page *extra_page; /* extra page to use in prexor */ +}; + +/* + * Stripe cache + */ + +#define NR_STRIPES 256 + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define STRIPE_SIZE PAGE_SIZE +#define STRIPE_SHIFT (PAGE_SHIFT - 9) +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + +#define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) +#define HASH_MASK (NR_HASH - 1) +#define MAX_STRIPE_BATCH 8 + +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. + * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + +struct r5worker { + struct work_struct work; + struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + bool working; +}; + +struct r5worker_group { + struct list_head handle_list; + struct list_head loprio_list; + struct r5conf *conf; + struct r5worker *workers; + int stripes_cnt; +}; + +/* + * r5c journal modes of the array: write-back or write-through. + * write-through mode has identical behavior as existing log only + * implementation. + */ +enum r5c_journal_mode { + R5C_JOURNAL_MODE_WRITE_THROUGH = 0, + R5C_JOURNAL_MODE_WRITE_BACK = 1, +}; + +enum r5_cache_state { + R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + R5_ALLOC_MORE, /* It might help to allocate another + * stripe. + */ + R5_DID_ALLOC, /* A stripe was allocated, don't allocate + * more until at least one has been + * released. This avoids flooding + * the cache. + */ + R5C_LOG_TIGHT, /* log device space tight, need to + * prioritize stripes at last_checkpoint + */ + R5C_LOG_CRITICAL, /* log device is running out of space, + * only process stripes that are already + * occupying the log + */ + R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page + * for prexor + */ +}; + +#define PENDING_IO_MAX 512 +#define PENDING_IO_ONE_FLUSH 128 +struct r5pending_data { + struct list_head sibling; + sector_t sector; /* stripe sector */ + struct bio_list bios; +}; + +struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + int scribble_obj_size; + local_lock_t lock; +}; + +struct r5conf { + struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; + struct mddev *mddev; + int chunk_sectors; + int level, algorithm, rmw_level; + int max_degraded; + int raid_disks; + int max_nr_stripes; + int min_nr_stripes; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + unsigned long stripe_size; + unsigned int stripe_shift; + unsigned long stripe_sectors; +#endif + + /* reshape_progress is the leading edge of a 'reshape' + * It has value MaxSector when no reshape is happening + * If delta_disks < 0, it is the last sector we started work on, + * else is it the next sector to work on. + */ + sector_t reshape_progress; + /* reshape_safe is the trailing edge of a reshape. We know that + * before (or after) this address, all reshape has completed. + */ + sector_t reshape_safe; + int previous_raid_disks; + int prev_chunk_sectors; + int prev_algo; + short generation; /* increments with every reshape */ + seqcount_spinlock_t gen_lock; /* lock against generation changes */ + unsigned long reshape_checkpoint; /* Time we last updated + * metadata */ + long long min_offset_diff; /* minimum difference between + * data_offset and + * new_data_offset across all + * devices. May be negative, + * but is closest to zero. + */ + + struct list_head handle_list; /* stripes needing handling */ + struct list_head loprio_list; /* low priority stripes */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + unsigned int retry_read_offset; /* sector offset into retry_read_aligned */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + int skip_copy; /* Don't copy data from bio to stripe cache */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][32]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + struct mutex cache_size_mutex; /* Protect changes to cache size */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu __percpu *percpu; + int scribble_disks; + int scribble_sectors; + struct hlist_node node; + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + + atomic_t r5c_cached_full_stripes; + struct list_head r5c_full_stripe_list; + atomic_t r5c_cached_partial_stripes; + struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; + + atomic_t empty_inactive_list_nr; + struct llist_head released_stripes; + wait_queue_head_t wait_for_quiescent; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + unsigned long cache_state; + struct shrinker shrinker; + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; + struct bio_set bio_split; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + struct r5worker_group *worker_groups; + int group_cnt; + int worker_cnt_per_group; + struct r5l_log *log; + void *log_private; + + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; + struct r5pending_data *pending_data; + struct list_head free_list; + struct list_head pending_list; + int pending_data_cnt; + struct r5pending_data *next_pending_data; +}; + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#else +#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size) +#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift) +#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors) +#endif + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) +{ + if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf)) + return bio->bi_next; + else + return NULL; +} + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ +#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ +#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ +#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +static inline int algorithm_valid_raid5(int layout) +{ + return (layout >= 0) && + (layout <= 5); +} +static inline int algorithm_valid_raid6(int layout) +{ + return (layout >= 0 && layout <= 5) + || + (layout >= 8 && layout <= 10) + || + (layout >= 16 && layout <= 20); +} + +static inline int algorithm_is_DDF(int layout) +{ + return layout >= 8 && layout <= 10; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +/* + * Return offset of the corresponding page for r5dev. + */ +static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx) +{ + return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf); +} + +/* + * Return corresponding page address for r5dev. + */ +static inline struct page * +raid5_get_dev_page(struct stripe_head *sh, int disk_idx) +{ + return sh->pages[disk_idx / sh->stripes_per_page]; +} +#endif + +void md_raid5_kick_device(struct r5conf *conf); +int raid5_set_cache_size(struct mddev *mddev, int size); +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +void raid5_release_stripe(struct stripe_head *sh); +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, struct stripe_head *sh); + +struct stripe_request_ctx; +/* get stripe from previous generation (when reshaping) */ +#define R5_GAS_PREVIOUS (1 << 0) +/* do not block waiting for a free stripe */ +#define R5_GAS_NOBLOCK (1 << 1) +/* do not block waiting for quiesce to be released */ +#define R5_GAS_NOQUIESCE (1 << 2) +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + unsigned int flags); + +int raid5_calc_degraded(struct r5conf *conf); +int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); +#endif -- cgit v1.2.3