summary refs log tree commit diff stats
path: root/drivers/md
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-18 18:50:12 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-18 18:50:12 +0000
commit 8665bd53f2f2e27e5511d90428cb3f60e6d0ce15 (patch)
tree 8d58900dc0ebd4a3011f92c128d2fe45bc7c4bf2 /drivers/md
parent Adding debian version 6.7.12-1. (diff)
download linux-8665bd53f2f2e27e5511d90428cb3f60e6d0ce15.tar.xz
linux-8665bd53f2f2e27e5511d90428cb3f60e6d0ce15.zip
Merging upstream version 6.8.9.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 34
-rw-r--r-- drivers/md/Makefile 10
-rw-r--r-- drivers/md/bcache/super.c 1
-rw-r--r-- drivers/md/dm-bufio.c 2
-rw-r--r-- drivers/md/dm-crypt.c 4
-rw-r--r-- drivers/md/dm-flakey.c 2
-rw-r--r-- drivers/md/dm-integrity.c 4
-rw-r--r-- drivers/md/dm-kcopyd.c 2
-rw-r--r-- drivers/md/dm-stats.c 9
-rw-r--r-- drivers/md/dm-table.c 45
-rw-r--r-- drivers/md/dm-verity-target.c 2
-rw-r--r-- drivers/md/dm-writecache.c 8
-rw-r--r-- drivers/md/dm-zoned-metadata.c 7
-rw-r--r-- drivers/md/dm-zoned-target.c 4
-rw-r--r-- drivers/md/dm.c 4
-rw-r--r-- drivers/md/md-autodetect.c 8
-rw-r--r-- drivers/md/md-faulty.c 365
-rw-r--r-- drivers/md/md-linear.c 318
-rw-r--r-- drivers/md/md-multipath.c 462
-rw-r--r-- drivers/md/md.c 243
-rw-r--r-- drivers/md/raid1-10.c 54
-rw-r--r-- drivers/md/raid1.c 22
-rw-r--r-- drivers/md/raid10.c 262
-rw-r--r-- drivers/md/raid5-cache.c 11
-rw-r--r-- drivers/md/raid5-ppl.c 16
-rw-r--r-- drivers/md/raid5.c 182
-rw-r--r-- drivers/md/raid5.h 4
27 files changed, 359 insertions, 1726 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3ff87cb4dc..a743e2c572 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -61,19 +61,6 @@ config MD_BITMAP_FILE
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
-config MD_LINEAR
- tristate "Linear (append) mode (deprecated)"
- depends on BLK_DEV_MD
- help
- If you say Y here, then your multiple devices driver will be able to
- use the so-called linear mode, i.e. it will combine the hard disk
- partitions by simply appending one to the other.
-
- To compile this as a module, choose M here: the module
- will be called linear.
-
- If unsure, say Y.
-
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
@@ -172,27 +159,6 @@ config MD_RAID456
If unsure, say Y.
-config MD_MULTIPATH
- tristate "Multipath I/O support (deprecated)"
- depends on BLK_DEV_MD
- help
- MD_MULTIPATH provides a simple multi-path personality for use
- the MD framework. It is not under active development. New
- projects should consider using DM_MULTIPATH which has more
- features and more testing.
-
- If unsure, say N.
-
-config MD_FAULTY
- tristate "Faulty test module for MD (deprecated)"
- depends on BLK_DEV_MD
- help
- The "faulty" module allows for a block device that occasionally returns
- read or write errors. It is useful for testing.
-
- In unsure, say N.
-
-
config MD_CLUSTER
tristate "Cluster Support for MD"
depends on BLK_DEV_MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 84291e38dc..027d7cfeca 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,22 +29,16 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
-linear-y += md-linear.o
-multipath-y += md-multipath.o
-faulty-y += md-faulty.o
# Note: link order is important. All raid personalities
-# and must come before md.o, as they each initialise
-# themselves, and md.o may use the personalities when it
+# and must come before md.o, as they each initialise
+# themselves, and md.o may use the personalities when it
# auto-initialised.
-obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
obj-$(CONFIG_MD_RAID456) += raid456.o
-obj-$(CONFIG_MD_MULTIPATH) += multipath.o
-obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1402096b80..dc3f50f697 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -954,7 +954,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_VECS;
blk_queue_max_discard_sectors(q, UINT_MAX);
- q->limits.discard_granularity = 512;
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 4f2808ef38..f5541b8f63 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1170,7 +1170,7 @@ static void __cache_size_refresh(void)
* If the allocation may fail we use __get_free_pages. Memory fragmentation
* won't have a fatal effect here, but it just causes flushes of some other
* buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order > MAX_ORDER).
+ * always fails (i.e. order > MAX_PAGE_ORDER).
*
* If the allocation shouldn't fail we use __vmalloc. This is only for the
* initial reserve allocation, so there's no risk of wasting all vmalloc
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 35f5019395..59445763e5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1678,7 +1678,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
unsigned int remaining_size;
- unsigned int order = MAX_ORDER;
+ unsigned int order = MAX_PAGE_ORDER;
retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3702,7 +3702,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 24, 0},
+ .version = {1, 25, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index f57fb82152..7916ed9f10 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
remaining_size = size;
- order = MAX_ORDER;
+ order = MAX_PAGE_ORDER;
while (remaining_size) {
struct page *pages;
unsigned size_to_add, to_copy;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 2cc30b9ab2..3b4218a2e7 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4221,7 +4221,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
} else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
} else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
- if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+ if ((uint64_t)val >= (uint64_t)UINT_MAX * 1000 / HZ) {
r = -EINVAL;
ti->error = "Invalid bitmap_flush_interval argument";
goto bad;
@@ -4742,7 +4742,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 79c65c9ad5..6ea75436a4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -807,7 +807,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
*/
if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
for (i = 0; i < job->num_dests; i++) {
- if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
+ if (bdev_is_zoned(dests[i].bdev)) {
job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
break;
}
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index bdc14ec998..1e5d988f44 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -66,6 +66,9 @@ struct dm_stats_last_position {
unsigned int last_rw;
};
+#define DM_STAT_MAX_ENTRIES 8388608
+#define DM_STAT_MAX_HISTOGRAM_ENTRIES 134217728
+
/*
* A typo on the command line could possibly make the kernel run out of memory
* and crash. To prevent the crash we account all used memory. We fail if we
@@ -285,6 +288,9 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
return -EOVERFLOW;
+ if (n_entries > DM_STAT_MAX_ENTRIES)
+ return -EOVERFLOW;
+
shared_alloc_size = struct_size(s, stat_shared, n_entries);
if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
return -EOVERFLOW;
@@ -297,6 +303,9 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
return -EOVERFLOW;
+ if ((n_histogram_entries + 1) * (size_t)n_entries > DM_STAT_MAX_HISTOGRAM_ENTRIES)
+ return -EOVERFLOW;
+
if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
return -ENOMEM;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 08c9c20f9c..41f1d731ae 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1584,21 +1584,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t)
return true;
}
-static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- enum blk_zoned_model *zoned_model = data;
+ bool *zoned = data;
- return blk_queue_zoned_model(q) != *zoned_model;
+ return bdev_is_zoned(dev->bdev) != *zoned;
}
static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return blk_queue_zoned_model(q) != BLK_ZONED_NONE;
+ return bdev_is_zoned(dev->bdev);
}
/*
@@ -1608,8 +1605,7 @@ static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
* has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
* zoned model with all zoned devices having the same zone size.
*/
-static bool dm_table_supports_zoned_model(struct dm_table *t,
- enum blk_zoned_model zoned_model)
+static bool dm_table_supports_zoned(struct dm_table *t, bool zoned)
{
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
@@ -1628,11 +1624,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
if (dm_target_supports_zoned_hm(ti->type)) {
if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, device_not_zoned_model,
- &zoned_model))
+ ti->type->iterate_devices(ti, device_not_zoned,
+ &zoned))
return false;
} else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
- if (zoned_model == BLK_ZONED_HM)
+ if (zoned)
return false;
}
}
@@ -1655,14 +1651,13 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
* zone sectors, if the destination device is a zoned block device, it shall
* have the specified zone_sectors.
*/
-static int validate_hardware_zoned_model(struct dm_table *t,
- enum blk_zoned_model zoned_model,
- unsigned int zone_sectors)
+static int validate_hardware_zoned(struct dm_table *t, bool zoned,
+ unsigned int zone_sectors)
{
- if (zoned_model == BLK_ZONED_NONE)
+ if (!zoned)
return 0;
- if (!dm_table_supports_zoned_model(t, zoned_model)) {
+ if (!dm_table_supports_zoned(t, zoned)) {
DMERR("%s: zoned model is not consistent across all devices",
dm_device_name(t->md));
return -EINVAL;
@@ -1688,8 +1683,8 @@ int dm_calculate_queue_limits(struct dm_table *t,
struct queue_limits *limits)
{
struct queue_limits ti_limits;
- enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
unsigned int zone_sectors = 0;
+ bool zoned = false;
blk_set_stacking_limits(limits);
@@ -1711,12 +1706,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
ti->type->iterate_devices(ti, dm_set_device_limits,
&ti_limits);
- if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+ if (!zoned && ti_limits.zoned) {
/*
* After stacking all limits, validate all devices
* in table support this zoned model and zone sectors.
*/
- zoned_model = ti_limits.zoned;
+ zoned = ti_limits.zoned;
zone_sectors = ti_limits.chunk_sectors;
}
@@ -1749,18 +1744,18 @@ combine_limits:
* Verify that the zoned model and zone sectors, as determined before
* any .io_hints override, are the same across all devices in the table.
* - this is especially relevant if .io_hints is emulating a disk-managed
- * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
+ * zoned model on host-managed zoned block devices.
* BUT...
*/
- if (limits->zoned != BLK_ZONED_NONE) {
+ if (limits->zoned) {
/*
* ...IF the above limits stacking determined a zoned model
* validate that all of the table's devices conform to it.
*/
- zoned_model = limits->zoned;
+ zoned = limits->zoned;
zone_sectors = limits->chunk_sectors;
}
- if (validate_hardware_zoned_model(t, zoned_model, zone_sectors))
+ if (validate_hardware_zoned(t, zoned, zone_sectors))
return -EINVAL;
return validate_hardware_logical_block_alignment(t, limits);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 49e4a35d70..abc008bae9 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1560,7 +1560,7 @@ int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned i
static struct target_type verity_target = {
.name = "verity",
.features = DM_TARGET_IMMUTABLE,
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 6a4279bfb1..01ab141bc5 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -299,7 +299,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
long i;
wc->memory_map = NULL;
- pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
+ pages = vmalloc_array(p, sizeof(struct page *));
if (!pages) {
r = -ENOMEM;
goto err2;
@@ -330,7 +330,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
r = -ENOMEM;
goto err3;
}
- kvfree(pages);
+ vfree(pages);
wc->memory_vmapped = true;
}
@@ -341,7 +341,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
return 0;
err3:
- kvfree(pages);
+ vfree(pages);
err2:
dax_read_unlock(id);
err1:
@@ -962,7 +962,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
if (wc->entries)
return 0;
- wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
+ wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
if (!wc->entries)
return -ENOMEM;
for (b = 0; b < wc->n_blocks; b++) {
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 60a4dc01ea..fdfe30f7b6 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -2836,12 +2836,11 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num)
{
struct dmz_dev *dev = &zmd->dev[num];
- if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
+ if (!bdev_is_zoned(dev->bdev))
dmz_dev_info(dev, "Regular block device");
else
- dmz_dev_info(dev, "Host-%s zoned block device",
- bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
- "aware" : "managed");
+ dmz_dev_info(dev, "Host-managed zoned block device");
+
if (zmd->sb_version > 1) {
sector_t sector_offset =
dev->zone_offset << zmd->zone_nr_sectors_shift;
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index b487f7acc8..621794a9ed 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -702,7 +702,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path,
}
bdev = ddev->bdev;
- if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
+ if (!bdev_is_zoned(bdev)) {
if (nr_devs == 1) {
ti->error = "Invalid regular device";
goto err;
@@ -1010,7 +1010,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->max_sectors = chunk_sectors;
/* We are exposing a drive-managed zoned block device */
- limits->zoned = BLK_ZONED_NONE;
+ limits->zoned = false;
}
/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4ff9bebb81..0dc3650c7f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2675,7 +2675,7 @@ static int lock_fs(struct mapped_device *md)
WARN_ON(test_bit(DMF_FROZEN, &md->flags));
- r = freeze_bdev(md->disk->part0);
+ r = bdev_freeze(md->disk->part0);
if (!r)
set_bit(DMF_FROZEN, &md->flags);
return r;
@@ -2685,7 +2685,7 @@ static void unlock_fs(struct mapped_device *md)
{
if (!test_bit(DMF_FROZEN, &md->flags))
return;
- thaw_bdev(md->disk->part0);
+ bdev_thaw(md->disk->part0);
clear_bit(DMF_FROZEN, &md->flags);
}
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 4b80165afd..b2a00f213c 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -49,7 +49,6 @@ static int md_setup_ents __initdata;
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
- * md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
@@ -88,7 +87,7 @@ static int __init md_setup(char *str)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
- if (level == 0 || level == LEVEL_LINEAR) {
+ if (level == 0) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
@@ -96,10 +95,7 @@ static int __init md_setup(char *str)
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
- if (level == LEVEL_LINEAR)
- pername = "linear";
- else
- pername = "raid0";
+ pername = "raid0";
break;
}
fallthrough;
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
deleted file mode 100644
index a039e8e20f..0000000000
--- a/drivers/md/md-faulty.c
+++ /dev/null
@@ -1,365 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * faulty.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 2004 Neil Brown
- *
- * fautly-device-simulator personality for md
- */
-
-
-/*
- * The "faulty" personality causes some requests to fail.
- *
- * Possible failure modes are:
- * reads fail "randomly" but succeed on retry
- * writes fail "randomly" but succeed on retry
- * reads for some address fail and then persist until a write
- * reads for some address fail and then persist irrespective of write
- * writes for some address fail and persist
- * all writes fail
- *
- * Different modes can be active at a time, but only
- * one can be set at array creation. Others can be added later.
- * A mode can be one-shot or recurrent with the recurrence being
- * once in every N requests.
- * The bottom 5 bits of the "layout" indicate the mode. The
- * remainder indicate a period, or 0 for one-shot.
- *
- * There is an implementation limit on the number of concurrently
- * persisting-faulty blocks. When a new fault is requested that would
- * exceed the limit, it is ignored.
- * All current faults can be clear using a layout of "0".
- *
- * Requests are always sent to the device. If they are to fail,
- * we clone the bio and insert a new b_end_io into the chain.
- */
-
-#define WriteTransient 0
-#define ReadTransient 1
-#define WritePersistent 2
-#define ReadPersistent 3
-#define WriteAll 4 /* doesn't go to device */
-#define ReadFixable 5
-#define Modes 6
-
-#define ClearErrors 31
-#define ClearFaults 30
-
-#define AllPersist 100 /* internal use only */
-#define NoPersist 101
-
-#define ModeMask 0x1f
-#define ModeShift 5
-
-#define MaxFault 50
-#include <linux/blkdev.h>
-#include <linux/module.h>
-#include <linux/raid/md_u.h>
-#include <linux/slab.h>
-#include "md.h"
-#include <linux/seq_file.h>
-
-
-static void faulty_fail(struct bio *bio)
-{
- struct bio *b = bio->bi_private;
-
- b->bi_iter.bi_size = bio->bi_iter.bi_size;
- b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
-
- bio_put(bio);
-
- bio_io_error(b);
-}
-
-struct faulty_conf {
- int period[Modes];
- atomic_t counters[Modes];
- sector_t faults[MaxFault];
- int modes[MaxFault];
- int nfaults;
- struct md_rdev *rdev;
-};
-
-static int check_mode(struct faulty_conf *conf, int mode)
-{
- if (conf->period[mode] == 0 &&
- atomic_read(&conf->counters[mode]) <= 0)
- return 0; /* no failure, no decrement */
-
-
- if (atomic_dec_and_test(&conf->counters[mode])) {
- if (conf->period[mode])
- atomic_set(&conf->counters[mode], conf->period[mode]);
- return 1;
- }
- return 0;
-}
-
-static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir)
-{
- /* If we find a ReadFixable sector, we fix it ... */
- int i;
- for (i=0; i<conf->nfaults; i++)
- if (conf->faults[i] >= start &&
- conf->faults[i] < end) {
- /* found it ... */
- switch (conf->modes[i] * 2 + dir) {
- case WritePersistent*2+WRITE: return 1;
- case ReadPersistent*2+READ: return 1;
- case ReadFixable*2+READ: return 1;
- case ReadFixable*2+WRITE:
- conf->modes[i] = NoPersist;
- return 0;
- case AllPersist*2+READ:
- case AllPersist*2+WRITE: return 1;
- default:
- return 0;
- }
- }
- return 0;
-}
-
-static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
-{
- int i;
- int n = conf->nfaults;
- for (i=0; i<conf->nfaults; i++)
- if (conf->faults[i] == start) {
- switch(mode) {
- case NoPersist: conf->modes[i] = mode; return;
- case WritePersistent:
- if (conf->modes[i] == ReadPersistent ||
- conf->modes[i] == ReadFixable)
- conf->modes[i] = AllPersist;
- else
- conf->modes[i] = WritePersistent;
- return;
- case ReadPersistent:
- if (conf->modes[i] == WritePersistent)
- conf->modes[i] = AllPersist;
- else
- conf->modes[i] = ReadPersistent;
- return;
- case ReadFixable:
- if (conf->modes[i] == WritePersistent ||
- conf->modes[i] == ReadPersistent)
- conf->modes[i] = AllPersist;
- else
- conf->modes[i] = ReadFixable;
- return;
- }
- } else if (conf->modes[i] == NoPersist)
- n = i;
-
- if (n >= MaxFault)
- return;
- conf->faults[n] = start;
- conf->modes[n] = mode;
- if (conf->nfaults == n)
- conf->nfaults = n+1;
-}
-
-static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
-{
- struct faulty_conf *conf = mddev->private;
- int failit = 0;
-
- if (bio_data_dir(bio) == WRITE) {
- /* write request */
- if (atomic_read(&conf->counters[WriteAll])) {
- /* special case - don't decrement, don't submit_bio_noacct,
- * just fail immediately
- */
- bio_io_error(bio);
- return true;
- }
-
- if (check_sector(conf, bio->bi_iter.bi_sector,
- bio_end_sector(bio), WRITE))
- failit = 1;
- if (check_mode(conf, WritePersistent)) {
- add_sector(conf, bio->bi_iter.bi_sector,
- WritePersistent);
- failit = 1;
- }
- if (check_mode(conf, WriteTransient))
- failit = 1;
- } else {
- /* read request */
- if (check_sector(conf, bio->bi_iter.bi_sector,
- bio_end_sector(bio), READ))
- failit = 1;
- if (check_mode(conf, ReadTransient))
- failit = 1;
- if (check_mode(conf, ReadPersistent)) {
- add_sector(conf, bio->bi_iter.bi_sector,
- ReadPersistent);
- failit = 1;
- }
- if (check_mode(conf, ReadFixable)) {
- add_sector(conf, bio->bi_iter.bi_sector,
- ReadFixable);
- failit = 1;
- }
- }
-
- md_account_bio(mddev, &bio);
- if (failit) {
- struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
- &mddev->bio_set);
-
- b->bi_private = bio;
- b->bi_end_io = faulty_fail;
- bio = b;
- } else
- bio_set_dev(bio, conf->rdev->bdev);
-
- submit_bio_noacct(bio);
- return true;
-}
-
-static void faulty_status(struct seq_file *seq, struct mddev *mddev)
-{
- struct faulty_conf *conf = mddev->private;
- int n;
-
- if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
- seq_printf(seq, " WriteTransient=%d(%d)",
- n, conf->period[WriteTransient]);
-
- if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
- seq_printf(seq, " ReadTransient=%d(%d)",
- n, conf->period[ReadTransient]);
-
- if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
- seq_printf(seq, " WritePersistent=%d(%d)",
- n, conf->period[WritePersistent]);
-
- if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
- seq_printf(seq, " ReadPersistent=%d(%d)",
- n, conf->period[ReadPersistent]);
-
-
- if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
- seq_printf(seq, " ReadFixable=%d(%d)",
- n, conf->period[ReadFixable]);
-
- if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
- seq_printf(seq, " WriteAll");
-
- seq_printf(seq, " nfaults=%d", conf->nfaults);
-}
-
-
-static int faulty_reshape(struct mddev *mddev)
-{
- int mode = mddev->new_layout & ModeMask;
- int count = mddev->new_layout >> ModeShift;
- struct faulty_conf *conf = mddev->private;
-
- if (mddev->new_layout < 0)
- return 0;
-
- /* new layout */
- if (mode == ClearFaults)
- conf->nfaults = 0;
- else if (mode == ClearErrors) {
- int i;
- for (i=0 ; i < Modes ; i++) {
- conf->period[i] = 0;
- atomic_set(&conf->counters[i], 0);
- }
- } else if (mode < Modes) {
- conf->period[mode] = count;
- if (!count) count++;
- atomic_set(&conf->counters[mode], count);
- } else
- return -EINVAL;
- mddev->new_layout = -1;
- mddev->layout = -1; /* makes sure further changes come through */
- return 0;
-}
-
-static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks)
-{
- WARN_ONCE(raid_disks,
- "%s does not support generic reshape\n", __func__);
-
- if (sectors == 0)
- return mddev->dev_sectors;
-
- return sectors;
-}
-
-static int faulty_run(struct mddev *mddev)
-{
- struct md_rdev *rdev;
- int i;
- struct faulty_conf *conf;
-
- if (md_check_no_bitmap(mddev))
- return -EINVAL;
-
- conf = kmalloc(sizeof(*conf), GFP_KERNEL);
- if (!conf)
- return -ENOMEM;
-
- for (i=0; i<Modes; i++) {
- atomic_set(&conf->counters[i], 0);
- conf->period[i] = 0;
- }
- conf->nfaults = 0;
-
- rdev_for_each(rdev, mddev) {
- conf->rdev = rdev;
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- }
-
- md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
- mddev->private = conf;
-
- faulty_reshape(mddev);
-
- return 0;
-}
-
-static void faulty_free(struct mddev *mddev, void *priv)
-{
- struct faulty_conf *conf = priv;
-
- kfree(conf);
-}
-
-static struct md_personality faulty_personality =
-{
- .name = "faulty",
- .level = LEVEL_FAULTY,
- .owner = THIS_MODULE,
- .make_request = faulty_make_request,
- .run = faulty_run,
- .free = faulty_free,
- .status = faulty_status,
- .check_reshape = faulty_reshape,
- .size = faulty_size,
-};
-
-static int __init raid_init(void)
-{
- return register_md_personality(&faulty_personality);
-}
-
-static void raid_exit(void)
-{
- unregister_md_personality(&faulty_personality);
-}
-
-module_init(raid_init);
-module_exit(raid_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)");
-MODULE_ALIAS("md-personality-10"); /* faulty */
-MODULE_ALIAS("md-faulty");
-MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
deleted file mode 100644
index 8eca7693b7..0000000000
--- a/drivers/md/md-linear.c
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- linear.c : Multiple Devices driver for Linux
- Copyright (C) 1994-96 Marc ZYNGIER
- <zyngier@ufr-info-p7.ibp.fr> or
- <maz@gloups.fdn.fr>
-
- Linear mode management functions.
-
-*/
-
-#include <linux/blkdev.h>
-#include <linux/raid/md_u.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <trace/events/block.h>
-#include "md.h"
-#include "md-linear.h"
-
-/*
- * find which device holds a particular offset
- */
-static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
-{
- int lo, mid, hi;
- struct linear_conf *conf;
-
- lo = 0;
- hi = mddev->raid_disks - 1;
- conf = mddev->private;
-
- /*
- * Binary Search
- */
-
- while (hi > lo) {
-
- mid = (hi + lo) / 2;
- if (sector < conf->disks[mid].end_sector)
- hi = mid;
- else
- lo = mid + 1;
- }
-
- return conf->disks + lo;
-}
-
-static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
-{
- struct linear_conf *conf;
- sector_t array_sectors;
-
- conf = mddev->private;
- WARN_ONCE(sectors || raid_disks,
- "%s does not support generic reshape\n", __func__);
- array_sectors = conf->array_sectors;
-
- return array_sectors;
-}
-
-static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
-{
- struct linear_conf *conf;
- struct md_rdev *rdev;
- int i, cnt;
-
- conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
- if (!conf)
- return NULL;
-
- /*
- * conf->raid_disks is copy of mddev->raid_disks. The reason to
- * keep a copy of mddev->raid_disks in struct linear_conf is,
- * mddev->raid_disks may not be consistent with pointers number of
- * conf->disks[] when it is updated in linear_add() and used to
- * iterate old conf->disks[] earray in linear_congested().
- * Here conf->raid_disks is always consitent with number of
- * pointers in conf->disks[] array, and mddev->private is updated
- * with rcu_assign_pointer() in linear_addr(), such race can be
- * avoided.
- */
- conf->raid_disks = raid_disks;
-
- cnt = 0;
- conf->array_sectors = 0;
-
- rdev_for_each(rdev, mddev) {
- int j = rdev->raid_disk;
- struct dev_info *disk = conf->disks + j;
- sector_t sectors;
-
- if (j < 0 || j >= raid_disks || disk->rdev) {
- pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
- mdname(mddev));
- goto out;
- }
-
- disk->rdev = rdev;
- if (mddev->chunk_sectors) {
- sectors = rdev->sectors;
- sector_div(sectors, mddev->chunk_sectors);
- rdev->sectors = sectors * mddev->chunk_sectors;
- }
-
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
-
- conf->array_sectors += rdev->sectors;
- cnt++;
- }
- if (cnt != raid_disks) {
- pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
- mdname(mddev));
- goto out;
- }
-
- /*
- * Here we calculate the device offsets.
- */
- conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
-
- for (i = 1; i < raid_disks; i++)
- conf->disks[i].end_sector =
- conf->disks[i-1].end_sector +
- conf->disks[i].rdev->sectors;
-
- return conf;
-
-out:
- kfree(conf);
- return NULL;
-}
-
-static int linear_run (struct mddev *mddev)
-{
- struct linear_conf *conf;
- int ret;
-
- if (md_check_no_bitmap(mddev))
- return -EINVAL;
- conf = linear_conf(mddev, mddev->raid_disks);
-
- if (!conf)
- return 1;
- mddev->private = conf;
- md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
-
- ret = md_integrity_register(mddev);
- if (ret) {
- kfree(conf);
- mddev->private = NULL;
- }
- return ret;
-}
-
-static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
-{
- /* Adding a drive to a linear array allows the array to grow.
- * It is permitted if the new drive has a matching superblock
- * already on it, with raid_disk equal to raid_disks.
- * It is achieved by creating a new linear_private_data structure
- * and swapping it in in-place of the current one.
- * The current one is never freed until the array is stopped.
- * This avoids races.
- */
- struct linear_conf *newconf, *oldconf;
-
- if (rdev->saved_raid_disk != mddev->raid_disks)
- return -EINVAL;
-
- rdev->raid_disk = rdev->saved_raid_disk;
- rdev->saved_raid_disk = -1;
-
- newconf = linear_conf(mddev,mddev->raid_disks+1);
-
- if (!newconf)
- return -ENOMEM;
-
- /* newconf->raid_disks already keeps a copy of * the increased
- * value of mddev->raid_disks, WARN_ONCE() is just used to make
- * sure of this. It is possible that oldconf is still referenced
- * in linear_congested(), therefore kfree_rcu() is used to free
- * oldconf until no one uses it anymore.
- */
- oldconf = rcu_dereference_protected(mddev->private,
- lockdep_is_held(&mddev->reconfig_mutex));
- mddev->raid_disks++;
- WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
- "copied raid_disks doesn't match mddev->raid_disks");
- rcu_assign_pointer(mddev->private, newconf);
- md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
- set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
- kfree_rcu(oldconf, rcu);
- return 0;
-}
-
-static void linear_free(struct mddev *mddev, void *priv)
-{
- struct linear_conf *conf = priv;
-
- kfree(conf);
-}
-
-static bool linear_make_request(struct mddev *mddev, struct bio *bio)
-{
- struct dev_info *tmp_dev;
- sector_t start_sector, end_sector, data_offset;
- sector_t bio_sector = bio->bi_iter.bi_sector;
-
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)
- && md_flush_request(mddev, bio))
- return true;
-
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
- md_error(mddev, tmp_dev->rdev);
- bio_io_error(bio);
- return true;
- }
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to split it */
- struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, &mddev->bio_set);
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
- }
-
- md_account_bio(mddev, &bio);
- bio_set_dev(bio, tmp_dev->rdev->bdev);
- bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev))) {
- /* Just ignore it */
- bio_endio(bio);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_write_zeroes(mddev, bio);
- submit_bio_noacct(bio);
- }
- return true;
-
-out_of_bounds:
- pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
- mdname(mddev),
- (unsigned long long)bio->bi_iter.bi_sector,
- tmp_dev->rdev->bdev,
- (unsigned long long)tmp_dev->rdev->sectors,
- (unsigned long long)start_sector);
- bio_io_error(bio);
- return true;
-}
-
-static void linear_status (struct seq_file *seq, struct mddev *mddev)
-{
- seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
-}
-
-static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
-{
- if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
- char *md_name = mdname(mddev);
-
- pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
- md_name, rdev->bdev);
- }
-}
-
-static void linear_quiesce(struct mddev *mddev, int state)
-{
-}
-
-static struct md_personality linear_personality =
-{
- .name = "linear",
- .level = LEVEL_LINEAR,
- .owner = THIS_MODULE,
- .make_request = linear_make_request,
- .run = linear_run,
- .free = linear_free,
- .status = linear_status,
- .hot_add_disk = linear_add,
- .size = linear_size,
- .quiesce = linear_quiesce,
- .error_handler = linear_error,
-};
-
-static int __init linear_init (void)
-{
- return register_md_personality (&linear_personality);
-}
-
-static void linear_exit (void)
-{
- unregister_md_personality (&linear_personality);
-}
-
-module_init(linear_init);
-module_exit(linear_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
-MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
-MODULE_ALIAS("md-linear");
-MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
deleted file mode 100644
index aa77133f31..0000000000
--- a/drivers/md/md-multipath.c
+++ /dev/null
@@ -1,462 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * multipath.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
- *
- * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
- *
- * MULTIPATH management functions.
- *
- * derived from raid1.c.
- */
-
-#include <linux/blkdev.h>
-#include <linux/module.h>
-#include <linux/raid/md_u.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include "md.h"
-#include "md-multipath.h"
-
-#define MAX_WORK_PER_DISK 128
-
-#define NR_RESERVED_BUFS 32
-
-static int multipath_map (struct mpconf *conf)
-{
- int i, disks = conf->raid_disks;
-
- /*
- * Later we do read balancing on the read side
- * now we use the first available disk.
- */
-
- rcu_read_lock();
- for (i = 0; i < disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
- if (rdev && test_bit(In_sync, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- return i;
- }
- }
- rcu_read_unlock();
-
- pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
- return (-1);
-}
-
-static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
-{
- unsigned long flags;
- struct mddev *mddev = mp_bh->mddev;
- struct mpconf *conf = mddev->private;
-
- spin_lock_irqsave(&conf->device_lock, flags);
- list_add(&mp_bh->retry_list, &conf->retry_list);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- md_wakeup_thread(mddev->thread);
-}
-
-/*
- * multipath_end_bh_io() is called when we have finished servicing a multipathed
- * operation and are ready to return a success/failure code to the buffer
- * cache layer.
- */
-static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
-{
- struct bio *bio = mp_bh->master_bio;
- struct mpconf *conf = mp_bh->mddev->private;
-
- bio->bi_status = status;
- bio_endio(bio);
- mempool_free(mp_bh, &conf->pool);
-}
-
-static void multipath_end_request(struct bio *bio)
-{
- struct multipath_bh *mp_bh = bio->bi_private;
- struct mpconf *conf = mp_bh->mddev->private;
- struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
-
- if (!bio->bi_status)
- multipath_end_bh_io(mp_bh, 0);
- else if (!(bio->bi_opf & REQ_RAHEAD)) {
- /*
- * oops, IO error:
- */
- md_error (mp_bh->mddev, rdev);
- pr_info("multipath: %pg: rescheduling sector %llu\n",
- rdev->bdev,
- (unsigned long long)bio->bi_iter.bi_sector);
- multipath_reschedule_retry(mp_bh);
- } else
- multipath_end_bh_io(mp_bh, bio->bi_status);
- rdev_dec_pending(rdev, conf->mddev);
-}
-
-static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
-{
- struct mpconf *conf = mddev->private;
- struct multipath_bh * mp_bh;
- struct multipath_info *multipath;
-
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)
- && md_flush_request(mddev, bio))
- return true;
-
- md_account_bio(mddev, &bio);
- mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
-
- mp_bh->master_bio = bio;
- mp_bh->mddev = mddev;
-
- mp_bh->path = multipath_map(conf);
- if (mp_bh->path < 0) {
- bio_io_error(bio);
- mempool_free(mp_bh, &conf->pool);
- return true;
- }
- multipath = conf->multipaths + mp_bh->path;
-
- bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO);
-
- mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
- mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
- mp_bh->bio.bi_end_io = multipath_end_request;
- mp_bh->bio.bi_private = mp_bh;
- mddev_check_write_zeroes(mddev, &mp_bh->bio);
- submit_bio_noacct(&mp_bh->bio);
- return true;
-}
-
-static void multipath_status(struct seq_file *seq, struct mddev *mddev)
-{
- struct mpconf *conf = mddev->private;
- int i;
-
- seq_printf (seq, " [%d/%d] [", conf->raid_disks,
- conf->raid_disks - mddev->degraded);
- rcu_read_lock();
- for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
- seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
- }
- rcu_read_unlock();
- seq_putc(seq, ']');
-}
-
-/*
- * Careful, this can execute in IRQ contexts as well!
- */
-static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
-{
- struct mpconf *conf = mddev->private;
-
- if (conf->raid_disks - mddev->degraded <= 1) {
- /*
- * Uh oh, we can do nothing if this is our last path, but
- * first check if this is a queued request for a device
- * which has just failed.
- */
- pr_warn("multipath: only one IO path left and IO error.\n");
- /* leave it active... it's all we have */
- return;
- }
- /*
- * Mark disk as unusable
- */
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- }
- set_bit(Faulty, &rdev->flags);
- set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- pr_err("multipath: IO failure on %pg, disabling IO path.\n"
- "multipath: Operation continuing on %d IO paths.\n",
- rdev->bdev,
- conf->raid_disks - mddev->degraded);
-}
-
-static void print_multipath_conf (struct mpconf *conf)
-{
- int i;
- struct multipath_info *tmp;
-
- pr_debug("MULTIPATH conf printout:\n");
- if (!conf) {
- pr_debug("(conf==NULL)\n");
- return;
- }
- pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
- conf->raid_disks);
-
- for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->multipaths + i;
- if (tmp->rdev)
- pr_debug(" disk%d, o:%d, dev:%pg\n",
- i,!test_bit(Faulty, &tmp->rdev->flags),
- tmp->rdev->bdev);
- }
-}
-
-static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
-{
- struct mpconf *conf = mddev->private;
- int err = -EEXIST;
- int path;
- struct multipath_info *p;
- int first = 0;
- int last = mddev->raid_disks - 1;
-
- if (rdev->raid_disk >= 0)
- first = last = rdev->raid_disk;
-
- print_multipath_conf(conf);
-
- for (path = first; path <= last; path++)
- if ((p=conf->multipaths+path)->rdev == NULL) {
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
-
- err = md_integrity_add_rdev(rdev, mddev);
- if (err)
- break;
- spin_lock_irq(&conf->device_lock);
- mddev->degraded--;
- rdev->raid_disk = path;
- set_bit(In_sync, &rdev->flags);
- spin_unlock_irq(&conf->device_lock);
- rcu_assign_pointer(p->rdev, rdev);
- err = 0;
- break;
- }
-
- print_multipath_conf(conf);
-
- return err;
-}
-
-static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
-{
- struct mpconf *conf = mddev->private;
- int err = 0;
- int number = rdev->raid_disk;
- struct multipath_info *p = conf->multipaths + number;
-
- print_multipath_conf(conf);
-
- if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
- pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
- err = -EBUSY;
- goto abort;
- }
- p->rdev = NULL;
- err = md_integrity_register(mddev);
- }
-abort:
-
- print_multipath_conf(conf);
- return err;
-}
-
-/*
- * This is a kernel thread which:
- *
- * 1. Retries failed read operations on working multipaths.
- * 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
- */
-
-static void multipathd(struct md_thread *thread)
-{
- struct mddev *mddev = thread->mddev;
- struct multipath_bh *mp_bh;
- struct bio *bio;
- unsigned long flags;
- struct mpconf *conf = mddev->private;
- struct list_head *head = &conf->retry_list;
-
- md_check_recovery(mddev);
- for (;;) {
- spin_lock_irqsave(&conf->device_lock, flags);
- if (list_empty(head))
- break;
- mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
- list_del(head->prev);
- spin_unlock_irqrestore(&conf->device_lock, flags);
-
- bio = &mp_bh->bio;
- bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
-
- if ((mp_bh->path = multipath_map (conf))<0) {
- pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n",
- bio->bi_bdev,
- (unsigned long long)bio->bi_iter.bi_sector);
- multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
- } else {
- pr_err("multipath: %pg: redirecting sector %llu to another IO path\n",
- bio->bi_bdev,
- (unsigned long long)bio->bi_iter.bi_sector);
- *bio = *(mp_bh->master_bio);
- bio->bi_iter.bi_sector +=
- conf->multipaths[mp_bh->path].rdev->data_offset;
- bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
- bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
- bio->bi_end_io = multipath_end_request;
- bio->bi_private = mp_bh;
- submit_bio_noacct(bio);
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
-}
-
-static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
-{
- WARN_ONCE(sectors || raid_disks,
- "%s does not support generic reshape\n", __func__);
-
- return mddev->dev_sectors;
-}
-
-static int multipath_run (struct mddev *mddev)
-{
- struct mpconf *conf;
- int disk_idx;
- struct multipath_info *disk;
- struct md_rdev *rdev;
- int working_disks;
- int ret;
-
- if (md_check_no_bitmap(mddev))
- return -EINVAL;
-
- if (mddev->level != LEVEL_MULTIPATH) {
- pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
- mdname(mddev), mddev->level);
- goto out;
- }
- /*
- * copy the already verified devices into our private MULTIPATH
- * bookkeeping area. [whatever we allocate in multipath_run(),
- * should be freed in multipath_free()]
- */
-
- conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
- mddev->private = conf;
- if (!conf)
- goto out;
-
- conf->multipaths = kcalloc(mddev->raid_disks,
- sizeof(struct multipath_info),
- GFP_KERNEL);
- if (!conf->multipaths)
- goto out_free_conf;
-
- working_disks = 0;
- rdev_for_each(rdev, mddev) {
- disk_idx = rdev->raid_disk;
- if (disk_idx < 0 ||
- disk_idx >= mddev->raid_disks)
- continue;
-
- disk = conf->multipaths + disk_idx;
- disk->rdev = rdev;
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
-
- if (!test_bit(Faulty, &rdev->flags))
- working_disks++;
- }
-
- conf->raid_disks = mddev->raid_disks;
- conf->mddev = mddev;
- spin_lock_init(&conf->device_lock);
- INIT_LIST_HEAD(&conf->retry_list);
-
- if (!working_disks) {
- pr_warn("multipath: no operational IO paths for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
- mddev->degraded = conf->raid_disks - working_disks;
-
- ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
- sizeof(struct multipath_bh));
- if (ret)
- goto out_free_conf;
-
- rcu_assign_pointer(mddev->thread,
- md_register_thread(multipathd, mddev, "multipath"));
- if (!mddev->thread)
- goto out_free_conf;
-
- pr_info("multipath: array %s active with %d out of %d IO paths\n",
- mdname(mddev), conf->raid_disks - mddev->degraded,
- mddev->raid_disks);
- /*
- * Ok, everything is just fine now
- */
- md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
-
- if (md_integrity_register(mddev))
- goto out_free_conf;
-
- return 0;
-
-out_free_conf:
- mempool_exit(&conf->pool);
- kfree(conf->multipaths);
- kfree(conf);
- mddev->private = NULL;
-out:
- return -EIO;
-}
-
-static void multipath_free(struct mddev *mddev, void *priv)
-{
- struct mpconf *conf = priv;
-
- mempool_exit(&conf->pool);
- kfree(conf->multipaths);
- kfree(conf);
-}
-
-static struct md_personality multipath_personality =
-{
- .name = "multipath",
- .level = LEVEL_MULTIPATH,
- .owner = THIS_MODULE,
- .make_request = multipath_make_request,
- .run = multipath_run,
- .free = multipath_free,
- .status = multipath_status,
- .error_handler = multipath_error,
- .hot_add_disk = multipath_add_disk,
- .hot_remove_disk= multipath_remove_disk,
- .size = multipath_size,
-};
-
-static int __init multipath_init (void)
-{
- return register_md_personality (&multipath_personality);
-}
-
-static void __exit multipath_exit (void)
-{
- unregister_md_personality (&multipath_personality);
-}
-
-module_init(multipath_init);
-module_exit(multipath_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)");
-MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
-MODULE_ALIAS("md-multipath");
-MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67befb598c..f54012d684 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1296,17 +1296,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
rdev->sb_size = MD_SB_BYTES;
rdev->badblocks.shift = -1;
- if (sb->level == LEVEL_MULTIPATH)
- rdev->desc_nr = -1;
- else
- rdev->desc_nr = sb->this_disk.number;
-
- /* not spare disk, or LEVEL_MULTIPATH */
- if (sb->level == LEVEL_MULTIPATH ||
- (rdev->desc_nr >= 0 &&
- rdev->desc_nr < MD_SB_DISKS &&
- sb->disks[rdev->desc_nr].state &
- ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
+ rdev->desc_nr = sb->this_disk.number;
+
+ /* not spare disk */
+ if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
+ sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
spare_disk = false;
if (!refdev) {
@@ -1453,31 +1447,28 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
return 0;
}
- if (mddev->level != LEVEL_MULTIPATH) {
- desc = sb->disks + rdev->desc_nr;
+ desc = sb->disks + rdev->desc_nr;
- if (desc->state & (1<<MD_DISK_FAULTY))
- set_bit(Faulty, &rdev->flags);
- else if (desc->state & (1<<MD_DISK_SYNC) /* &&
- desc->raid_disk < mddev->raid_disks */) {
- set_bit(In_sync, &rdev->flags);
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ set_bit(Faulty, &rdev->flags);
+ else if (desc->state & (1<<MD_DISK_SYNC)) {
+ set_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = desc->raid_disk;
+ rdev->saved_raid_disk = desc->raid_disk;
+ } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
+ /* active but not in sync implies recovery up to
+ * reshape position. We don't know exactly where
+ * that is, so set to zero for now
+ */
+ if (mddev->minor_version >= 91) {
+ rdev->recovery_offset = 0;
rdev->raid_disk = desc->raid_disk;
- rdev->saved_raid_disk = desc->raid_disk;
- } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
- /* active but not in sync implies recovery up to
- * reshape position. We don't know exactly where
- * that is, so set to zero for now */
- if (mddev->minor_version >= 91) {
- rdev->recovery_offset = 0;
- rdev->raid_disk = desc->raid_disk;
- }
}
- if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
- set_bit(WriteMostly, &rdev->flags);
- if (desc->state & (1<<MD_DISK_FAILFAST))
- set_bit(FailFast, &rdev->flags);
- } else /* MULTIPATH are always insync */
- set_bit(In_sync, &rdev->flags);
+ }
+ if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+ if (desc->state & (1<<MD_DISK_FAILFAST))
+ set_bit(FailFast, &rdev->flags);
return 0;
}
@@ -1767,10 +1758,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
&& rdev->new_data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
- if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
- rdev->desc_nr = -1;
- else
- rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
if (!rdev->bb_page) {
rdev->bb_page = alloc_page(GFP_KERNEL);
@@ -1823,12 +1811,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
sb->level != 0)
return -EINVAL;
- /* not spare disk, or LEVEL_MULTIPATH */
- if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
- (rdev->desc_nr >= 0 &&
- rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
- (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
- le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
+ /* not spare disk */
+ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
spare_disk = false;
if (!refdev) {
@@ -1871,6 +1857,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
+ int role;
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
@@ -1986,88 +1973,85 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
/* just a hot-add of a new device, leave raid_disk at -1 */
return 0;
}
- if (mddev->level != LEVEL_MULTIPATH) {
- int role;
- if (rdev->desc_nr < 0 ||
- rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
- role = MD_DISK_ROLE_SPARE;
- rdev->desc_nr = -1;
- } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
- /*
- * If we are assembling, and our event counter is smaller than the
- * highest event counter, we cannot trust our superblock about the role.
- * It could happen that our rdev was marked as Faulty, and all other
- * superblocks were updated with +1 event counter.
- * Then, before the next superblock update, which typically happens when
- * remove_and_add_spares() removes the device from the array, there was
- * a crash or reboot.
- * If we allow current rdev without consulting the freshest superblock,
- * we could cause data corruption.
- * Note that in this case our event counter is smaller by 1 than the
- * highest, otherwise, this rdev would not be allowed into array;
- * both kernel and mdadm allow event counter difference of 1.
- */
- struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
- u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
-
- if (rdev->desc_nr >= freshest_max_dev) {
- /* this is unexpected, better not proceed */
- pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
- mdname(mddev), rdev->bdev, rdev->desc_nr,
- freshest->bdev, freshest_max_dev);
- return -EUCLEAN;
- }
- role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
- pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
- mdname(mddev), rdev->bdev, role, role, freshest->bdev);
- } else {
- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ if (rdev->desc_nr < 0 ||
+ rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+ role = MD_DISK_ROLE_SPARE;
+ rdev->desc_nr = -1;
+ } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+ /*
+ * If we are assembling, and our event counter is smaller than the
+ * highest event counter, we cannot trust our superblock about the role.
+ * It could happen that our rdev was marked as Faulty, and all other
+ * superblocks were updated with +1 event counter.
+ * Then, before the next superblock update, which typically happens when
+ * remove_and_add_spares() removes the device from the array, there was
+ * a crash or reboot.
+ * If we allow current rdev without consulting the freshest superblock,
+ * we could cause data corruption.
+ * Note that in this case our event counter is smaller by 1 than the
+ * highest, otherwise, this rdev would not be allowed into array;
+ * both kernel and mdadm allow event counter difference of 1.
+ */
+ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+ u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+ if (rdev->desc_nr >= freshest_max_dev) {
+ /* this is unexpected, better not proceed */
+ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+ mdname(mddev), rdev->bdev, rdev->desc_nr,
+ freshest->bdev, freshest_max_dev);
+ return -EUCLEAN;
}
- switch(role) {
- case MD_DISK_ROLE_SPARE: /* spare */
- break;
- case MD_DISK_ROLE_FAULTY: /* faulty */
- set_bit(Faulty, &rdev->flags);
- break;
- case MD_DISK_ROLE_JOURNAL: /* journal device */
- if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
- /* journal device without journal feature */
- pr_warn("md: journal device provided without journal feature, ignoring the device\n");
- return -EINVAL;
- }
- set_bit(Journal, &rdev->flags);
- rdev->journal_tail = le64_to_cpu(sb->journal_tail);
- rdev->raid_disk = 0;
- break;
- default:
- rdev->saved_raid_disk = role;
- if ((le32_to_cpu(sb->feature_map) &
- MD_FEATURE_RECOVERY_OFFSET)) {
- rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
- if (!(le32_to_cpu(sb->feature_map) &
- MD_FEATURE_RECOVERY_BITMAP))
- rdev->saved_raid_disk = -1;
- } else {
- /*
- * If the array is FROZEN, then the device can't
- * be in_sync with rest of array.
- */
- if (!test_bit(MD_RECOVERY_FROZEN,
- &mddev->recovery))
- set_bit(In_sync, &rdev->flags);
- }
- rdev->raid_disk = role;
- break;
+
+ role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+ pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+ mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+ } else {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ }
+ switch (role) {
+ case MD_DISK_ROLE_SPARE: /* spare */
+ break;
+ case MD_DISK_ROLE_FAULTY: /* faulty */
+ set_bit(Faulty, &rdev->flags);
+ break;
+ case MD_DISK_ROLE_JOURNAL: /* journal device */
+ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
+ /* journal device without journal feature */
+ pr_warn("md: journal device provided without journal feature, ignoring the device\n");
+ return -EINVAL;
}
- if (sb->devflags & WriteMostly1)
- set_bit(WriteMostly, &rdev->flags);
- if (sb->devflags & FailFast1)
- set_bit(FailFast, &rdev->flags);
- if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
- set_bit(Replacement, &rdev->flags);
- } else /* MULTIPATH are always insync */
- set_bit(In_sync, &rdev->flags);
+ set_bit(Journal, &rdev->flags);
+ rdev->journal_tail = le64_to_cpu(sb->journal_tail);
+ rdev->raid_disk = 0;
+ break;
+ default:
+ rdev->saved_raid_disk = role;
+ if ((le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_OFFSET)) {
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+ if (!(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_BITMAP))
+ rdev->saved_raid_disk = -1;
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
+ rdev->raid_disk = role;
+ break;
+ }
+ if (sb->devflags & WriteMostly1)
+ set_bit(WriteMostly, &rdev->flags);
+ if (sb->devflags & FailFast1)
+ set_bit(FailFast, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+ set_bit(Replacement, &rdev->flags);
return 0;
}
@@ -2886,10 +2870,6 @@ rewrite:
} else
pr_debug("md: %pg (skipping faulty)\n",
rdev->bdev);
-
- if (mddev->level == LEVEL_MULTIPATH)
- /* only need to write one superblock... */
- break;
}
if (md_super_wait(mddev) < 0)
goto rewrite;
@@ -3890,13 +3870,8 @@ static int analyze_sbs(struct mddev *mddev)
continue;
}
}
- if (mddev->level == LEVEL_MULTIPATH) {
- rdev->desc_nr = i++;
- rdev->raid_disk = rdev->desc_nr;
- set_bit(In_sync, &rdev->flags);
- } else if (rdev->raid_disk >=
- (mddev->raid_disks - min(0, mddev->delta_disks)) &&
- !test_bit(Journal, &rdev->flags)) {
+ if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
+ !test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
@@ -8156,7 +8131,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
+ if (mddev->pers->level == 0)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 3f22edec70..512746551f 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -173,3 +173,57 @@ static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
else
md_bitmap_unplug(bitmap);
}
+
+/*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ */
+static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
+{
+ long cur_time_mon;
+ unsigned long hours_since_last;
+ unsigned int read_errors = atomic_read(&rdev->read_errors);
+
+ cur_time_mon = ktime_get_seconds();
+
+ if (rdev->last_read_error == 0) {
+ /* first time we've seen a read error */
+ rdev->last_read_error = cur_time_mon;
+ return;
+ }
+
+ hours_since_last = (long)(cur_time_mon -
+ rdev->last_read_error) / 3600;
+
+ rdev->last_read_error = cur_time_mon;
+
+ /*
+ * if hours_since_last is > the number of bits in read_errors
+ * just set read errors to 0. We do this to avoid
+ * overflowing the shift of read_errors by hours_since_last.
+ */
+ if (hours_since_last >= 8 * sizeof(read_errors))
+ atomic_set(&rdev->read_errors, 0);
+ else
+ atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+
+static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+ int read_errors;
+
+ check_decay_read_errors(mddev, rdev);
+ read_errors = atomic_inc_return(&rdev->read_errors);
+ if (read_errors > max_read_errors) {
+ pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
+ mdname(mddev), rdev->bdev, read_errors, max_read_errors);
+ pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
+ mdname(mddev), rdev->bdev);
+ md_error(mddev, rdev);
+ return true;
+ }
+
+ return false;
+}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 750a802478..4f3c35f132 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -49,6 +49,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
+#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
#define START(node) ((node)->start)
@@ -1131,8 +1132,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
&r1_bio->mddev->bio_set);
- if (!behind_bio)
- return;
/* discard op, we don't support writezero/writesame yet */
if (!bio_has_data(bio)) {
@@ -1475,7 +1474,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
- free_r1bio(r1_bio);
+ mempool_free(r1_bio, &conf->r1bio_pool);
allow_barrier(conf, bio->bi_iter.bi_sector);
if (bio->bi_opf & REQ_NOWAIT) {
@@ -2299,16 +2298,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
* 3. Performs writes following reads for array synchronising.
*/
-static void fix_read_error(struct r1conf *conf, int read_disk,
- sector_t sect, int sectors)
+static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int read_disk = r1_bio->read_disk;
struct mddev *mddev = conf->mddev;
+ struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
+
+ if (exceed_read_errors(mddev, rdev)) {
+ r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
+ return;
+ }
+
while(sectors) {
int s = sectors;
int d = read_disk;
int success = 0;
int start;
- struct md_rdev *rdev;
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
@@ -2549,8 +2556,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
if (mddev->ro == 0
&& !test_bit(FailFast, &rdev->flags)) {
freeze_array(conf, 1);
- fix_read_error(conf, r1_bio->read_disk,
- r1_bio->sector, r1_bio->sectors);
+ fix_read_error(conf, r1_bio);
unfreeze_array(conf);
} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e828a6aa0..a5f8419e2d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -19,6 +19,8 @@
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
+
+#define RAID_1_10_NAME "raid10"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"
@@ -743,7 +745,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
struct geom *geo = &conf->geo;
raid10_find_phys(conf, r10_bio);
- rcu_read_lock();
best_dist_slot = -1;
min_pending = UINT_MAX;
best_dist_rdev = NULL;
@@ -775,18 +776,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ rdev = conf->mirrors[disk].replacement;
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors >
- rdev->recovery_offset) {
- /*
- * Read replacement first to prevent reading both rdev
- * and replacement as NULL during replacement replace
- * rdev.
- */
- smp_mb();
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
- }
+ rdev->recovery_offset)
+ rdev = conf->mirrors[disk].rdev;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags))
continue;
@@ -876,7 +870,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
r10_bio->read_slot = slot;
} else
rdev = NULL;
- rcu_read_unlock();
*max_sectors = best_good_sectors;
return rdev;
@@ -1198,9 +1191,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp = GFP_NOIO | __GFP_HIGH;
- rcu_read_lock();
disk = r10_bio->devs[slot].devnum;
- err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ err_rdev = conf->mirrors[disk].rdev;
if (err_rdev)
snprintf(b, sizeof(b), "%pg", err_rdev->bdev);
else {
@@ -1208,7 +1200,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
/* This never gets dereferenced */
err_rdev = r10_bio->devs[slot].rdev;
}
- rcu_read_unlock();
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
@@ -1279,15 +1270,8 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
int devnum = r10_bio->devs[n_copy].devnum;
struct bio *mbio;
- if (replacement) {
- rdev = conf->mirrors[devnum].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[devnum].rdev;
- }
- } else
- rdev = conf->mirrors[devnum].rdev;
+ rdev = replacement ? conf->mirrors[devnum].replacement :
+ conf->mirrors[devnum].rdev;
mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
@@ -1321,25 +1305,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
}
}
-static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
- struct md_rdev **prrdev)
-{
- struct md_rdev *rdev, *rrdev;
-
- rrdev = rcu_dereference(mirror->replacement);
- /*
- * Read replacement first to prevent reading both rdev and
- * replacement as NULL during replacement replace rdev.
- */
- smp_mb();
- rdev = rcu_dereference(mirror->rdev);
- if (rdev == rrdev)
- rrdev = NULL;
-
- *prrdev = rrdev;
- return rdev;
-}
-
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
int i;
@@ -1348,11 +1313,11 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
retry_wait:
blocked_rdev = NULL;
- rcu_read_lock();
for (i = 0; i < conf->copies; i++) {
struct md_rdev *rdev, *rrdev;
- rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
+ rdev = conf->mirrors[i].rdev;
+ rrdev = conf->mirrors[i].replacement;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
@@ -1391,7 +1356,6 @@ retry_wait:
}
}
}
- rcu_read_unlock();
if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
@@ -1474,14 +1438,14 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
wait_blocked_dev(mddev, r10_bio);
- rcu_read_lock();
max_sectors = r10_bio->sectors;
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev;
- rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
+ rdev = conf->mirrors[d].rdev;
+ rrdev = conf->mirrors[d].replacement;
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1535,7 +1499,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
atomic_inc(&rrdev->nr_pending);
}
}
- rcu_read_unlock();
if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
@@ -1625,17 +1588,8 @@ static void raid10_end_discard_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
- if (repl)
- rdev = conf->mirrors[dev].replacement;
- if (!rdev) {
- /*
- * raid10_remove_disk uses smp_mb to make sure rdev is set to
- * replacement before setting replacement to NULL. It can read
- * rdev first without barrier protect even replacement is NULL
- */
- smp_rmb();
- rdev = conf->mirrors[dev].rdev;
- }
+ rdev = repl ? conf->mirrors[dev].replacement :
+ conf->mirrors[dev].rdev;
raid_end_discard_bio(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
@@ -1785,11 +1739,11 @@ retry_discard:
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
- rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev, *rrdev;
- rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
+ rdev = conf->mirrors[disk].rdev;
+ rrdev = conf->mirrors[disk].replacement;
r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL;
@@ -1809,7 +1763,6 @@ retry_discard:
atomic_inc(&rrdev->nr_pending);
}
}
- rcu_read_unlock();
atomic_set(&r10_bio->remaining, 1);
for (disk = 0; disk < geo->raid_disks; disk++) {
@@ -1939,6 +1892,8 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
struct r10conf *conf = mddev->private;
int i;
+ lockdep_assert_held(&mddev->lock);
+
if (conf->geo.near_copies < conf->geo.raid_disks)
seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
if (conf->geo.near_copies > 1)
@@ -1953,12 +1908,11 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
}
seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
conf->geo.raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->geo.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
+
seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf(seq, "]");
}
@@ -1980,7 +1934,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
ncopies = conf->geo.near_copies;
}
- rcu_read_lock();
do {
int n = conf->copies;
int cnt = 0;
@@ -1988,7 +1941,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
while (n--) {
struct md_rdev *rdev;
if (this != ignore &&
- (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+ (rdev = conf->mirrors[this].rdev) &&
test_bit(In_sync, &rdev->flags))
cnt++;
this = (this+1) % disks;
@@ -1999,7 +1952,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
} while (first != 0);
has_enough = 1;
out:
- rcu_read_unlock();
return has_enough;
}
@@ -2072,8 +2024,7 @@ static void print_conf(struct r10conf *conf)
pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
conf->geo.raid_disks);
- /* This is only called with ->reconfix_mutex held, so
- * rcu protection of rdev is not needed */
+ lockdep_assert_held(&conf->mddev->reconfig_mutex);
for (i = 0; i < conf->geo.raid_disks; i++) {
rdev = conf->mirrors[i].rdev;
if (rdev)
@@ -2190,7 +2141,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
err = 0;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
break;
}
@@ -2204,7 +2155,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
- rcu_assign_pointer(p->replacement, rdev);
+ WRITE_ONCE(p->replacement, rdev);
}
print_conf(conf);
@@ -2246,15 +2197,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
- *rdevp = NULL;
+ WRITE_ONCE(*rdevp, NULL);
if (p->replacement) {
/* We must have just cleared 'rdev' */
- p->rdev = p->replacement;
+ WRITE_ONCE(p->rdev, p->replacement);
clear_bit(Replacement, &p->replacement->flags);
- smp_mb(); /* Make sure other CPUs may see both as identical
- * but will never see neither -- if they are careful.
- */
- p->replacement = NULL;
+ WRITE_ONCE(p->replacement, NULL);
}
clear_bit(WantReplacement, &rdev->flags);
@@ -2646,42 +2594,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
}
}
-/*
- * Used by fix_read_error() to decay the per rdev read_errors.
- * We halve the read error count for every hour that has elapsed
- * since the last recorded read error.
- *
- */
-static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
-{
- long cur_time_mon;
- unsigned long hours_since_last;
- unsigned int read_errors = atomic_read(&rdev->read_errors);
-
- cur_time_mon = ktime_get_seconds();
-
- if (rdev->last_read_error == 0) {
- /* first time we've seen a read error */
- rdev->last_read_error = cur_time_mon;
- return;
- }
-
- hours_since_last = (long)(cur_time_mon -
- rdev->last_read_error) / 3600;
-
- rdev->last_read_error = cur_time_mon;
-
- /*
- * if hours_since_last is > the number of bits in read_errors
- * just set read errors to 0. We do this to avoid
- * overflowing the shift of read_errors by hours_since_last.
- */
- if (hours_since_last >= 8 * sizeof(read_errors))
- atomic_set(&rdev->read_errors, 0);
- else
- atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
-}
-
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
@@ -2719,7 +2631,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
struct md_rdev *rdev;
- int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot
@@ -2732,15 +2643,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
more fix_read_error() attempts */
return;
- check_decay_read_errors(mddev, rdev);
- atomic_inc(&rdev->read_errors);
- if (atomic_read(&rdev->read_errors) > max_read_errors) {
- pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
- mdname(mddev), rdev->bdev,
- atomic_read(&rdev->read_errors), max_read_errors);
- pr_notice("md/raid10:%s: %pg: Failing raid device\n",
- mdname(mddev), rdev->bdev);
- md_error(mddev, rdev);
+ if (exceed_read_errors(mddev, rdev)) {
r10_bio->devs[slot].bio = IO_BLOCKED;
return;
}
@@ -2754,20 +2657,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
- rcu_read_lock();
do {
sector_t first_bad;
int bad_sectors;
d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2775,7 +2676,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
conf->tmppage,
REQ_OP_READ, false);
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
if (success)
break;
}
@@ -2783,7 +2683,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (sl == conf->copies)
sl = 0;
} while (sl != slot);
- rcu_read_unlock();
if (!success) {
/* Cannot read from anywhere, just mark the block
@@ -2807,20 +2706,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
start = sl;
/* write it back and re-read */
- rcu_read_lock();
while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2839,7 +2736,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
}
sl = start;
while (sl != slot) {
@@ -2847,14 +2743,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
switch (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2882,9 +2777,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
}
- rcu_read_unlock();
sectors -= s;
sect += s;
@@ -3358,14 +3251,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Completed a full sync so the replacements
* are now fully recovered.
*/
- rcu_read_lock();
for (i = 0; i < conf->geo.raid_disks; i++) {
struct md_rdev *rdev =
- rcu_dereference(conf->mirrors[i].replacement);
+ conf->mirrors[i].replacement;
+
if (rdev)
rdev->recovery_offset = MaxSector;
}
- rcu_read_unlock();
}
conf->fullsync = 0;
}
@@ -3446,9 +3338,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
- rcu_read_lock();
- mrdev = rcu_dereference(mirror->rdev);
- mreplace = rcu_dereference(mirror->replacement);
+ mrdev = mirror->rdev;
+ mreplace = mirror->replacement;
if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
test_bit(In_sync, &mrdev->flags)))
@@ -3456,22 +3347,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;
- if (!mrdev && !mreplace) {
- rcu_read_unlock();
+ if (!mrdev && !mreplace)
continue;
- }
still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
- if (sect >= mddev->resync_max_sectors) {
+ if (sect >= mddev->resync_max_sectors)
/* last stripe is not complete - don't
* try to recover this sector.
*/
- rcu_read_unlock();
continue;
- }
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
@@ -3487,14 +3374,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* that there will never be anything to do here
*/
chunks_skipped = -1;
- rcu_read_unlock();
continue;
}
if (mrdev)
atomic_inc(&mrdev->nr_pending);
if (mreplace)
atomic_inc(&mreplace->nr_pending);
- rcu_read_unlock();
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
@@ -3513,10 +3398,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to check if the array will still be
* degraded
*/
- rcu_read_lock();
for (j = 0; j < conf->geo.raid_disks; j++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[j].rdev);
+ struct md_rdev *rdev = conf->mirrors[j].rdev;
+
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
break;
@@ -3531,8 +3415,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int k;
int d = r10_bio->devs[j].devnum;
sector_t from_addr, to_addr;
- struct md_rdev *rdev =
- rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
int bad_sectors;
if (!rdev ||
@@ -3611,7 +3494,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
break;
}
- rcu_read_unlock();
if (j == conf->copies) {
/* Cannot recover, so abort the recovery or
* record a bad block */
@@ -3738,12 +3620,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r10_bio->devs[i].bio;
bio->bi_status = BLK_STS_IOERR;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ rdev = conf->mirrors[d].rdev;
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
continue;
- }
+
sector = r10_bio->devs[i].addr;
if (is_badblock(rdev, sector, max_sync,
&first_bad, &bad_sectors)) {
@@ -3753,7 +3633,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bad_sectors -= (sector - first_bad);
if (max_sync > bad_sectors)
max_sync = bad_sectors;
- rcu_read_unlock();
continue;
}
}
@@ -3769,11 +3648,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio_set_dev(bio, rdev->bdev);
count++;
- rdev = rcu_dereference(conf->mirrors[d].replacement);
- if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ rdev = conf->mirrors[d].replacement;
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
continue;
- }
+
atomic_inc(&rdev->nr_pending);
/* Need to set up for writing to the replacement */
@@ -3790,7 +3668,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
count++;
- rcu_read_unlock();
}
if (count < 2) {
@@ -4496,11 +4373,11 @@ static int calc_degraded(struct r10conf *conf)
int degraded, degraded2;
int i;
- rcu_read_lock();
degraded = 0;
/* 'prev' section first */
for (i = 0; i < conf->prev.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded++;
else if (!test_bit(In_sync, &rdev->flags))
@@ -4510,13 +4387,12 @@ static int calc_degraded(struct r10conf *conf)
*/
degraded++;
}
- rcu_read_unlock();
if (conf->geo.raid_disks == conf->prev.raid_disks)
return degraded;
- rcu_read_lock();
degraded2 = 0;
for (i = 0; i < conf->geo.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded2++;
else if (!test_bit(In_sync, &rdev->flags)) {
@@ -4529,7 +4405,6 @@ static int calc_degraded(struct r10conf *conf)
degraded2++;
}
}
- rcu_read_unlock();
if (degraded2 > degraded)
return degraded2;
return degraded;
@@ -4953,16 +4828,15 @@ read_more:
blist = read_bio;
read_bio->bi_next = NULL;
- rcu_read_lock();
for (s = 0; s < conf->copies*2; s++) {
struct bio *b;
int d = r10_bio->devs[s/2].devnum;
struct md_rdev *rdev2;
if (s&1) {
- rdev2 = rcu_dereference(conf->mirrors[d].replacement);
+ rdev2 = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
- rdev2 = rcu_dereference(conf->mirrors[d].rdev);
+ rdev2 = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
}
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
@@ -4996,7 +4870,6 @@ read_more:
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
- rcu_read_unlock();
r10_bio->sectors = nr_sectors;
/* Now submit the read */
@@ -5049,20 +4922,17 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *b;
int d = r10_bio->devs[s/2].devnum;
struct md_rdev *rdev;
- rcu_read_lock();
if (s&1) {
- rdev = rcu_dereference(conf->mirrors[d].replacement);
+ rdev = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
}
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
- }
+
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
@@ -5133,10 +5003,9 @@ static int handle_reshape_read_error(struct mddev *mddev,
if (s > (PAGE_SIZE >> 9))
s = PAGE_SIZE >> 9;
- rcu_read_lock();
while (!success) {
int d = r10b->devs[slot].devnum;
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t addr;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags) ||
@@ -5145,14 +5014,12 @@ static int handle_reshape_read_error(struct mddev *mddev,
addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
success = sync_page_io(rdev,
addr,
s << 9,
pages[idx],
REQ_OP_READ, false);
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
if (success)
break;
failed:
@@ -5162,7 +5029,6 @@ static int handle_reshape_read_error(struct mddev *mddev,
if (slot == first_slot)
break;
}
- rcu_read_unlock();
if (!success) {
/* couldn't read this block, must give up */
set_bit(MD_RECOVERY_INTR,
@@ -5188,12 +5054,8 @@ static void end_reshape_write(struct bio *bio)
struct md_rdev *rdev = NULL;
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
- if (repl)
- rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
+ rdev = repl ? conf->mirrors[d].replacement :
+ conf->mirrors[d].rdev;
if (bio->bi_status) {
/* FIXME should record badblock */
@@ -5228,18 +5090,16 @@ static void raid10_finish_reshape(struct mddev *mddev)
mddev->resync_max_sectors = mddev->array_sectors;
} else {
int d;
- rcu_read_lock();
for (d = conf->geo.raid_disks ;
d < conf->geo.raid_disks - mddev->delta_disks;
d++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = rcu_dereference(conf->mirrors[d].replacement);
+ rdev = conf->mirrors[d].replacement;
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
- rcu_read_unlock();
}
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6157f5beb9..874874fe4f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
continue;
/* in case device is broken */
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[disk_index].rdev);
+ rdev = conf->disks[disk_index].rdev;
if (rdev) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
sync_page_io(rdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE,
false);
rdev_dec_pending(rdev, rdev->mddev);
- rcu_read_lock();
}
- rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+ rrdev = conf->disks[disk_index].replacement;
if (rrdev) {
atomic_inc(&rrdev->nr_pending);
- rcu_read_unlock();
sync_page_io(rrdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE,
false);
rdev_dec_pending(rrdev, rrdev->mddev);
- rcu_read_lock();
}
- rcu_read_unlock();
}
ctx->data_parity_stripes++;
out:
@@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
if (!log)
return false;
- WARN_ON_ONCE(!rcu_read_lock_held());
tree_index = r5c_tree_index(conf, sect);
slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
return slot != NULL;
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index eaea57aee6..da4ba736c4 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io)
struct md_rdev *rdev;
struct block_device *bdev = NULL;
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
if (rdev && !test_bit(Faulty, &rdev->flags))
bdev = rdev->bdev;
- rcu_read_unlock();
if (bdev) {
struct bio *bio;
@@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)r_sector, dd_idx,
(unsigned long long)sector);
- /* Array has not started so rcu dereference is safe */
- rdev = rcu_dereference_protected(
- conf->disks[dd_idx].rdev, 1);
+ rdev = conf->disks[dd_idx].rdev;
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
@@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
0, &disk, &sh);
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
- /* Array has not started so rcu dereference is safe */
- parity_rdev = rcu_dereference_protected(
- conf->disks[sh.pd_idx].rdev, 1);
+ parity_rdev = conf->disks[sh.pd_idx].rdev;
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
pr_debug("%s:%*s write parity at sector %llu, disk %pg\n",
@@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf)
for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i];
- /* Array has not started so rcu dereference is safe */
- struct md_rdev *rdev =
- rcu_dereference_protected(conf->disks[i].rdev, 1);
+ struct md_rdev *rdev = conf->disks[i].rdev;
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e1d8b5199f..69452e4394 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -694,12 +694,12 @@ int raid5_calc_degraded(struct r5conf *conf)
int degraded, degraded2;
int i;
- rcu_read_lock();
degraded = 0;
for (i = 0; i < conf->previous_raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+
if (rdev && test_bit(Faulty, &rdev->flags))
- rdev = rcu_dereference(conf->disks[i].replacement);
+ rdev = READ_ONCE(conf->disks[i].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded++;
else if (test_bit(In_sync, &rdev->flags))
@@ -717,15 +717,14 @@ int raid5_calc_degraded(struct r5conf *conf)
if (conf->raid_disks >= conf->previous_raid_disks)
degraded++;
}
- rcu_read_unlock();
if (conf->raid_disks == conf->previous_raid_disks)
return degraded;
- rcu_read_lock();
degraded2 = 0;
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+
if (rdev && test_bit(Faulty, &rdev->flags))
- rdev = rcu_dereference(conf->disks[i].replacement);
+ rdev = READ_ONCE(conf->disks[i].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded2++;
else if (test_bit(In_sync, &rdev->flags))
@@ -739,7 +738,6 @@ int raid5_calc_degraded(struct r5conf *conf)
if (conf->raid_disks <= conf->previous_raid_disks)
degraded2++;
}
- rcu_read_unlock();
if (degraded2 > degraded)
return degraded2;
return degraded;
@@ -1185,14 +1183,8 @@ again:
bi = &dev->req;
rbi = &dev->rreq; /* For writing to replacement */
- rcu_read_lock();
- rrdev = rcu_dereference(conf->disks[i].replacement);
- smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
- rdev = rcu_dereference(conf->disks[i].rdev);
- if (!rdev) {
- rdev = rrdev;
- rrdev = NULL;
- }
+ rdev = conf->disks[i].rdev;
+ rrdev = conf->disks[i].replacement;
if (op_is_write(op)) {
if (replace_only)
rdev = NULL;
@@ -1213,7 +1205,6 @@ again:
rrdev = NULL;
if (rrdev)
atomic_inc(&rrdev->nr_pending);
- rcu_read_unlock();
/* We have already checked bad blocks for reads. Now
* need to check for writes. We never accept write errors
@@ -2732,28 +2723,6 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
-/*
- * This helper wraps rcu_dereference_protected() and can be used when
- * it is known that the nr_pending of the rdev is elevated.
- */
-static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
-{
- return rcu_dereference_protected(rdev,
- atomic_read(&rcu_access_pointer(rdev)->nr_pending));
-}
-
-/*
- * This helper wraps rcu_dereference_protected() and should be used
- * when it is known that the mddev_lock() is held. This is safe
- * seeing raid5_remove_disk() has the same lock held.
- */
-static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
- struct md_rdev __rcu *rdev)
-{
- return rcu_dereference_protected(rdev,
- lockdep_is_held(&mddev->reconfig_mutex));
-}
-
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
@@ -2779,9 +2748,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
- rdev = rdev_pend_deref(conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
if (!rdev)
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
@@ -2894,11 +2863,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) {
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
break;
}
if (bi == &sh->dev[i].rreq) {
- rdev = rdev_pend_deref(conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
if (rdev)
replacement = 1;
else
@@ -2906,7 +2875,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed
* until all requests are finished.
*/
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
break;
}
}
@@ -3668,15 +3637,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- struct md_rdev *rdev;
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = conf->disks[i].rdev;
+
if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
atomic_inc(&rdev->nr_pending);
else
rdev = NULL;
- rcu_read_unlock();
if (rdev) {
if (!rdev_set_badblocks(
rdev,
@@ -3794,16 +3761,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
/* During recovery devices cannot be removed, so
* locking and refcounting of rdevs is not needed
*/
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = conf->disks[i].rdev;
+
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
- rdev = rcu_dereference(conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
+
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
@@ -3811,7 +3779,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
- rcu_read_unlock();
if (abort)
conf->recovery_disabled =
conf->mddev->recovery_disabled;
@@ -3824,15 +3791,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
struct md_rdev *rdev;
int rv = 0;
- rcu_read_lock();
- rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
+ rdev = sh->raid_conf->disks[disk_idx].replacement;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector))
rv = 1;
- rcu_read_unlock();
return rv;
}
@@ -4709,7 +4674,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
- rcu_read_lock();
for (i=disks; i--; ) {
struct md_rdev *rdev;
sector_t first_bad;
@@ -4754,7 +4718,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Prefer to use the replacement for reads, but only
* if it is recovered enough and has no bad blocks.
*/
- rdev = rcu_dereference(conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
@@ -4765,7 +4729,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(R5_NeedReplace, &dev->flags);
else
clear_bit(R5_NeedReplace, &dev->flags);
- rdev = rcu_dereference(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
clear_bit(R5_ReadRepl, &dev->flags);
}
if (rdev && test_bit(Faulty, &rdev->flags))
@@ -4812,8 +4776,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_WriteError, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
- struct md_rdev *rdev2 = rcu_dereference(
- conf->disks[i].rdev);
+ struct md_rdev *rdev2 = conf->disks[i].rdev;
+
if (rdev2 == rdev)
clear_bit(R5_Insync, &dev->flags);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
@@ -4825,8 +4789,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_MadeGood, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
- struct md_rdev *rdev2 = rcu_dereference(
- conf->disks[i].rdev);
+ struct md_rdev *rdev2 = conf->disks[i].rdev;
+
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1;
atomic_inc(&rdev2->nr_pending);
@@ -4834,8 +4798,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
clear_bit(R5_MadeGood, &dev->flags);
}
if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
- struct md_rdev *rdev2 = rcu_dereference(
- conf->disks[i].replacement);
+ struct md_rdev *rdev2 = conf->disks[i].replacement;
+
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1;
atomic_inc(&rdev2->nr_pending);
@@ -4856,8 +4820,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
else if (!rdev) {
- rdev = rcu_dereference(
- conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
}
@@ -4884,7 +4847,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
else
s->replacing = 1;
}
- rcu_read_unlock();
}
/*
@@ -5341,23 +5303,23 @@ finish:
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
- rdev = rdev_pend_deref(conf->disks[i].replacement);
+ rdev = conf->disks[i].replacement;
if (!rdev)
/* rdev have been moved down */
- rdev = rdev_pend_deref(conf->disks[i].rdev);
+ rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
@@ -5516,24 +5478,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
&dd_idx, NULL);
end_sector = sector + bio_sectors(raid_bio);
- rcu_read_lock();
if (r5c_big_stripe_cached(conf, sector))
- goto out_rcu_unlock;
+ return 0;
- rdev = rcu_dereference(conf->disks[dd_idx].replacement);
+ rdev = conf->disks[dd_idx].replacement;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
rdev->recovery_offset < end_sector) {
- rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+ rdev = conf->disks[dd_idx].rdev;
if (!rdev)
- goto out_rcu_unlock;
+ return 0;
if (test_bit(Faulty, &rdev->flags) ||
!(test_bit(In_sync, &rdev->flags) ||
rdev->recovery_offset >= end_sector))
- goto out_rcu_unlock;
+ return 0;
}
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
&bad_sectors)) {
@@ -5577,10 +5537,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio);
return 1;
-
-out_rcu_unlock:
- rcu_read_unlock();
- return 0;
}
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
@@ -6595,14 +6551,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
* Note in case of > 1 drive failures it's possible we're rebuilding
* one drive while leaving another faulty drive in array.
*/
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
}
- rcu_read_unlock();
md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
@@ -7926,18 +7880,10 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
- rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
- if (!rdev && conf->disks[i].replacement) {
- /* The replacement is all we have yet */
- rdev = rdev_mdlock_deref(mddev,
- conf->disks[i].replacement);
- conf->disks[i].replacement = NULL;
- clear_bit(Replacement, &rdev->flags);
- rcu_assign_pointer(conf->disks[i].rdev, rdev);
- }
+ rdev = conf->disks[i].rdev;
if (!rdev)
continue;
- if (rcu_access_pointer(conf->disks[i].replacement) &&
+ if (conf->disks[i].replacement &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
@@ -8117,15 +8063,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
struct r5conf *conf = mddev->private;
int i;
+ lockdep_assert_held(&mddev->lock);
+
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf (seq, "]");
}
@@ -8163,9 +8110,8 @@ static int raid5_spare_active(struct mddev *mddev)
unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
- rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
- replacement = rdev_mdlock_deref(mddev,
- conf->disks[i].replacement);
+ rdev = conf->disks[i].rdev;
+ replacement = conf->disks[i].replacement;
if (replacement
&& replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &replacement->flags)
@@ -8204,7 +8150,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct md_rdev __rcu **rdevp;
+ struct md_rdev **rdevp;
struct disk_info *p;
struct md_rdev *tmp;
@@ -8227,9 +8173,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->pool_size))
return 0;
p = conf->disks + number;
- if (rdev == rcu_access_pointer(p->rdev))
+ if (rdev == p->rdev)
rdevp = &p->rdev;
- else if (rdev == rcu_access_pointer(p->replacement))
+ else if (rdev == p->replacement)
rdevp = &p->replacement;
else
return 0;
@@ -8249,28 +8195,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
- (!rcu_access_pointer(p->replacement) ||
- rcu_access_pointer(p->replacement) == rdev) &&
+ (!p->replacement || p->replacement == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
- *rdevp = NULL;
+ WRITE_ONCE(*rdevp, NULL);
if (!err) {
err = log_modify(conf, rdev, false);
if (err)
goto abort;
}
- tmp = rcu_access_pointer(p->replacement);
+ tmp = p->replacement;
if (tmp) {
/* We must have just cleared 'rdev' */
- rcu_assign_pointer(p->rdev, tmp);
+ WRITE_ONCE(p->rdev, tmp);
clear_bit(Replacement, &tmp->flags);
- smp_mb(); /* Make sure other CPUs may see both as identical
- * but will never see neither - if they are careful
- */
- rcu_assign_pointer(p->replacement, NULL);
+ WRITE_ONCE(p->replacement, NULL);
if (!err)
err = log_modify(conf, tmp, true);
@@ -8338,7 +8280,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = disk;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
err = log_modify(conf, rdev, true);
@@ -8347,7 +8289,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
for (disk = first; disk <= last; disk++) {
p = conf->disks + disk;
- tmp = rdev_mdlock_deref(mddev, p->rdev);
+ tmp = p->rdev;
if (test_bit(WantReplacement, &tmp->flags) &&
mddev->reshape_position == MaxSector &&
p->replacement == NULL) {
@@ -8356,7 +8298,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = disk;
err = 0;
conf->fullsync = 1;
- rcu_assign_pointer(p->replacement, rdev);
+ WRITE_ONCE(p->replacement, rdev);
break;
}
}
@@ -8489,7 +8431,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (mddev->recovery_cp < MaxSector)
return -EBUSY;
for (i = 0; i < conf->raid_disks; i++)
- if (rdev_mdlock_deref(mddev, conf->disks[i].replacement))
+ if (conf->disks[i].replacement)
return -EBUSY;
rdev_for_each(rdev, mddev) {
@@ -8639,12 +8581,10 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
- rdev = rdev_mdlock_deref(mddev,
- conf->disks[d].rdev);
+ rdev = conf->disks[d].rdev;
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = rdev_mdlock_deref(mddev,
- conf->disks[d].replacement);
+ rdev = conf->disks[d].replacement;
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 22bea20ecc..9b5a7dc3f2 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -473,8 +473,8 @@ enum {
*/
struct disk_info {
- struct md_rdev __rcu *rdev;
- struct md_rdev __rcu *replacement;
+ struct md_rdev *rdev;
+ struct md_rdev *replacement;
struct page *extra_page; /* extra page to use in prexor */
};