summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-bufio.c6
-rw-r--r--drivers/md/dm-crypt.c4
-rw-r--r--drivers/md/dm-integrity.c30
-rw-r--r--drivers/md/dm-io.c23
-rw-r--r--drivers/md/dm-kcopyd.c4
-rw-r--r--drivers/md/dm-log.c4
-rw-r--r--drivers/md/dm-raid.c97
-rw-r--r--drivers/md/dm-raid1.c6
-rw-r--r--drivers/md/dm-snap-persistent.c4
-rw-r--r--drivers/md/dm-snap.c4
-rw-r--r--drivers/md/dm-verity-target.c2
-rw-r--r--drivers/md/dm-verity.h4
-rw-r--r--drivers/md/dm-writecache.c8
-rw-r--r--drivers/md/dm.c26
-rw-r--r--drivers/md/md-bitmap.c9
-rw-r--r--drivers/md/md-multipath.c9
-rw-r--r--drivers/md/md.c134
-rw-r--r--drivers/md/md.h44
-rw-r--r--drivers/md/raid1.c199
-rw-r--r--drivers/md/raid1.h1
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c55
22 files changed, 419 insertions, 263 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f03d7dba2..4f2808ef3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1315,7 +1315,7 @@ static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
io_req.mem.ptr.vma = (char *)b->data + offset;
}
- r = dm_io(&io_req, 1, &region, NULL);
+ r = dm_io(&io_req, 1, &region, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
b->end_io(b, errno_to_blk_status(r));
}
@@ -2167,7 +2167,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
if (WARN_ON_ONCE(dm_bufio_in_request()))
return -EINVAL;
- return dm_io(&io_req, 1, &io_reg, NULL);
+ return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
@@ -2191,7 +2191,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
if (WARN_ON_ONCE(dm_bufio_in_request()))
return -EINVAL; /* discards are optional */
- return dm_io(&io_req, 1, &io_reg, NULL);
+ return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4ab4e8dcf..35f501939 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -53,11 +53,11 @@
struct convert_context {
struct completion restart;
struct bio *bio_in;
- struct bio *bio_out;
struct bvec_iter iter_in;
+ struct bio *bio_out;
struct bvec_iter iter_out;
- u64 cc_sector;
atomic_t cc_pending;
+ u64 cc_sector;
union {
struct skcipher_request *req;
struct aead_request *req_aead;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index e8e8fc33d..2cc30b9ab 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -555,7 +555,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, blk_opf_t opf)
}
}
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
return r;
@@ -1073,7 +1073,7 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, blk_opf_t opf,
io_loc.sector = ic->start + SB_SECTORS + sector;
io_loc.count = n_sectors;
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r)) {
dm_integrity_io_error(ic, (opf & REQ_OP_MASK) == REQ_OP_READ ?
"reading journal" : "writing journal", r);
@@ -1190,7 +1190,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned int section, u
io_loc.sector = target;
io_loc.count = n_sectors;
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r)) {
WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
fn(-1UL, data);
@@ -1519,7 +1519,7 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat
fr.io_reg.count = 0,
fr.ic = ic;
init_completion(&fr.comp);
- r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL);
+ r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL, IOPRIO_DEFAULT);
BUG_ON(r);
}
@@ -1699,7 +1699,6 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
struct bio_vec bv;
sector_t sector, logical_sector, area, offset;
struct page *page;
- void *buffer;
get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset,
@@ -1708,13 +1707,14 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
logical_sector = dio->range.logical_sector;
page = mempool_alloc(&ic->recheck_pool, GFP_NOIO);
- buffer = page_to_virt(page);
__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
unsigned pos = 0;
do {
+ sector_t alignment;
char *mem;
+ char *buffer = page_to_virt(page);
int r;
struct dm_io_request io_req;
struct dm_io_region io_loc;
@@ -1727,7 +1727,15 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
io_loc.sector = sector;
io_loc.count = ic->sectors_per_block;
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ /* Align the bio to logical block size */
+ alignment = dio->range.logical_sector | bio_sectors(bio) | (PAGE_SIZE >> SECTOR_SHIFT);
+ alignment &= -alignment;
+ io_loc.sector = round_down(io_loc.sector, alignment);
+ io_loc.count += sector - io_loc.sector;
+ buffer += (sector - io_loc.sector) << SECTOR_SHIFT;
+ io_loc.count = round_up(io_loc.count, alignment);
+
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r)) {
dio->bi_status = errno_to_blk_status(r);
goto free_ret;
@@ -1848,12 +1856,12 @@ again:
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
+ if (likely(checksums != checksums_onstack))
+ kfree(checksums);
if (r > 0) {
- integrity_recheck(dio, checksums);
+ integrity_recheck(dio, checksums_onstack);
goto skip_io;
}
- if (likely(checksums != checksums_onstack))
- kfree(checksums);
goto error;
}
@@ -2806,7 +2814,7 @@ next_chunk:
io_loc.sector = get_data_sector(ic, area, offset);
io_loc.count = n_sectors;
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r)) {
dm_integrity_io_error(ic, "reading data", r);
goto err;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index f053ce245..740949025 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -305,7 +305,7 @@ static void km_dp_init(struct dpages *dp, void *data)
*/
static void do_region(const blk_opf_t opf, unsigned int region,
struct dm_io_region *where, struct dpages *dp,
- struct io *io)
+ struct io *io, unsigned short ioprio)
{
struct bio *bio;
struct page *page;
@@ -354,6 +354,7 @@ static void do_region(const blk_opf_t opf, unsigned int region,
&io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
bio->bi_end_io = endio;
+ bio->bi_ioprio = ioprio;
store_io_and_region_in_bio(bio, io, region);
if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
@@ -383,7 +384,7 @@ static void do_region(const blk_opf_t opf, unsigned int region,
static void dispatch_io(blk_opf_t opf, unsigned int num_regions,
struct dm_io_region *where, struct dpages *dp,
- struct io *io, int sync)
+ struct io *io, int sync, unsigned short ioprio)
{
int i;
struct dpages old_pages = *dp;
@@ -400,7 +401,7 @@ static void dispatch_io(blk_opf_t opf, unsigned int num_regions,
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count || (opf & REQ_PREFLUSH))
- do_region(opf, i, where + i, dp, io);
+ do_region(opf, i, where + i, dp, io, ioprio);
}
/*
@@ -425,7 +426,7 @@ static void sync_io_complete(unsigned long error, void *context)
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, blk_opf_t opf, struct dpages *dp,
- unsigned long *error_bits)
+ unsigned long *error_bits, unsigned short ioprio)
{
struct io *io;
struct sync_io sio;
@@ -447,7 +448,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->vma_invalidate_address = dp->vma_invalidate_address;
io->vma_invalidate_size = dp->vma_invalidate_size;
- dispatch_io(opf, num_regions, where, dp, io, 1);
+ dispatch_io(opf, num_regions, where, dp, io, 1, ioprio);
wait_for_completion_io(&sio.wait);
@@ -459,7 +460,8 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
static int async_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, blk_opf_t opf,
- struct dpages *dp, io_notify_fn fn, void *context)
+ struct dpages *dp, io_notify_fn fn, void *context,
+ unsigned short ioprio)
{
struct io *io;
@@ -479,7 +481,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io->vma_invalidate_address = dp->vma_invalidate_address;
io->vma_invalidate_size = dp->vma_invalidate_size;
- dispatch_io(opf, num_regions, where, dp, io, 0);
+ dispatch_io(opf, num_regions, where, dp, io, 0, ioprio);
return 0;
}
@@ -521,7 +523,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
}
int dm_io(struct dm_io_request *io_req, unsigned int num_regions,
- struct dm_io_region *where, unsigned long *sync_error_bits)
+ struct dm_io_region *where, unsigned long *sync_error_bits,
+ unsigned short ioprio)
{
int r;
struct dpages dp;
@@ -532,11 +535,11 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions,
if (!io_req->notify.fn)
return sync_io(io_req->client, num_regions, where,
- io_req->bi_opf, &dp, sync_error_bits);
+ io_req->bi_opf, &dp, sync_error_bits, ioprio);
return async_io(io_req->client, num_regions, where,
io_req->bi_opf, &dp, io_req->notify.fn,
- io_req->notify.context);
+ io_req->notify.context, ioprio);
}
EXPORT_SYMBOL(dm_io);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d01807c50..79c65c9ad 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -578,9 +578,9 @@ static int run_io_job(struct kcopyd_job *job)
io_job_start(job->kc->throttle);
if (job->op == REQ_OP_READ)
- r = dm_io(&io_req, 1, &job->source, NULL);
+ r = dm_io(&io_req, 1, &job->source, NULL, IOPRIO_DEFAULT);
else
- r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+ r = dm_io(&io_req, job->num_dests, job->dests, NULL, IOPRIO_DEFAULT);
return r;
}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index f9f84236d..f7f9c2100 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int rw_header(struct log_c *lc, enum req_op op)
{
lc->io_req.bi_opf = op;
- return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
+ return dm_io(&lc->io_req, 1, &lc->header_location, NULL, IOPRIO_DEFAULT);
}
static int flush_header(struct log_c *lc)
@@ -313,7 +313,7 @@ static int flush_header(struct log_c *lc)
lc->io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
- return dm_io(&lc->io_req, 1, &null_location, NULL);
+ return dm_io(&lc->io_req, 1, &null_location, NULL, IOPRIO_DEFAULT);
}
static int read_header(struct log_c *log)
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index eb009d6bb..d97355e9b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -213,6 +213,7 @@ struct raid_dev {
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8
+#define RT_FLAG_RS_FROZEN 9
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -3240,11 +3241,12 @@ size_check:
rs->md.ro = 1;
rs->md.in_sync = 1;
- /* Keep array frozen until resume. */
- set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
-
/* Has to be held on running the array */
mddev_suspend_and_lock_nointr(&rs->md);
+
+ /* Keep array frozen until resume. */
+ md_frozen_sync_thread(&rs->md);
+
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3329,17 +3331,18 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
struct mddev *mddev = &rs->md;
/*
- * If we're reshaping to add disk(s)), ti->len and
+ * If we're reshaping to add disk(s), ti->len and
* mddev->array_sectors will differ during the process
* (ti->len > mddev->array_sectors), so we have to requeue
* bios with addresses > mddev->array_sectors here or
* there will occur accesses past EOD of the component
* data images thus erroring the raid set.
*/
- if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
+ if (unlikely(bio_has_data(bio) && bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE;
- md_handle_request(mddev, bio);
+ if (unlikely(!md_handle_request(mddev, bio)))
+ return DM_MAPIO_REQUEUE;
return DM_MAPIO_SUBMITTED;
}
@@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
+ int ret = 0;
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (!strcasecmp(argv[0], "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) ||
+ test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags))
+ return -EBUSY;
- if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
- }
- } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
+ if (!strcasecmp(argv[0], "frozen")) {
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ md_frozen_sync_thread(mddev);
+ mddev_unlock(mddev);
+ } else if (!strcasecmp(argv[0], "idle")) {
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ md_idle_sync_thread(mddev);
+ mddev_unlock(mddev);
+ }
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
return -EBUSY;
else if (!strcasecmp(argv[0], "resync"))
; /* MD_RECOVERY_NEEDED set below */
@@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
}
+static void raid_presuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ /*
+ * From now on, disallow raid_message() to change sync_thread until
+ * resume, raid_postsuspend() is too late.
+ */
+ set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
+
+ if (!reshape_interrupted(mddev))
+ return;
+
+ /*
+ * For raid456, if reshape is interrupted, IO across reshape position
+ * will never make progress, while caller will wait for IO to be done.
+ * Inform raid456 to handle those IO to prevent deadlock.
+ */
+ if (mddev->pers && mddev->pers->prepare_suspend)
+ mddev->pers->prepare_suspend(mddev);
+}
+
+static void raid_presuspend_undo(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
+}
+
static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
- /* Writes have to be stopped before suspending to avoid deadlocks. */
- if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
- md_stop_writes(&rs->md);
-
+ /*
+ * sync_thread must be stopped during suspend, and writes have
+ * to be stopped before suspending to avoid deadlocks.
+ */
+ md_stop_writes(&rs->md);
mddev_suspend(&rs->md, false);
}
}
@@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti)
}
/* Check for any resize/reshape on @rs and adjust/initiate */
- /* Be prepared for mddev_resume() in raid_resume() */
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
@@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti)
* Take this opportunity to check whether any failed
* devices are reachable again.
*/
+ mddev_lock_nointr(mddev);
attempt_restore_of_faulty_devices(rs);
+ mddev_unlock(mddev);
}
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
@@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti)
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
+ WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
+ WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+ clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
mddev_lock_nointr(mddev);
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
+ md_unfrozen_sync_thread(mddev);
mddev_unlock_and_resume(mddev);
}
}
@@ -4074,6 +4123,8 @@ static struct target_type raid_target = {
.message = raid_message,
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
+ .presuspend = raid_presuspend,
+ .presuspend_undo = raid_presuspend_undo,
.postsuspend = raid_postsuspend,
.preresume = raid_preresume,
.resume = raid_resume,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ddcb2bc4a..9511dae5b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -278,7 +278,7 @@ static int mirror_flush(struct dm_target *ti)
}
error_bits = -1;
- dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+ dm_io(&io_req, ms->nr_mirrors, io, &error_bits, IOPRIO_DEFAULT);
if (unlikely(error_bits != 0)) {
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error_bits))
@@ -554,7 +554,7 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
map_region(&io, m, bio);
bio_set_m(bio, m);
- BUG_ON(dm_io(&io_req, 1, &io, NULL));
+ BUG_ON(dm_io(&io_req, 1, &io, NULL, IOPRIO_DEFAULT));
}
static inline int region_in_sync(struct mirror_set *ms, region_t region,
@@ -681,7 +681,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
*/
bio_set_m(bio, get_default_mirror(ms));
- BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
+ BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL, IOPRIO_DEFAULT));
}
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 15649921f..568d10842 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -223,7 +223,7 @@ static void do_metadata(struct work_struct *work)
{
struct mdata_req *req = container_of(work, struct mdata_req, work);
- req->result = dm_io(req->io_req, 1, req->where, NULL);
+ req->result = dm_io(req->io_req, 1, req->where, NULL, IOPRIO_DEFAULT);
}
/*
@@ -247,7 +247,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, blk_opf_t opf,
struct mdata_req req;
if (!metadata)
- return dm_io(&io_req, 1, &where, NULL);
+ return dm_io(&io_req, 1, &where, NULL, IOPRIO_DEFAULT);
req.where = &where;
req.io_req = &io_req;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index bf7a57449..0ace06d1b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -684,8 +684,10 @@ static void dm_exception_table_exit(struct dm_exception_table *et,
for (i = 0; i < size; i++) {
slot = et->table + i;
- hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
+ hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) {
kmem_cache_free(mem, ex);
+ cond_resched();
+ }
}
kvfree(et->table);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 7b620b187..49e4a35d7 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -511,7 +511,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
io_loc.bdev = v->data_dev->bdev;
io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT);
io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT);
- r = dm_io(&io_req, 1, &io_loc, NULL);
+ r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
goto free_ret;
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 4620a98c9..db93a9116 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -80,12 +80,12 @@ struct dm_verity_io {
/* original value of bio->bi_end_io */
bio_end_io_t *orig_bi_end_io;
+ struct bvec_iter iter;
+
sector_t block;
unsigned int n_blocks;
bool in_tasklet;
- struct bvec_iter iter;
-
struct work_struct work;
char *recheck_buffer;
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 074cb785e..6a4279bfb 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -531,7 +531,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
req.notify.context = &endio;
/* writing via async dm-io (implied by notify.fn above) won't return an error */
- (void) dm_io(&req, 1, &region, NULL);
+ (void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
i = j;
}
@@ -568,7 +568,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
req.notify.fn = NULL;
req.notify.context = NULL;
- r = dm_io(&req, 1, &region, NULL);
+ r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
writecache_error(wc, r, "error writing superblock");
}
@@ -596,7 +596,7 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
req.client = wc->dm_io;
req.notify.fn = NULL;
- r = dm_io(&req, 1, &region, NULL);
+ r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
writecache_error(wc, r, "error flushing metadata: %d", r);
}
@@ -990,7 +990,7 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors
req.client = wc->dm_io;
req.notify.fn = NULL;
- return dm_io(&req, 1, &region, NULL);
+ return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}
static void writecache_resume(struct dm_target *ti)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 23c32cd1f..4ff9bebb8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2945,6 +2945,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend
static void __dm_internal_resume(struct mapped_device *md)
{
+ int r;
+ struct dm_table *map;
+
BUG_ON(!md->internal_suspend_count);
if (--md->internal_suspend_count)
@@ -2953,12 +2956,23 @@ static void __dm_internal_resume(struct mapped_device *md)
if (dm_suspended_md(md))
goto done; /* resume from nested suspend */
- /*
- * NOTE: existing callers don't need to call dm_table_resume_targets
- * (which may fail -- so best to avoid it for now by passing NULL map)
- */
- (void) __dm_resume(md, NULL);
-
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ r = __dm_resume(md, map);
+ if (r) {
+ /*
+ * If a preresume method of some target failed, we are in a
+ * tricky situation. We can't return an error to the caller. We
+ * can't fake success because then the "resume" and
+ * "postsuspend" methods would not be paired correctly, and it
+ * would break various targets, for example it would cause list
+ * corruption in the "origin" target.
+ *
+ * So, we fake normal suspend here, to make sure that the
+ * "resume" and "postsuspend" methods will be paired correctly.
+ */
+ DMERR("Preresume method failed: %d", r);
+ set_bit(DMF_SUSPENDED, &md->flags);
+ }
done:
clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
smp_mb__after_atomic();
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 9672f75c3..a4976ceae 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
- if (pg_index == store->file_pages - 1) {
+ /* we compare length (page numbers), not page offset. */
+ if ((pg_index - store->sb_index) == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) {
- pg_index += bitmap->cluster_slot *
- DIV_ROUND_UP(store->bytes, PAGE_SIZE);
+ /* go to node bitmap area starting point */
+ pg_index += store->sb_index;
}
if (store->file)
@@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
+ index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
+ index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index d22276870..aa77133f3 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -258,15 +258,6 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
p->rdev = NULL;
- if (!test_bit(RemoveSynchronized, &rdev->flags)) {
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- p->rdev = rdev;
- goto abort;
- }
- }
err = md_integrity_register(mddev);
}
abort:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 58889bc72..67befb598 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -99,18 +99,6 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -378,7 +366,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
}
-void md_handle_request(struct mddev *mddev, struct bio *bio)
+bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
if (is_suspended(mddev, bio)) {
@@ -386,7 +374,7 @@ check_suspended:
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
- return;
+ return true;
}
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
@@ -402,10 +390,13 @@ check_suspended:
if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io);
+ if (!mddev->gendisk && mddev->pers->prepare_suspend)
+ return false;
goto check_suspended;
}
percpu_ref_put(&mddev->active_io);
+ return true;
}
EXPORT_SYMBOL(md_handle_request);
@@ -2582,6 +2573,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
fail:
pr_warn("md: failed to register dev-%s for %s\n",
b, mdname(mddev));
+ mddev_destroy_serial_pool(mddev, rdev);
return err;
}
@@ -4944,6 +4936,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
mddev_lock_nointr(mddev);
}
+void md_idle_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, true);
+}
+EXPORT_SYMBOL_GPL(md_idle_sync_thread);
+
+void md_frozen_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, false);
+}
+EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
+
+void md_unfrozen_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+}
+EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
+
static void idle_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
@@ -6063,7 +6084,10 @@ int md_run(struct mddev *mddev)
pr_warn("True protection against single-disk failure might be compromised.\n");
}
- mddev->recovery = 0;
+ /* dm-raid expect sync_thread to be frozen until resume */
+ if (mddev->gendisk)
+ mddev->recovery = 0;
+
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
@@ -6303,7 +6327,15 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
- mddev->flags = 0;
+ /*
+ * Don't clear MD_CLOSING, or mddev can be opened again.
+ * 'hold_active != 0' means mddev is still in the creation
+ * process and will be used later.
+ */
+ if (mddev->hold_active)
+ mddev->flags = 0;
+ else
+ mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
@@ -6340,7 +6372,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- stop_sync_thread(mddev, true, false);
del_timer_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
@@ -6365,6 +6396,8 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev)
{
mddev_lock_nointr(mddev);
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, false);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@@ -7649,7 +7682,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
int err = 0;
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
- bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
@@ -7733,7 +7765,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
err = -EBUSY;
goto out;
}
- did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
@@ -7875,7 +7906,7 @@ unlock:
mddev_unlock(mddev);
out:
- if(did_set_md_closing)
+ if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
@@ -8762,6 +8793,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);
+void md_free_cloned_bio(struct bio *bio)
+{
+ struct md_io_clone *md_io_clone = bio->bi_private;
+ struct bio *orig_bio = md_io_clone->orig_bio;
+ struct mddev *mddev = md_io_clone->mddev;
+
+ if (bio->bi_status && !orig_bio->bi_status)
+ orig_bio->bi_status = bio->bi_status;
+
+ if (md_io_clone->start_time)
+ bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
+ bio_put(bio);
+ percpu_ref_put(&mddev->active_io);
+}
+EXPORT_SYMBOL_GPL(md_free_cloned_bio);
+
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
@@ -9295,9 +9343,14 @@ static bool md_spares_need_change(struct mddev *mddev)
{
struct md_rdev *rdev;
- rdev_for_each(rdev, mddev)
- if (rdev_removeable(rdev) || rdev_addable(rdev))
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev_removeable(rdev) || rdev_addable(rdev)) {
+ rcu_read_unlock();
return true;
+ }
+ }
+ rcu_read_unlock();
return false;
}
@@ -9307,44 +9360,19 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *rdev;
int spares = 0;
int removed = 0;
- bool remove_some = false;
if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
/* Mustn't remove devices when resync thread is running */
return 0;
rdev_for_each(rdev, mddev) {
- if ((this == NULL || rdev == this) &&
- rdev->raid_disk >= 0 &&
- !test_bit(Blocked, &rdev->flags) &&
- test_bit(Faulty, &rdev->flags) &&
- atomic_read(&rdev->nr_pending)==0) {
- /* Faulty non-Blocked devices with nr_pending == 0
- * never get nr_pending incremented,
- * never get Faulty cleared, and never get Blocked set.
- * So we can synchronize_rcu now rather than once per device
- */
- remove_some = true;
- set_bit(RemoveSynchronized, &rdev->flags);
- }
- }
-
- if (remove_some)
- synchronize_rcu();
- rdev_for_each(rdev, mddev) {
- if ((this == NULL || rdev == this) &&
- (test_bit(RemoveSynchronized, &rdev->flags) ||
- rdev_removeable(rdev))) {
- if (mddev->pers->hot_remove_disk(
- mddev, rdev) == 0) {
- sysfs_unlink_rdev(mddev, rdev);
- rdev->saved_raid_disk = rdev->raid_disk;
- rdev->raid_disk = -1;
- removed++;
- }
+ if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
+ !mddev->pers->hot_remove_disk(mddev, rdev)) {
+ sysfs_unlink_rdev(mddev, rdev);
+ rdev->saved_raid_disk = rdev->raid_disk;
+ rdev->raid_disk = -1;
+ removed++;
}
- if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
- clear_bit(RemoveSynchronized, &rdev->flags);
}
if (removed && mddev->kobj.sd)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ade83af12..375ad4a2d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -190,11 +190,6 @@ enum flag_bits {
* than other devices in the array
*/
ClusterRemove,
- RemoveSynchronized, /* synchronize_rcu() was called after
- * this device was known to be faulty,
- * so it is safe to remove without
- * another synchronize_rcu() call.
- */
ExternalBbl, /* External metadata provides bad
* block management for a disk
*/
@@ -212,6 +207,7 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
+ Nonrot, /* non-rotational device (SSD) */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -563,6 +559,37 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static inline bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
+static inline bool reshape_interrupted(struct mddev *mddev)
+{
+ /* reshape never start */
+ if (mddev->reshape_position == MaxSector)
+ return false;
+
+ /* interrupted */
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return true;
+
+ /* running reshape will be interrupted soon. */
+ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ return true;
+
+ return false;
+}
+
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -622,6 +649,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
+ void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -755,6 +783,7 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
+void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@@ -783,9 +812,12 @@ extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
-extern void md_handle_request(struct mddev *mddev, struct bio *bio);
+extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
+extern void md_idle_sync_thread(struct mddev *mddev);
+extern void md_frozen_sync_thread(struct mddev *mddev);
+extern void md_unfrozen_sync_thread(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e138922d5..750a80247 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -600,16 +600,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
- int best_disk, best_dist_disk, best_pending_disk;
- int has_nonrot_disk;
+ int best_disk, best_dist_disk, best_pending_disk, sequential_disk;
int disk;
sector_t best_dist;
unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
- int choose_next_idle;
- rcu_read_lock();
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
@@ -619,12 +616,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
+ sequential_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
- has_nonrot_disk = 0;
- choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
@@ -640,9 +636,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
sector_t first_bad;
int bad_sectors;
unsigned int pending;
- bool nonrot;
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| test_bit(Faulty, &rdev->flags))
@@ -706,8 +701,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
/* At least two disks to choose from so failfast is OK */
set_bit(R1BIO_FailFast, &r1_bio->state);
- nonrot = bdev_nonrot(rdev->bdev);
- has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first) {
@@ -720,7 +713,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
struct raid1_info *mirror = &conf->mirrors[disk];
- best_disk = disk;
/*
* If buffered sequential IO size exceeds optimal
* iosize, check if there is idle disk. If yes, choose
@@ -734,20 +726,27 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/
- if (nonrot && opt_iosize > 0 &&
+ if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 &&
mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
mirror->seq_start) {
- choose_next_idle = 1;
- continue;
+ /*
+ * Add 'pending' to avoid choosing this disk if
+ * there is other idle disk.
+ */
+ pending++;
+ /*
+ * If there is no other idle disk, this disk
+ * will be chosen.
+ */
+ sequential_disk = disk;
+ } else {
+ best_disk = disk;
+ break;
}
- break;
}
- if (choose_next_idle)
- continue;
-
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
@@ -760,20 +759,27 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
}
/*
+ * sequential IO size exceeds optimal iosize, however, there is no other
+ * idle disk, so choose the sequential disk.
+ */
+ if (best_disk == -1 && min_pending != 0)
+ best_disk = sequential_disk;
+
+ /*
* If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the
* disk is rotational, which might/might not be optimal for raids with
* mixed ratation/non-rotational disks depending on workload.
*/
if (best_disk == -1) {
- if (has_nonrot_disk || min_pending == 0)
+ if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
best_disk = best_pending_disk;
else
best_disk = best_dist_disk;
}
if (best_disk >= 0) {
- rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ rdev = conf->mirrors[best_disk].rdev;
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
@@ -784,7 +790,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
- rcu_read_unlock();
*max_sectors = sectors;
return best_disk;
@@ -1235,14 +1240,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (r1bio_existed) {
/* Need to get the block device name carefully */
- struct md_rdev *rdev;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
+ struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
+
if (rdev)
snprintf(b, sizeof(b), "%pg", rdev->bdev);
else
strcpy(b, "???");
- rcu_read_unlock();
}
/*
@@ -1396,10 +1399,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
disks = conf->raid_disks * 2;
blocked_rdev = NULL;
- rcu_read_lock();
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
/*
* The write-behind io is only attempted on drives marked as
@@ -1465,7 +1467,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->bios[i] = bio;
}
- rcu_read_unlock();
if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */
@@ -1617,15 +1618,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
struct r1conf *conf = mddev->private;
int i;
+ lockdep_assert_held(&mddev->lock);
+
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
+
seq_printf(seq, "%s",
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf(seq, "]");
}
@@ -1691,16 +1693,15 @@ static void print_conf(struct r1conf *conf)
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
- rcu_read_lock();
+ lockdep_assert_held(&conf->mddev->reconfig_mutex);
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
if (rdev)
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
i, !test_bit(In_sync, &rdev->flags),
!test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
- rcu_read_unlock();
}
static void close_sync(struct r1conf *conf)
@@ -1767,6 +1768,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
+ bool replacement)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+
+ if (replacement)
+ info += conf->raid_disks;
+
+ if (info->rdev)
+ return false;
+
+ if (bdev_nonrot(rdev->bdev)) {
+ set_bit(Nonrot, &rdev->flags);
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+ }
+
+ rdev->raid_disk = disk;
+ info->head_position = 0;
+ info->seq_start = MaxSector;
+ WRITE_ONCE(info->rdev, rdev);
+
+ return true;
+}
+
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+ struct md_rdev *rdev = info->rdev;
+
+ if (!rdev || test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending))
+ return false;
+
+ /* Only remove non-faulty devices if recovery is not possible. */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+ rdev->mddev->degraded < conf->raid_disks)
+ return false;
+
+ if (test_and_clear_bit(Nonrot, &rdev->flags))
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
+ WRITE_ONCE(info->rdev, NULL);
+ return true;
+}
+
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
@@ -1802,15 +1849,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- p->head_position = 0;
- rdev->raid_disk = mirror;
+ raid1_add_conf(conf, rdev, mirror, false);
err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1820,13 +1865,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
- p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = repl_slot;
+ raid1_add_conf(conf, rdev, repl_slot, true);
err = 0;
conf->fullsync = 1;
- rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
@@ -1843,36 +1886,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks))
goto abort;
- if (rdev != p->rdev)
- p = conf->mirrors + conf->raid_disks + number;
+ if (rdev != p->rdev) {
+ number += conf->raid_disks;
+ p = conf->mirrors + number;
+ }
print_conf(conf);
if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
+ if (!raid1_remove_conf(conf, number)) {
err = -EBUSY;
goto abort;
}
- /* Only remove non-faulty devices if recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
- mddev->degraded < conf->raid_disks) {
- err = -EBUSY;
- goto abort;
- }
- p->rdev = NULL;
- if (!test_bit(RemoveSynchronized, &rdev->flags)) {
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- p->rdev = rdev;
- goto abort;
- }
- }
- if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+ if (number < conf->raid_disks &&
+ conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
@@ -1892,7 +1919,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
clear_bit(Replacement, &repl->flags);
- p->rdev = repl;
+ WRITE_ONCE(p->rdev, repl);
conf->mirrors[conf->raid_disks + number].rdev = NULL;
unfreeze_array(conf);
}
@@ -2290,8 +2317,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
sector_t first_bad;
int bad_sectors;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
@@ -2299,15 +2325,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
success = 1;
rdev_dec_pending(rdev, mddev);
if (success)
break;
- } else
- rcu_read_unlock();
+ }
+
d++;
if (d == conf->raid_disks * 2)
d = 0;
@@ -2326,29 +2351,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
r1_sync_page_io(rdev, sect, s,
conf->tmppage, REQ_OP_WRITE);
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
d = start;
while (d != read_disk) {
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, REQ_OP_READ)) {
atomic_add(s, &rdev->corrected_errors);
@@ -2359,8 +2379,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
sectors -= s;
sect += s;
@@ -2741,7 +2760,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
r1_bio = raid1_alloc_init_r1buf(conf);
- rcu_read_lock();
/*
* If we get a correctably read error during resync or recovery,
* we might want to read from a different device. So we
@@ -2762,7 +2780,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev;
bio = r1_bio->bios[i];
- rdev = rcu_dereference(conf->mirrors[i].rdev);
+ rdev = conf->mirrors[i].rdev;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
@@ -2820,7 +2838,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_opf |= MD_FAILFAST;
}
}
- rcu_read_unlock();
if (disk < 0)
disk = wonly;
r1_bio->read_disk = disk;
@@ -3025,23 +3042,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
+ conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
- if (disk_idx >= mddev->raid_disks
- || disk_idx < 0)
+
+ if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue;
- if (test_bit(Replacement, &rdev->flags))
- disk = conf->mirrors + mddev->raid_disks + disk_idx;
- else
- disk = conf->mirrors + disk_idx;
- if (disk->rdev)
+ if (!raid1_add_conf(conf, rdev, disk_idx,
+ test_bit(Replacement, &rdev->flags)))
goto abort;
- disk->rdev = rdev;
- disk->head_position = 0;
- disk->seq_start = MaxSector;
}
- conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14d4211a1..5300cbaa5 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements.
*/
int raid_disks;
+ int nonrot_disks;
spinlock_t device_lock;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b7b0a573e..6e828a6aa 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2247,15 +2247,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
*rdevp = NULL;
- if (!test_bit(RemoveSynchronized, &rdev->flags)) {
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- *rdevp = rdev;
- goto abort;
- }
- }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6fe334bb9..e1d8b5199 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -763,6 +763,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
+ STRIPE_WAIT_RESHAPE,
};
struct stripe_request_ctx {
@@ -2422,7 +2423,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
atomic_inc(&conf->active_stripes);
raid5_release_stripe(sh);
- conf->max_nr_stripes++;
+ WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
return 1;
}
@@ -2717,7 +2718,7 @@ static int drop_one_stripe(struct r5conf *conf)
shrink_buffers(sh);
free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
- conf->max_nr_stripes--;
+ WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
return 1;
}
@@ -5991,7 +5992,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6070,6 +6072,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
+ bi->bi_status = BLK_STS_RESOURCE;
+ ret = STRIPE_WAIT_RESHAPE;
+ pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
+ }
return ret;
}
@@ -6191,7 +6199,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
- if (res == STRIPE_FAIL)
+ if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break;
if (res == STRIPE_RETRY)
@@ -6229,6 +6237,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE)
md_write_end(mddev);
+ if (res == STRIPE_WAIT_RESHAPE) {
+ md_free_cloned_bio(bi);
+ return false;
+ }
+
bio_endio(bi);
return true;
}
@@ -6878,7 +6891,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
if (size <= 16 || size > 32768)
return -EINVAL;
- conf->min_nr_stripes = size;
+ WRITE_ONCE(conf->min_nr_stripes, size);
mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
@@ -6890,7 +6903,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL)) {
- conf->min_nr_stripes = conf->max_nr_stripes;
+ WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
result = -ENOMEM;
break;
}
@@ -7448,11 +7461,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = shrink->private_data;
+ int max_stripes = READ_ONCE(conf->max_nr_stripes);
+ int min_stripes = READ_ONCE(conf->min_nr_stripes);
- if (conf->max_nr_stripes < conf->min_nr_stripes)
+ if (max_stripes < min_stripes)
/* unlikely, but not impossible */
return 0;
- return conf->max_nr_stripes - conf->min_nr_stripes;
+ return max_stripes - min_stripes;
}
static struct r5conf *setup_conf(struct mddev *mddev)
@@ -8241,15 +8256,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
*rdevp = NULL;
- if (!test_bit(RemoveSynchronized, &rdev->flags)) {
- lockdep_assert_held(&mddev->reconfig_mutex);
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- rcu_assign_pointer(*rdevp, rdev);
- }
- }
if (!err) {
err = log_modify(conf, rdev, false);
if (err)
@@ -8990,6 +8996,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
+/*
+ * This is only used for dm-raid456, caller already frozen sync_thread, hence
+ * if rehsape is still in progress, io that is waiting for reshape can never be
+ * done now, hence wake up and handle those IO.
+ */
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ wake_up(&conf->wait_for_overlap);
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -9013,6 +9031,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid5_personality =
{
@@ -9037,6 +9056,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid4_personality =
@@ -9062,6 +9082,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static int __init raid5_init(void)