From 7c0639a3af697d4ae7a5db4d2ecc09eed43cad35 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 20:47:50 +0200 Subject: Merging upstream version 6.7.12. Signed-off-by: Daniel Baumann --- drivers/md/dm-bufio.c | 6 +- drivers/md/dm-crypt.c | 4 +- drivers/md/dm-integrity.c | 30 +++--- drivers/md/dm-io.c | 23 +++-- drivers/md/dm-kcopyd.c | 4 +- drivers/md/dm-log.c | 4 +- drivers/md/dm-raid.c | 97 +++++++++++++++----- drivers/md/dm-raid1.c | 6 +- drivers/md/dm-snap-persistent.c | 4 +- drivers/md/dm-snap.c | 4 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-verity.h | 4 +- drivers/md/dm-writecache.c | 8 +- drivers/md/dm.c | 26 ++++-- drivers/md/md-bitmap.c | 9 +- drivers/md/md-multipath.c | 9 -- drivers/md/md.c | 134 ++++++++++++++++----------- drivers/md/md.h | 44 +++++++-- drivers/md/raid1.c | 199 +++++++++++++++++++++------------------- drivers/md/raid1.h | 1 + drivers/md/raid10.c | 9 -- drivers/md/raid5.c | 55 +++++++---- 22 files changed, 419 insertions(+), 263 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index f03d7dba27..4f2808ef38 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1315,7 +1315,7 @@ static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector, io_req.mem.ptr.vma = (char *)b->data + offset; } - r = dm_io(&io_req, 1, ®ion, NULL); + r = dm_io(&io_req, 1, ®ion, NULL, IOPRIO_DEFAULT); if (unlikely(r)) b->end_io(b, errno_to_blk_status(r)); } @@ -2167,7 +2167,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) if (WARN_ON_ONCE(dm_bufio_in_request())) return -EINVAL; - return dm_io(&io_req, 1, &io_reg, NULL); + return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT); } EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); @@ -2191,7 +2191,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c if (WARN_ON_ONCE(dm_bufio_in_request())) return -EINVAL; /* discards are optional */ - return dm_io(&io_req, 1, &io_reg, NULL); + return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT); } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4ab4e8dcfd..35f5019395 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -53,11 +53,11 @@ struct convert_context { struct completion restart; struct bio *bio_in; - struct bio *bio_out; struct bvec_iter iter_in; + struct bio *bio_out; struct bvec_iter iter_out; - u64 cc_sector; atomic_t cc_pending; + u64 cc_sector; union { struct skcipher_request *req; struct aead_request *req_aead; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e8e8fc33d3..2cc30b9ab2 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -555,7 +555,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, blk_opf_t opf) } } - r = dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) return r; @@ -1073,7 +1073,7 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, blk_opf_t opf, io_loc.sector = ic->start + SB_SECTORS + sector; io_loc.count = n_sectors; - r = dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) { dm_integrity_io_error(ic, (opf & REQ_OP_MASK) == REQ_OP_READ ? 
"reading journal" : "writing journal", r); @@ -1190,7 +1190,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned int section, u io_loc.sector = target; io_loc.count = n_sectors; - r = dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) { WARN_ONCE(1, "asynchronous dm_io failed: %d", r); fn(-1UL, data); @@ -1519,7 +1519,7 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat fr.io_reg.count = 0, fr.ic = ic; init_completion(&fr.comp); - r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL); + r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL, IOPRIO_DEFAULT); BUG_ON(r); } @@ -1699,7 +1699,6 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks struct bio_vec bv; sector_t sector, logical_sector, area, offset; struct page *page; - void *buffer; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, @@ -1708,13 +1707,14 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks logical_sector = dio->range.logical_sector; page = mempool_alloc(&ic->recheck_pool, GFP_NOIO); - buffer = page_to_virt(page); __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos = 0; do { + sector_t alignment; char *mem; + char *buffer = page_to_virt(page); int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1727,7 +1727,15 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks io_loc.sector = sector; io_loc.count = ic->sectors_per_block; - r = dm_io(&io_req, 1, &io_loc, NULL); + /* Align the bio to logical block size */ + alignment = dio->range.logical_sector | bio_sectors(bio) | (PAGE_SIZE >> SECTOR_SHIFT); + alignment &= -alignment; + io_loc.sector = round_down(io_loc.sector, alignment); + io_loc.count += sector - io_loc.sector; + buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + io_loc.count = round_up(io_loc.count, alignment); + + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) { dio->bi_status = errno_to_blk_status(r); goto free_ret; @@ -1848,12 +1856,12 @@ again: r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? 
TAG_CMP : TAG_WRITE); if (unlikely(r)) { + if (likely(checksums != checksums_onstack)) + kfree(checksums); if (r > 0) { - integrity_recheck(dio, checksums); + integrity_recheck(dio, checksums_onstack); goto skip_io; } - if (likely(checksums != checksums_onstack)) - kfree(checksums); goto error; } @@ -2806,7 +2814,7 @@ next_chunk: io_loc.sector = get_data_sector(ic, area, offset); io_loc.count = n_sectors; - r = dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) { dm_integrity_io_error(ic, "reading data", r); goto err; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index f053ce2458..7409490259 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -305,7 +305,7 @@ static void km_dp_init(struct dpages *dp, void *data) */ static void do_region(const blk_opf_t opf, unsigned int region, struct dm_io_region *where, struct dpages *dp, - struct io *io) + struct io *io, unsigned short ioprio) { struct bio *bio; struct page *page; @@ -354,6 +354,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, &io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio->bi_end_io = endio; + bio->bi_ioprio = ioprio; store_io_and_region_in_bio(bio, io, region); if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) { @@ -383,7 +384,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, static void dispatch_io(blk_opf_t opf, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, - struct io *io, int sync) + struct io *io, int sync, unsigned short ioprio) { int i; struct dpages old_pages = *dp; @@ -400,7 +401,7 @@ static void dispatch_io(blk_opf_t opf, unsigned int num_regions, for (i = 0; i < num_regions; i++) { *dp = old_pages; if (where[i].count || (opf & REQ_PREFLUSH)) - do_region(opf, i, where + i, dp, io); + do_region(opf, i, where + i, dp, io, ioprio); } /* @@ -425,7 +426,7 @@ static void sync_io_complete(unsigned long error, void *context) static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, - unsigned long *error_bits) + unsigned long *error_bits, unsigned short ioprio) { struct io *io; struct sync_io sio; @@ -447,7 +448,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; - dispatch_io(opf, num_regions, where, dp, io, 1); + dispatch_io(opf, num_regions, where, dp, io, 1, ioprio); wait_for_completion_io(&sio.wait); @@ -459,7 +460,8 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, static int async_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, blk_opf_t opf, - struct dpages *dp, io_notify_fn fn, void *context) + struct dpages *dp, io_notify_fn fn, void *context, + unsigned short ioprio) { struct io *io; @@ -479,7 +481,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; - dispatch_io(opf, num_regions, where, dp, io, 0); + dispatch_io(opf, num_regions, where, dp, io, 0, ioprio); return 0; } @@ -521,7 +523,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, } int dm_io(struct dm_io_request *io_req, unsigned int num_regions, - struct dm_io_region *where, unsigned long *sync_error_bits) + struct dm_io_region *where, unsigned long 
*sync_error_bits, + unsigned short ioprio) { int r; struct dpages dp; @@ -532,11 +535,11 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, if (!io_req->notify.fn) return sync_io(io_req->client, num_regions, where, - io_req->bi_opf, &dp, sync_error_bits); + io_req->bi_opf, &dp, sync_error_bits, ioprio); return async_io(io_req->client, num_regions, where, io_req->bi_opf, &dp, io_req->notify.fn, - io_req->notify.context); + io_req->notify.context, ioprio); } EXPORT_SYMBOL(dm_io); diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d01807c50f..79c65c9ad5 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -578,9 +578,9 @@ static int run_io_job(struct kcopyd_job *job) io_job_start(job->kc->throttle); if (job->op == REQ_OP_READ) - r = dm_io(&io_req, 1, &job->source, NULL); + r = dm_io(&io_req, 1, &job->source, NULL, IOPRIO_DEFAULT); else - r = dm_io(&io_req, job->num_dests, job->dests, NULL); + r = dm_io(&io_req, job->num_dests, job->dests, NULL, IOPRIO_DEFAULT); return r; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index f9f84236df..f7f9c21009 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -300,7 +300,7 @@ static int rw_header(struct log_c *lc, enum req_op op) { lc->io_req.bi_opf = op; - return dm_io(&lc->io_req, 1, &lc->header_location, NULL); + return dm_io(&lc->io_req, 1, &lc->header_location, NULL, IOPRIO_DEFAULT); } static int flush_header(struct log_c *lc) @@ -313,7 +313,7 @@ static int flush_header(struct log_c *lc) lc->io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - return dm_io(&lc->io_req, 1, &null_location, NULL); + return dm_io(&lc->io_req, 1, &null_location, NULL, IOPRIO_DEFAULT); } static int read_header(struct log_c *log) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index eb009d6bb0..d97355e9b9 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -213,6 +213,7 @@ struct raid_dev { #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 #define RT_FLAG_RS_GROW 8 +#define RT_FLAG_RS_FROZEN 9 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3240,11 +3241,12 @@ size_check: rs->md.ro = 1; rs->md.in_sync = 1; - /* Keep array frozen until resume. */ - set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); - /* Has to be held on running the array */ mddev_suspend_and_lock_nointr(&rs->md); + + /* Keep array frozen until resume. */ + md_frozen_sync_thread(&rs->md); + r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3329,17 +3331,18 @@ static int raid_map(struct dm_target *ti, struct bio *bio) struct mddev *mddev = &rs->md; /* - * If we're reshaping to add disk(s)), ti->len and + * If we're reshaping to add disk(s), ti->len and * mddev->array_sectors will differ during the process * (ti->len > mddev->array_sectors), so we have to requeue * bios with addresses > mddev->array_sectors here or * there will occur accesses past EOD of the component * data images thus erroring the raid set. 
*/ - if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) + if (unlikely(bio_has_data(bio) && bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - md_handle_request(mddev, bio); + if (unlikely(!md_handle_request(mddev, bio))) + return DM_MAPIO_REQUEUE; return DM_MAPIO_SUBMITTED; } @@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; + int ret = 0; if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (!strcasecmp(argv[0], "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) || + test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags)) + return -EBUSY; - if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) + if (!strcasecmp(argv[0], "frozen")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_frozen_sync_thread(mddev); + mddev_unlock(mddev); + } else if (!strcasecmp(argv[0], "idle")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_idle_sync_thread(mddev); + mddev_unlock(mddev); + } + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (decipher_sync_action(mddev, mddev->recovery) != st_idle) return -EBUSY; else if (!strcasecmp(argv[0], "resync")) ; /* MD_RECOVERY_NEEDED set below */ @@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); } +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + /* + * From now on, disallow raid_message() to change sync_thread until + * resume, raid_postsuspend() is too late. + */ + set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); + + if (!reshape_interrupted(mddev)) + return; + + /* + * For raid456, if reshape is interrupted, IO across reshape position + * will never make progress, while caller will wait for IO to be done. + * Inform raid456 to handle those IO to prevent deadlock. + */ + if (mddev->pers && mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); +} + +static void raid_presuspend_undo(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); +} + static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { - /* Writes have to be stopped before suspending to avoid deadlocks. */ - if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) - md_stop_writes(&rs->md); - + /* + * sync_thread must be stopped during suspend, and writes have + * to be stopped before suspending to avoid deadlocks. 
+ */ + md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); } } @@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - /* Be prepared for mddev_resume() in raid_resume() */ - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; @@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti) * Take this opportunity to check whether any failed * devices are reachable again. */ + mddev_lock_nointr(mddev); attempt_restore_of_faulty_devices(rs); + mddev_unlock(mddev); } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { @@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti) if (mddev->delta_disks < 0) rs_set_capacity(rs); + WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); + WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); mddev_lock_nointr(mddev); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; + md_unfrozen_sync_thread(mddev); mddev_unlock_and_resume(mddev); } } @@ -4074,6 +4123,8 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .presuspend_undo = raid_presuspend_undo, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ddcb2bc4a6..9511dae5b5 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -278,7 +278,7 @@ static int mirror_flush(struct dm_target *ti) } error_bits = -1; - dm_io(&io_req, ms->nr_mirrors, io, &error_bits); + dm_io(&io_req, ms->nr_mirrors, io, &error_bits, IOPRIO_DEFAULT); if (unlikely(error_bits != 0)) { for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error_bits)) @@ -554,7 +554,7 @@ static void read_async_bio(struct mirror *m, struct bio *bio) map_region(&io, m, bio); bio_set_m(bio, m); - BUG_ON(dm_io(&io_req, 1, &io, NULL)); + BUG_ON(dm_io(&io_req, 1, &io, NULL, IOPRIO_DEFAULT)); } static inline int region_in_sync(struct mirror_set *ms, region_t region, @@ -681,7 +681,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) */ bio_set_m(bio, get_default_mirror(ms)); - BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL, IOPRIO_DEFAULT)); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 15649921f2..568d10842b 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -223,7 +223,7 @@ static void do_metadata(struct work_struct *work) { struct mdata_req *req = container_of(work, struct mdata_req, work); - req->result = dm_io(req->io_req, 1, req->where, NULL); + req->result = dm_io(req->io_req, 1, req->where, NULL, IOPRIO_DEFAULT); } /* @@ -247,7 +247,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, blk_opf_t opf, struct mdata_req req; if (!metadata) - return dm_io(&io_req, 1, &where, NULL); + return dm_io(&io_req, 1, &where, NULL, IOPRIO_DEFAULT); req.where = &where; req.io_req = &io_req; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index bf7a574499..0ace06d1be 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -684,8 +684,10 @@ 
static void dm_exception_table_exit(struct dm_exception_table *et, for (i = 0; i < size; i++) { slot = et->table + i; - hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) { kmem_cache_free(mem, ex); + cond_resched(); + } } kvfree(et->table); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7b620b187d..49e4a35d70 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -511,7 +511,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, io_loc.bdev = v->data_dev->bdev; io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT); io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT); - r = dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) goto free_ret; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 4620a98c99..db93a91169 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -80,12 +80,12 @@ struct dm_verity_io { /* original value of bio->bi_end_io */ bio_end_io_t *orig_bi_end_io; + struct bvec_iter iter; + sector_t block; unsigned int n_blocks; bool in_tasklet; - struct bvec_iter iter; - struct work_struct work; char *recheck_buffer; diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 074cb785ea..6a4279bfb1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -531,7 +531,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) req.notify.context = &endio; /* writing via async dm-io (implied by notify.fn above) won't return an error */ - (void) dm_io(&req, 1, ®ion, NULL); + (void) dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); i = j; } @@ -568,7 +568,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) req.notify.fn = NULL; req.notify.context = NULL; - r = dm_io(&req, 1, ®ion, NULL); + r = dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); if (unlikely(r)) writecache_error(wc, r, "error writing superblock"); } @@ -596,7 +596,7 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) req.client = wc->dm_io; req.notify.fn = NULL; - r = dm_io(&req, 1, ®ion, NULL); + r = dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); if (unlikely(r)) writecache_error(wc, r, "error flushing metadata: %d", r); } @@ -990,7 +990,7 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors req.client = wc->dm_io; req.notify.fn = NULL; - return dm_io(&req, 1, ®ion, NULL); + return dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); } static void writecache_resume(struct dm_target *ti) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 23c32cd1f1..4ff9bebb81 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2945,6 +2945,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend static void __dm_internal_resume(struct mapped_device *md) { + int r; + struct dm_table *map; + BUG_ON(!md->internal_suspend_count); if (--md->internal_suspend_count) @@ -2953,12 +2956,23 @@ static void __dm_internal_resume(struct mapped_device *md) if (dm_suspended_md(md)) goto done; /* resume from nested suspend */ - /* - * NOTE: existing callers don't need to call dm_table_resume_targets - * (which may fail -- so best to avoid it for now by passing NULL map) - */ - (void) __dm_resume(md, NULL); - + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + r = __dm_resume(md, map); + if (r) { + /* + * If a preresume method 
of some target failed, we are in a + * tricky situation. We can't return an error to the caller. We + * can't fake success because then the "resume" and + * "postsuspend" methods would not be paired correctly, and it + * would break various targets, for example it would cause list + * corruption in the "origin" target. + * + * So, we fake normal suspend here, to make sure that the + * "resume" and "postsuspend" methods will be paired correctly. + */ + DMERR("Preresume method failed: %d", r); + set_bit(DMF_SUSPENDED, &md->flags); + } done: clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); smp_mb__after_atomic(); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9672f75c30..a4976ceae8 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (pg_index == store->file_pages - 1) { + /* we compare length (page numbers), not page offset. */ + if ((pg_index - store->sb_index) == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, struct page *page = store->filemap[pg_index]; if (mddev_is_clustered(bitmap->mddev)) { - pg_index += bitmap->cluster_slot * - DIV_ROUND_UP(store->bytes, PAGE_SIZE); + /* go to node bitmap area starting point */ + pg_index += store->sb_index; } if (store->file) @@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; + index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; @@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; + index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index d222768702..aa77133f31 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -258,15 +258,6 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } err = md_integrity_register(mddev); } abort: diff --git a/drivers/md/md.c b/drivers/md/md.c index 58889bc726..67befb598c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -99,18 +99,6 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. 
We divide the read error @@ -378,7 +366,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; } -void md_handle_request(struct mddev *mddev, struct bio *bio) +bool md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { @@ -386,7 +374,7 @@ check_suspended: /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); - return; + return true; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, @@ -402,10 +390,13 @@ check_suspended: if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); + if (!mddev->gendisk && mddev->pers->prepare_suspend) + return false; goto check_suspended; } percpu_ref_put(&mddev->active_io); + return true; } EXPORT_SYMBOL(md_handle_request); @@ -2582,6 +2573,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); + mddev_destroy_serial_pool(mddev, rdev); return err; } @@ -4944,6 +4936,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) mddev_lock_nointr(mddev); } +void md_idle_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, true); +} +EXPORT_SYMBOL_GPL(md_idle_sync_thread); + +void md_frozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); +} +EXPORT_SYMBOL_GPL(md_frozen_sync_thread); + +void md_unfrozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); +} +EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); + static void idle_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); @@ -6063,7 +6084,10 @@ int md_run(struct mddev *mddev) pr_warn("True protection against single-disk failure might be compromised.\n"); } - mddev->recovery = 0; + /* dm-raid expect sync_thread to be frozen until resume */ + if (mddev->gendisk) + mddev->recovery = 0; + /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; @@ -6303,7 +6327,15 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; - mddev->flags = 0; + /* + * Don't clear MD_CLOSING, or mddev can be opened again. + * 'hold_active != 0' means mddev is still in the creation + * process and will be used later. 
+ */ + if (mddev->hold_active) + mddev->flags = 0; + else + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; @@ -6340,7 +6372,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - stop_sync_thread(mddev, true, false); del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { @@ -6365,6 +6396,8 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -7649,7 +7682,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; - bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) return -ENOTTY; @@ -7733,7 +7765,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, err = -EBUSY; goto out; } - did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7875,7 +7906,7 @@ unlock: mddev_unlock(mddev); out: - if(did_set_md_closing) + if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) clear_bit(MD_CLOSING, &mddev->flags); return err; } @@ -8762,6 +8793,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio) } EXPORT_SYMBOL_GPL(md_account_bio); +void md_free_cloned_bio(struct bio *bio) +{ + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + + bio_put(bio); + percpu_ref_put(&mddev->active_io); +} +EXPORT_SYMBOL_GPL(md_free_cloned_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9295,9 +9343,14 @@ static bool md_spares_need_change(struct mddev *mddev) { struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev_removeable(rdev) || rdev_addable(rdev)) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_removeable(rdev) || rdev_addable(rdev)) { + rcu_read_unlock(); return true; + } + } + rcu_read_unlock(); return false; } @@ -9307,44 +9360,19 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; - bool remove_some = false; if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) /* Mustn't remove devices when resync thread is running */ return 0; rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - /* Faulty non-Blocked devices with nr_pending == 0 - * never get nr_pending incremented, - * never get Faulty cleared, and never get Blocked set. 
- * So we can synchronize_rcu now rather than once per device - */ - remove_some = true; - set_bit(RemoveSynchronized, &rdev->flags); - } - } - - if (remove_some) - synchronize_rcu(); - rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - (test_bit(RemoveSynchronized, &rdev->flags) || - rdev_removeable(rdev))) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; - rdev->raid_disk = -1; - removed++; - } + if ((this == NULL || rdev == this) && rdev_removeable(rdev) && + !mddev->pers->hot_remove_disk(mddev, rdev)) { + sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + removed++; } - if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) - clear_bit(RemoveSynchronized, &rdev->flags); } if (removed && mddev->kobj.sd) diff --git a/drivers/md/md.h b/drivers/md/md.h index ade83af123..375ad4a2df 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -190,11 +190,6 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, - RemoveSynchronized, /* synchronize_rcu() was called after - * this device was known to be faulty, - * so it is safe to remove without - * another synchronize_rcu() call. - */ ExternalBbl, /* External metadata provides bad * block management for a disk */ @@ -212,6 +207,7 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ + Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -563,6 +559,37 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool reshape_interrupted(struct mddev *mddev) +{ + /* reshape never start */ + if (mddev->reshape_position == MaxSector) + return false; + + /* interrupted */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return true; + + /* running reshape will be interrupted soon. */ + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return true; + + return false; +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -622,6 +649,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. 
* 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -755,6 +783,7 @@ extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); +void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, @@ -783,9 +812,12 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); -extern void md_handle_request(struct mddev *mddev, struct bio *bio); +extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); +extern void md_idle_sync_thread(struct mddev *mddev); +extern void md_frozen_sync_thread(struct mddev *mddev); +extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e138922d51..750a802478 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -600,16 +600,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect const sector_t this_sector = r1_bio->sector; int sectors; int best_good_sectors; - int best_disk, best_dist_disk, best_pending_disk; - int has_nonrot_disk; + int best_disk, best_dist_disk, best_pending_disk, sequential_disk; int disk; sector_t best_dist; unsigned int min_pending; struct md_rdev *rdev; int choose_first; - int choose_next_idle; - rcu_read_lock(); /* * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. @@ -619,12 +616,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect sectors = r1_bio->sectors; best_disk = -1; best_dist_disk = -1; + sequential_disk = -1; best_dist = MaxSector; best_pending_disk = -1; min_pending = UINT_MAX; best_good_sectors = 0; - has_nonrot_disk = 0; - choose_next_idle = 0; clear_bit(R1BIO_FailFast, &r1_bio->state); if ((conf->mddev->recovery_cp < this_sector + sectors) || @@ -640,9 +636,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect sector_t first_bad; int bad_sectors; unsigned int pending; - bool nonrot; - rdev = rcu_dereference(conf->mirrors[disk].rdev); + rdev = conf->mirrors[disk].rdev; if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL || test_bit(Faulty, &rdev->flags)) @@ -706,8 +701,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect /* At least two disks to choose from so failfast is OK */ set_bit(R1BIO_FailFast, &r1_bio->state); - nonrot = bdev_nonrot(rdev->bdev); - has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); dist = abs(this_sector - conf->mirrors[disk].head_position); if (choose_first) { @@ -720,7 +713,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; struct raid1_info *mirror = &conf->mirrors[disk]; - best_disk = disk; /* * If buffered sequential IO size exceeds optimal * iosize, check if there is idle disk. 
If yes, choose @@ -734,20 +726,27 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect * small, but not a big deal since when the second disk * starts IO, the first disk is likely still busy. */ - if (nonrot && opt_iosize > 0 && + if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 && mirror->seq_start != MaxSector && mirror->next_seq_sect > opt_iosize && mirror->next_seq_sect - opt_iosize >= mirror->seq_start) { - choose_next_idle = 1; - continue; + /* + * Add 'pending' to avoid choosing this disk if + * there is other idle disk. + */ + pending++; + /* + * If there is no other idle disk, this disk + * will be chosen. + */ + sequential_disk = disk; + } else { + best_disk = disk; + break; } - break; } - if (choose_next_idle) - continue; - if (min_pending > pending) { min_pending = pending; best_pending_disk = disk; @@ -759,6 +758,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } } + /* + * sequential IO size exceeds optimal iosize, however, there is no other + * idle disk, so choose the sequential disk. + */ + if (best_disk == -1 && min_pending != 0) + best_disk = sequential_disk; + /* * If all disks are rotational, choose the closest disk. If any disk is * non-rotational, choose the disk with less pending request even the @@ -766,14 +772,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect * mixed ratation/non-rotational disks depending on workload. */ if (best_disk == -1) { - if (has_nonrot_disk || min_pending == 0) + if (READ_ONCE(conf->nonrot_disks) || min_pending == 0) best_disk = best_pending_disk; else best_disk = best_dist_disk; } if (best_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[best_disk].rdev); + rdev = conf->mirrors[best_disk].rdev; if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); @@ -784,7 +790,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } - rcu_read_unlock(); *max_sectors = sectors; return best_disk; @@ -1235,14 +1240,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (r1bio_existed) { /* Need to get the block device name carefully */ - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (rdev) snprintf(b, sizeof(b), "%pg", rdev->bdev); else strcpy(b, "???"); - rcu_read_unlock(); } /* @@ -1396,10 +1399,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; blocked_rdev = NULL; - rcu_read_lock(); max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; /* * The write-behind io is only attempted on drives marked as @@ -1465,7 +1467,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } r1_bio->bios[i] = bio; } - rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ @@ -1617,15 +1618,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) struct r1conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = 
READ_ONCE(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } - rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1691,16 +1693,15 @@ static void print_conf(struct r1conf *conf) pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); - rcu_read_lock(); + lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; if (rdev) pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static void close_sync(struct r1conf *conf) @@ -1767,6 +1768,52 @@ static int raid1_spare_active(struct mddev *mddev) return count; } +static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, + bool replacement) +{ + struct raid1_info *info = conf->mirrors + disk; + + if (replacement) + info += conf->raid_disks; + + if (info->rdev) + return false; + + if (bdev_nonrot(rdev->bdev)) { + set_bit(Nonrot, &rdev->flags); + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); + } + + rdev->raid_disk = disk; + info->head_position = 0; + info->seq_start = MaxSector; + WRITE_ONCE(info->rdev, rdev); + + return true; +} + +static bool raid1_remove_conf(struct r1conf *conf, int disk) +{ + struct raid1_info *info = conf->mirrors + disk; + struct md_rdev *rdev = info->rdev; + + if (!rdev || test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) + return false; + + /* Only remove non-faulty devices if recovery is not possible. */ + if (!test_bit(Faulty, &rdev->flags) && + rdev->mddev->recovery_disabled != conf->recovery_disabled && + rdev->mddev->degraded < conf->raid_disks) + return false; + + if (test_and_clear_bit(Nonrot, &rdev->flags)) + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1); + + WRITE_ONCE(info->rdev, NULL); + return true; +} + static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1802,15 +1849,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - p->head_position = 0; - rdev->raid_disk = mirror; + raid1_add_conf(conf, rdev, mirror, false); err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1820,13 +1865,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err && repl_slot >= 0) { /* Add this device as a replacement */ - p = conf->mirrors + repl_slot; clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); - rdev->raid_disk = repl_slot; + raid1_add_conf(conf, rdev, repl_slot, true); err = 0; conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1843,36 +1886,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->raid_disks)) goto abort; - if (rdev != p->rdev) - p = conf->mirrors + conf->raid_disks + number; + if (rdev != p->rdev) { + number += conf->raid_disks; + p = conf->mirrors + number; + } print_conf(conf); if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { + if (!raid1_remove_conf(conf, number)) 
{ err = -EBUSY; goto abort; } - /* Only remove non-faulty devices if recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - mddev->degraded < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } - if (conf->mirrors[conf->raid_disks + number].rdev) { + + if (number < conf->raid_disks && + conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. @@ -1892,7 +1919,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } clear_bit(Replacement, &repl->flags); - p->rdev = repl; + WRITE_ONCE(p->rdev, repl); conf->mirrors[conf->raid_disks + number].rdev = NULL; unfreeze_array(conf); } @@ -2290,8 +2317,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, sector_t first_bad; int bad_sectors; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && @@ -2299,15 +2325,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk, is_badblock(rdev, sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) success = 1; rdev_dec_pending(rdev, mddev); if (success) break; - } else - rcu_read_unlock(); + } + d++; if (d == conf->raid_disks * 2) d = 0; @@ -2326,29 +2351,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, REQ_OP_WRITE); rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } d = start; while (d != read_disk) { if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, REQ_OP_READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2359,8 +2379,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, rdev->bdev); } rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } sectors -= s; sect += s; @@ -2741,7 +2760,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio = raid1_alloc_init_r1buf(conf); - rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, * we might want to read from a different device. 
So we @@ -2762,7 +2780,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev; bio = r1_bio->bios[i]; - rdev = rcu_dereference(conf->mirrors[i].rdev); + rdev = conf->mirrors[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) @@ -2820,7 +2838,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_opf |= MD_FAILFAST; } } - rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; @@ -3025,23 +3042,17 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); + conf->raid_disks = mddev->raid_disks; rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) + + if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; - if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + mddev->raid_disks + disk_idx; - else - disk = conf->mirrors + disk_idx; - if (disk->rdev) + if (!raid1_add_conf(conf, rdev, disk_idx, + test_bit(Replacement, &rdev->flags))) goto abort; - disk->rdev = rdev; - disk->head_position = 0; - disk->seq_start = MaxSector; } - conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 14d4211a12..5300cbaa58 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -71,6 +71,7 @@ struct r1conf { * allow for replacements. */ int raid_disks; + int nonrot_disks; spinlock_t device_lock; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7b0a573e7..6e828a6aa0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2247,15 +2247,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } - } if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6fe334bb95..e1d8b5199f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -763,6 +763,7 @@ enum stripe_result { STRIPE_RETRY, STRIPE_SCHEDULE_AND_RETRY, STRIPE_FAIL, + STRIPE_WAIT_RESHAPE, }; struct stripe_request_ctx { @@ -2422,7 +2423,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) atomic_inc(&conf->active_stripes); raid5_release_stripe(sh); - conf->max_nr_stripes++; + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1); return 1; } @@ -2717,7 +2718,7 @@ static int drop_one_stripe(struct r5conf *conf) shrink_buffers(sh); free_stripe(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); - conf->max_nr_stripes--; + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1); return 1; } @@ -5991,7 +5992,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6070,6 +6072,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) { + bi->bi_status = BLK_STS_RESOURCE; + ret = STRIPE_WAIT_RESHAPE; + 
pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress"); + } return ret; } @@ -6191,7 +6199,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); - if (res == STRIPE_FAIL) + if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) break; if (res == STRIPE_RETRY) @@ -6229,6 +6237,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); + if (res == STRIPE_WAIT_RESHAPE) { + md_free_cloned_bio(bi); + return false; + } + bio_endio(bi); return true; } @@ -6878,7 +6891,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) if (size <= 16 || size > 32768) return -EINVAL; - conf->min_nr_stripes = size; + WRITE_ONCE(conf->min_nr_stripes, size); mutex_lock(&conf->cache_size_mutex); while (size < conf->max_nr_stripes && drop_one_stripe(conf)) @@ -6890,7 +6903,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) if (!grow_one_stripe(conf, GFP_KERNEL)) { - conf->min_nr_stripes = conf->max_nr_stripes; + WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes); result = -ENOMEM; break; } @@ -7448,11 +7461,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct r5conf *conf = shrink->private_data; + int max_stripes = READ_ONCE(conf->max_nr_stripes); + int min_stripes = READ_ONCE(conf->min_nr_stripes); - if (conf->max_nr_stripes < conf->min_nr_stripes) + if (max_stripes < min_stripes) /* unlikely, but not impossible */ return 0; - return conf->max_nr_stripes - conf->min_nr_stripes; + return max_stripes - min_stripes; } static struct r5conf *setup_conf(struct mddev *mddev) @@ -8241,15 +8256,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - lockdep_assert_held(&mddev->reconfig_mutex); - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - rcu_assign_pointer(*rdevp, rdev); - } - } if (!err) { err = log_modify(conf, rdev, false); if (err) @@ -8990,6 +8996,18 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +/* + * This is only used for dm-raid456, caller already frozen sync_thread, hence + * if rehsape is still in progress, io that is waiting for reshape can never be + * done now, hence wake up and handle those IO. 
+ */ +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -9013,6 +9031,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid5_personality = { @@ -9037,6 +9056,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid4_personality = @@ -9062,6 +9082,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static int __init raid5_init(void) -- cgit v1.2.3
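Note on the alignment arithmetic in the integrity_recheck() hunk above: OR-ing the starting sector, the bio length and the page size (in sectors) and isolating the lowest set bit with `x &= -x` yields the largest power-of-two granularity that divides all three, which the patch then uses to round the recheck read down/up to a logical-block-aligned region. A minimal stand-alone user-space sketch of that computation, using made-up sample values rather than the kernel's dm_io machinery:

#include <stdio.h>
#include <stdint.h>

/* Power-of-two round down/up, mirroring the kernel's round_down()/round_up(). */
static uint64_t rdown(uint64_t x, uint64_t a) { return x & ~(a - 1); }
static uint64_t rup(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	/* Hypothetical values, in 512-byte sectors. */
	uint64_t logical_sector = 7;	/* bio start			*/
	uint64_t bio_len        = 1;	/* bio length			*/
	uint64_t page_sectors   = 8;	/* PAGE_SIZE >> SECTOR_SHIFT	*/
	uint64_t sector         = 7;	/* sector the recheck reads	*/
	uint64_t count          = 1;	/* sectors the recheck reads	*/

	/* Largest power of two that divides all of the quantities. */
	uint64_t alignment = logical_sector | bio_len | page_sectors;
	alignment &= -alignment;

	/* Expand the read region to that alignment, as the hunk does. */
	uint64_t start = rdown(sector, alignment);
	count = rup(count + (sector - start), alignment);

	printf("alignment=%llu start=%llu count=%llu\n",
	       (unsigned long long)alignment,
	       (unsigned long long)start,
	       (unsigned long long)count);
	return 0;
}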