diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:27 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:27 +0000 |
commit | 34996e42f82bfd60bc2c191e5cae3c6ab233ec6c (patch) | |
tree | 62db60558cbf089714b48daeabca82bf2b20b20e /drivers/md/raid1.c | |
parent | Adding debian version 6.8.12-1. (diff) | |
download | linux-34996e42f82bfd60bc2c191e5cae3c6ab233ec6c.tar.xz linux-34996e42f82bfd60bc2c191e5cae3c6ab233ec6c.zip |
Merging upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 505 |
1 files changed, 291 insertions, 214 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4f3c35f132..7b8a71ca66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,9 +46,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); -#define raid1_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) - #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. */ - sector_t first_bad; - int bad_sectors; - r1_bio->bios[mirror] = NULL; to_put = bio; /* @@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !discard_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -582,180 +576,242 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector, return len; } -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ -static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +static void update_read_sectors(struct r1conf *conf, int disk, + sector_t this_sector, int len) { - const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; - int best_disk, best_dist_disk, best_pending_disk, sequential_disk; + struct raid1_info *info = &conf->mirrors[disk]; + + atomic_inc(&info->rdev->nr_pending); + if (info->next_seq_sect != this_sector) + info->seq_start = this_sector; + info->next_seq_sect = this_sector + len; +} + +static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int len = r1_bio->sectors; int disk; - sector_t best_dist; - unsigned int min_pending; - struct md_rdev *rdev; - int choose_first; - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. - */ - retry: - sectors = r1_bio->sectors; - best_disk = -1; - best_dist_disk = -1; - sequential_disk = -1; - best_dist = MaxSector; - best_pending_disk = -1; - min_pending = UINT_MAX; - best_good_sectors = 0; - clear_bit(R1BIO_FailFast, &r1_bio->state); + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int read_len; - if ((conf->mddev->recovery_cp < this_sector + sectors) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) - choose_first = 1; - else - choose_first = 0; + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + + /* choose the first disk even if it has some bad blocks. */ + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > 0) { + update_read_sectors(conf, disk, this_sector, read_len); + *max_sectors = read_len; + return disk; + } + } + + return -1; +} + +static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int best_disk = -1; + int best_len = 0; + int disk; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { - sector_t dist; - sector_t first_bad; - int bad_sectors; - unsigned int pending; + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; rdev = conf->mirrors[disk].rdev; - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || test_bit(Faulty, &rdev->flags)) + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(WriteMostly, &rdev->flags)) continue; - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < this_sector + sectors) + + /* keep track of the disk with the most readable sectors. */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > best_len) { + best_disk = disk; + best_len = read_len; + } + } + + if (best_disk != -1) { + *max_sectors = best_len; + update_read_sectors(conf, best_disk, this_sector, best_len); + } + + return best_disk; +} + +static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int bb_disk = -1; + int bb_read_len = 0; + int disk; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) continue; - if (test_bit(WriteMostly, &rdev->flags)) { - /* Don't balance among write-mostly, just - * use the first as a last resort */ - if (best_dist_disk < 0) { - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (first_bad <= this_sector) - /* Cannot use this */ - continue; - best_good_sectors = first_bad - this_sector; - } else - best_good_sectors = sectors; - best_dist_disk = disk; - best_pending_disk = disk; - } + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || + !test_bit(WriteMostly, &rdev->flags)) continue; + + /* there are no bad blocks, we can use this disk */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len == r1_bio->sectors) { + update_read_sectors(conf, disk, this_sector, read_len); + return disk; } - /* This is a reasonable device to use. It might - * even be best. + + /* + * there are partial bad blocks, choose the rdev with largest + * read length. */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. - */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else { - if ((sectors > best_good_sectors) && (best_disk >= 0)) - best_disk = -1; - best_good_sectors = sectors; + if (read_len > bb_read_len) { + bb_disk = disk; + bb_read_len = read_len; } + } + + if (bb_disk != -1) { + *max_sectors = bb_read_len; + update_read_sectors(conf, bb_disk, this_sector, bb_read_len); + } + + return bb_disk; +} + +static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio) +{ + /* TODO: address issues with this check and concurrency. */ + return conf->mirrors[disk].next_seq_sect == r1_bio->sector || + conf->mirrors[disk].head_position == r1_bio->sector; +} + +/* + * If buffered sequential IO size exceeds optimal iosize, check if there is idle + * disk. If yes, choose the idle disk. + */ +static bool should_choose_next(struct r1conf *conf, int disk) +{ + struct raid1_info *mirror = &conf->mirrors[disk]; + int opt_iosize; + + if (!test_bit(Nonrot, &mirror->rdev->flags)) + return false; - if (best_disk >= 0) - /* At least two disks to choose from so failfast is OK */ + opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9; + return opt_iosize > 0 && mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= mirror->seq_start; +} + +static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + if (!rdev || test_bit(Faulty, &rdev->flags)) + return false; + + /* still in recovery */ + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) + return false; + + /* don't read from slow disk unless have to */ + if (test_bit(WriteMostly, &rdev->flags)) + return false; + + /* don't split IO for bad blocks unless have to */ + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors)) + return false; + + return true; +} + +struct read_balance_ctl { + sector_t closest_dist; + int closest_dist_disk; + int min_pending; + int min_pending_disk; + int sequential_disk; + int readable_disks; +}; + +static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + struct read_balance_ctl ctl = { + .closest_dist_disk = -1, + .closest_dist = MaxSector, + .min_pending_disk = -1, + .min_pending = UINT_MAX, + .sequential_disk = -1, + }; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + sector_t dist; + unsigned int pending; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev_readable(rdev, r1_bio)) + continue; + + /* At least two disks to choose from so failfast is OK */ + if (ctl.readable_disks++ == 1) set_bit(R1BIO_FailFast, &r1_bio->state); pending = atomic_read(&rdev->nr_pending); - dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first) { - best_disk = disk; - break; - } + dist = abs(r1_bio->sector - conf->mirrors[disk].head_position); + /* Don't change to another disk for sequential reads */ - if (conf->mirrors[disk].next_seq_sect == this_sector - || dist == 0) { - int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; - struct raid1_info *mirror = &conf->mirrors[disk]; + if (is_sequential(conf, disk, r1_bio)) { + if (!should_choose_next(conf, disk)) + return disk; /* - * If buffered sequential IO size exceeds optimal - * iosize, check if there is idle disk. If yes, choose - * the idle disk. read_balance could already choose an - * idle disk before noticing it's a sequential IO in - * this disk. This doesn't matter because this disk - * will idle, next time it will be utilized after the - * first disk has IO size exceeds optimal iosize. In - * this way, iosize of the first disk will be optimal - * iosize at least. iosize of the second disk might be - * small, but not a big deal since when the second disk - * starts IO, the first disk is likely still busy. + * Add 'pending' to avoid choosing this disk if + * there is other idle disk. */ - if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 && - mirror->seq_start != MaxSector && - mirror->next_seq_sect > opt_iosize && - mirror->next_seq_sect - opt_iosize >= - mirror->seq_start) { - /* - * Add 'pending' to avoid choosing this disk if - * there is other idle disk. - */ - pending++; - /* - * If there is no other idle disk, this disk - * will be chosen. - */ - sequential_disk = disk; - } else { - best_disk = disk; - break; - } + pending++; + /* + * If there is no other idle disk, this disk + * will be chosen. + */ + ctl.sequential_disk = disk; } - if (min_pending > pending) { - min_pending = pending; - best_pending_disk = disk; + if (ctl.min_pending > pending) { + ctl.min_pending = pending; + ctl.min_pending_disk = disk; } - if (dist < best_dist) { - best_dist = dist; - best_dist_disk = disk; + if (ctl.closest_dist > dist) { + ctl.closest_dist = dist; + ctl.closest_dist_disk = disk; } } @@ -763,8 +819,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect * sequential IO size exceeds optimal iosize, however, there is no other * idle disk, so choose the sequential disk. */ - if (best_disk == -1 && min_pending != 0) - best_disk = sequential_disk; + if (ctl.sequential_disk != -1 && ctl.min_pending != 0) + return ctl.sequential_disk; /* * If all disks are rotational, choose the closest disk. If any disk is @@ -772,28 +828,60 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect * disk is rotational, which might/might not be optimal for raids with * mixed ratation/non-rotational disks depending on workload. */ - if (best_disk == -1) { - if (READ_ONCE(conf->nonrot_disks) || min_pending == 0) - best_disk = best_pending_disk; - else - best_disk = best_dist_disk; - } + if (ctl.min_pending_disk != -1 && + (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0)) + return ctl.min_pending_disk; + else + return ctl.closest_dist_disk; +} - if (best_disk >= 0) { - rdev = conf->mirrors[best_disk].rdev; - if (!rdev) - goto retry; - atomic_inc(&rdev->nr_pending); - sectors = best_good_sectors; +/* + * This routine returns the disk from which the requested read should be done. + * + * 1) If resync is in progress, find the first usable disk and use it even if it + * has some bad blocks. + * + * 2) Now that there is no resync, loop through all disks and skipping slow + * disks and disks with bad blocks for now. Only pay attention to key disk + * choice. + * + * 3) If we've made it this far, now look for disks with bad blocks and choose + * the one with most number of sectors. + * + * 4) If we are all the way at the end, we have no choice but to use a disk even + * if it is write mostly. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + int disk; + + clear_bit(R1BIO_FailFast, &r1_bio->state); - if (conf->mirrors[best_disk].next_seq_sect != this_sector) - conf->mirrors[best_disk].seq_start = this_sector; + if (raid1_should_read_first(conf->mddev, r1_bio->sector, + r1_bio->sectors)) + return choose_first_rdev(conf, r1_bio, max_sectors); - conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; + disk = choose_best_rdev(conf, r1_bio); + if (disk >= 0) { + *max_sectors = r1_bio->sectors; + update_read_sectors(conf, disk, r1_bio->sector, + r1_bio->sectors); + return disk; } - *max_sectors = sectors; - return best_disk; + /* + * If we are here it means we didn't find a perfectly good disk so + * now spend a bit more time trying to find one with the most good + * sectors. + */ + disk = choose_bb_rdev(conf, r1_bio, max_sectors); + if (disk >= 0) + return disk; + + return choose_slow_rdev(conf, r1_bio, max_sectors); } static void wake_up_barrier(struct r1conf *conf) @@ -1105,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra) */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; - raid1_log(conf->mddev, "wait freeze"); + mddev_add_trace_msg(conf->mddev, "raid1 wait freeze"); wait_event_lock_irq_cmd( conf->wait_barrier, get_unqueued_pending(conf) == extra, @@ -1294,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ - raid1_log(mddev, "wait behind writes"); + mddev_add_trace_msg(mddev, "raid1 wait behind writes"); wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } @@ -1327,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r1_bio; - - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); - + mddev_trace_remap(mddev, read_bio, r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1481,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio_wouldblock_error(bio); return; } - raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", + blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf, bio->bi_iter.bi_sector, false); goto retry_write; @@ -1564,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - - if (mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), - r1_bio->sector); + mddev_trace_remap(mddev, mbio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { @@ -1844,12 +1926,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; raid1_add_conf(conf, rdev, mirror, false); - err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -1986,8 +2067,6 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - sector_t first_bad; - int bad_sectors; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { @@ -1997,14 +2076,11 @@ static void end_sync_write(struct bio *bio) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) + } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, r1_bio->sectors)) { set_bit(R1BIO_MadeGood, &r1_bio->state); + } put_sync_write_buf(r1_bio, uptodate); } @@ -2321,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, sect, s) == 0) { atomic_inc(&rdev->nr_pending); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) @@ -3122,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } +static int raid1_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; int i; - struct md_rdev *rdev; int ret; if (mddev->level != 1) { @@ -3154,14 +3235,10 @@ static int raid1_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (mddev->queue) - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - if (!mddev->gendisk) - continue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (!mddev_is_dm(mddev)) { + ret = raid1_set_limits(mddev); + if (ret) + goto abort; } mddev->degraded = 0; |