diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 1104 |
1 files changed, 641 insertions, 463 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8c40c1c395..58889bc726 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -82,6 +82,14 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; + +/* + * This workqueue is used for sync_work to register new sync_thread, and for + * del_work to remove rdev, and for event_work that is only set by dm-raid. + * + * Noted that sync_work will grab reconfig_mutex, hence never flush this + * workqueue whith reconfig_mutex grabbed. + */ static struct workqueue_struct *md_misc_wq; struct workqueue_struct *md_bitmap_wq; @@ -91,6 +99,18 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -206,8 +226,7 @@ static int rdev_need_serial(struct md_rdev *rdev) * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ -void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; @@ -215,15 +234,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, !test_bit(CollisionCheck, &rdev->flags)) return; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) - goto abort; + return; if (mddev->serial_info_pool == NULL) { /* @@ -238,10 +254,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, pr_err("can't alloc memory pool for serialization\n"); } } - -abort: - if (!is_suspend) - mddev_resume(mddev); } /* @@ -250,8 +262,7 @@ abort: * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -260,8 +271,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ - if (!is_suspend) - mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || @@ -283,8 +292,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } - if (!is_suspend) - mddev_resume(mddev); } } @@ -305,7 +312,6 @@ static struct ctl_table raid_table[] = { .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, - { } }; static int start_readonly; @@ -346,6 +352,10 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -359,11 +369,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; if (bio_data_dir(bio) != WRITE) return false; - if (mddev->suspend_lo >= mddev->suspend_hi) + if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio_end_sector(bio) < mddev->suspend_lo) + if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } @@ -431,42 +441,73 @@ static void md_submit_bio(struct bio *bio) md_handle_request(mddev, bio); } -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once mddev_detach() is called and completes, the module will be - * completely unused. +/* + * Make sure no new requests are submitted to the device, and any requests that + * have been submitted are completely handled. */ -void mddev_suspend(struct mddev *mddev) +int mddev_suspend(struct mddev *mddev, bool interruptible) { - struct md_thread *thread = rcu_dereference_protected(mddev->thread, - lockdep_is_held(&mddev->reconfig_mutex)); + int err = 0; - WARN_ON_ONCE(thread && current == thread->tsk); - if (mddev->suspended++) - return; - wake_up(&mddev->sb_wait); - set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - percpu_ref_kill(&mddev->active_io); + /* + * hold reconfig_mutex to wait for normal io will deadlock, because + * other context can't update super_block, and normal io can rely on + * updating super_block. + */ + lockdep_assert_not_held(&mddev->reconfig_mutex); - if (mddev->pers && mddev->pers->prepare_suspend) - mddev->pers->prepare_suspend(mddev); + if (interruptible) + err = mutex_lock_interruptible(&mddev->suspend_mutex); + else + mutex_lock(&mddev->suspend_mutex); + if (err) + return err; - wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); - wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); + if (mddev->suspended) { + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + mutex_unlock(&mddev->suspend_mutex); + return 0; + } + + percpu_ref_kill(&mddev->active_io); + if (interruptible) + err = wait_event_interruptible(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + else + wait_event(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + if (err) { + percpu_ref_resurrect(&mddev->active_io); + mutex_unlock(&mddev->suspend_mutex); + return err; + } + + /* + * For raid456, io might be waiting for reshape to make progress, + * allow new reshape to start while waiting for io to be done to + * prevent deadlock. + */ + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); + + mutex_unlock(&mddev->suspend_mutex); + return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); -void mddev_resume(struct mddev *mddev) +static void __mddev_resume(struct mddev *mddev, bool recovery_needed) { - lockdep_assert_held(&mddev->reconfig_mutex); - if (--mddev->suspended) + lockdep_assert_not_held(&mddev->reconfig_mutex); + + mutex_lock(&mddev->suspend_mutex); + WRITE_ONCE(mddev->suspended, mddev->suspended - 1); + if (mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); return; + } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); @@ -474,9 +515,17 @@ void mddev_resume(struct mddev *mddev) percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (recovery_needed) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); +} + +void mddev_resume(struct mddev *mddev) +{ + return __mddev_resume(mddev, true); } EXPORT_SYMBOL_GPL(mddev_resume); @@ -530,8 +579,12 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); + queue_work(md_wq, &mddev->flush_work); + } } static void md_submit_flush_data(struct work_struct *ws) @@ -626,34 +679,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev) static void mddev_delayed_delete(struct work_struct *ws); +static void __mddev_put(struct mddev *mddev) +{ + if (mddev->raid_disks || !list_empty(&mddev->disks) || + mddev->ctime || mddev->hold_active) + return; + + /* Array is not configured at all, and not held active, so destroy it */ + set_bit(MD_DELETED, &mddev->flags); + + /* + * Call queue_work inside the spinlock so that flush_workqueue() after + * mddev_find will succeed in waiting for the work to be done. + */ + queue_work(md_misc_wq, &mddev->del_work); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - set_bit(MD_DELETED, &mddev->flags); - /* - * Call queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will succeed in waiting - * for the work to be done. - */ - INIT_WORK(&mddev->del_work, mddev_delayed_delete); - queue_work(md_misc_wq, &mddev->del_work); - } + __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); +static void md_start_sync(struct work_struct *ws); + +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} + +static void no_op(struct percpu_ref *r) {} -void mddev_init(struct mddev *mddev) +int mddev_init(struct mddev *mddev) { + + if (percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + return -ENOMEM; + + if (percpu_ref_init(&mddev->writes_pending, no_op, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + percpu_ref_exit(&mddev->active_io); + return -ENOMEM; + } + + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); + mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -672,9 +754,21 @@ void mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + + INIT_WORK(&mddev->sync_work, md_start_sync); + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + + return 0; } EXPORT_SYMBOL_GPL(mddev_init); +void mddev_destroy(struct mddev *mddev) +{ + percpu_ref_exit(&mddev->active_io); + percpu_ref_exit(&mddev->writes_pending); +} +EXPORT_SYMBOL_GPL(mddev_destroy); + static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; @@ -718,13 +812,16 @@ static struct mddev *mddev_alloc(dev_t unit) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); - mddev_init(new); + + error = mddev_init(new); + if (error) + goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) - goto out_free_new; + goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -735,7 +832,7 @@ static struct mddev *mddev_alloc(dev_t unit) error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) - goto out_free_new; + goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } @@ -743,8 +840,11 @@ static struct mddev *mddev_alloc(dev_t unit) list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; -out_free_new: + +out_destroy_new: spin_unlock(&all_mddevs_lock); + mddev_destroy(new); +out_free_new: kfree(new); return ERR_PTR(error); } @@ -755,6 +855,7 @@ static void mddev_free(struct mddev *mddev) list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); + mddev_destroy(mddev); kfree(mddev); } @@ -940,9 +1041,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, - 1, - REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, - GFP_NOIO, &mddev->sync_set); + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META + | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); @@ -1122,6 +1224,7 @@ struct super_type { struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, + struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); @@ -1259,8 +1362,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor /* * validate_super for 0.90.0 + * note: we are not using "freshest" for 0.9 superblock */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); @@ -1772,7 +1876,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return ret; } -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); @@ -1868,13 +1972,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; + * spares (which don't need an event count). + * Similar to mdadm, we allow event counter difference of 1 + * from the freshest device. + */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) - if (ev1 < mddev->events) + if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an @@ -1895,8 +2001,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; - } else + } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { + /* + * If we are assembling, and our event counter is smaller than the + * highest event counter, we cannot trust our superblock about the role. + * It could happen that our rdev was marked as Faulty, and all other + * superblocks were updated with +1 event counter. + * Then, before the next superblock update, which typically happens when + * remove_and_add_spares() removes the device from the array, there was + * a crash or reboot. + * If we allow current rdev without consulting the freshest superblock, + * we could cause data corruption. + * Note that in this case our event counter is smaller by 1 than the + * highest, otherwise, this rdev would not be allowed into array; + * both kernel and mdadm allow event counter difference of 1. + */ + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); + + if (rdev->desc_nr >= freshest_max_dev) { + /* this is unexpected, better not proceed */ + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", + mdname(mddev), rdev->bdev, rdev->desc_nr, + freshest->bdev, freshest_max_dev); + return -EUCLEAN; + } + + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", + mdname(mddev), rdev->bdev, role, role, freshest->bdev); + } else { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + } switch(role) { case MD_DISK_ROLE_SPARE: /* spare */ break; @@ -2422,7 +2558,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2462,8 +2598,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, - test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); + bdev_release(rdev->bdev_handle); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -2475,7 +2610,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2804,12 +2939,8 @@ static int add_bound_rdev(struct md_rdev *rdev) * and should be added immediately. */ super_types[mddev->major_version]. - validate_super(mddev, rdev); - if (add_journal) - mddev_suspend(mddev); + validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); - if (add_journal) - mddev_resume(mddev); if (err) { md_kick_rdev_from_array(rdev); return err; @@ -2946,11 +3077,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_serial_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; @@ -3562,6 +3693,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; + bool suspend = false; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3569,17 +3701,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!mddev) + return -ENODEV; - if (entry->store == state_store && cmd_match(page, "remove")) - kn = sysfs_break_active_protection(kobj, attr); + if (entry->store == state_store) { + if (cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + if (cmd_match(page, "remove") || cmd_match(page, "re-add") || + cmd_match(page, "writemostly") || + cmd_match(page, "-writemostly")) + suspend = true; + } - rv = mddev ? mddev_lock(mddev) : -ENODEV; + rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) @@ -3643,7 +3783,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init); static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; - struct md_rdev *holder; sector_t size; int err; @@ -3658,21 +3797,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - if (super_format == -2) { - holder = &claim_rdev; - } else { - holder = rdev; - set_bit(Holder, &rdev->flags); - } - - rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - holder, NULL); - if (IS_ERR(rdev->bdev)) { + rdev->bdev_handle = bdev_open_by_dev(newdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, + super_format == -2 ? &claim_rdev : rdev, NULL); + if (IS_ERR(rdev->bdev_handle)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); - err = PTR_ERR(rdev->bdev); + err = PTR_ERR(rdev->bdev_handle); goto out_clear_rdev; } + rdev->bdev = rdev->bdev_handle->bdev; kobject_init(&rdev->kobj, &rdev_ktype); @@ -3703,7 +3837,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - blkdev_put(rdev->bdev, holder); + bdev_release(rdev->bdev_handle); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: @@ -3742,7 +3876,7 @@ static int analyze_sbs(struct mddev *mddev) } super_types[mddev->major_version]. - validate_super(mddev, freshest); + validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { @@ -3757,7 +3891,7 @@ static int analyze_sbs(struct mddev *mddev) } if (rdev != freshest) { if (super_types[mddev->major_version]. - validate_super(mddev, rdev)) { + validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); @@ -3884,12 +4018,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { - strncpy(mddev->clevel, buf, slen); + memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; @@ -3922,7 +4056,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Now find the new personality */ - strncpy(clevel, buf, slen); + memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; @@ -3977,7 +4111,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Looks like we have a winner */ - mddev_suspend(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); @@ -4063,14 +4196,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len) blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return rv; } @@ -4378,6 +4510,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); + /* No lock dependent actions */ + switch (st) { + case suspended: /* not supported yet */ + case write_pending: /* cannot be set */ + case active_idle: /* cannot be set */ + case broken: /* cannot be set */ + case bad_word: + return -EINVAL; + default: + break; + } + if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between @@ -4402,23 +4546,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_lock(mddev); if (err) return err; - err = -EINVAL; - switch(st) { - case bad_word: - break; - case clear: - /* stopping an active array */ - err = do_md_stop(mddev, 0, NULL); - break; + + switch (st) { case inactive: - /* stopping an active array */ + /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2, NULL); - else - err = 0; /* already inactive */ break; - case suspended: - break; /* not supported yet */ + case clear: + err = do_md_stop(mddev, 0, NULL); + break; case readonly: if (mddev->pers) err = md_set_readonly(mddev, NULL); @@ -4469,10 +4606,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = do_md_run(mddev); } break; - case write_pending: - case active_idle: - case broken: - /* these cannot be set */ + default: + err = -EINVAL; break; } @@ -4545,7 +4680,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { @@ -4566,14 +4701,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; @@ -4708,7 +4843,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; - strncpy(mddev->metadata_type, buf+9, namelen); + memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; @@ -4768,25 +4903,29 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } -static void stop_sync_thread(struct mddev *mddev) +/** + * stop_sync_thread() - wait for sync_thread to stop if it's running. + * @mddev: the array. + * @locked: if set, reconfig_mutex will still be held after this function + * return; if not set, reconfig_mutex will be released after this + * function return. + * @check_seq: if set, only wait for curent running sync_thread to stop, noted + * that new sync_thread can still start. + */ +static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) { - if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return; + int sync_seq; - if (mddev_lock(mddev)) - return; + if (check_seq) + sync_seq = atomic_read(&mddev->sync_seq); - /* - * Check again in case MD_RECOVERY_RUNNING is cleared before lock is - * held. - */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - mddev_unlock(mddev); + if (!locked) + mddev_unlock(mddev); return; } - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); + mddev_unlock(mddev); set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* @@ -4794,21 +4933,28 @@ static void stop_sync_thread(struct mddev *mddev) * never happen */ md_wakeup_thread_directly(mddev->sync_thread); + if (work_pending(&mddev->sync_work)) + flush_work(&mddev->sync_work); - mddev_unlock(mddev); + wait_event(resync_wait, + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + + if (locked) + mddev_lock_nointr(mddev); } static void idle_sync_thread(struct mddev *mddev) { - int sync_seq = atomic_read(&mddev->sync_seq); - mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev); - wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + if (mddev_lock(mddev)) { + mutex_unlock(&mddev->sync_mutex); + return; + } + stop_sync_thread(mddev, false, true); mutex_unlock(&mddev->sync_mutex); } @@ -4816,11 +4962,13 @@ static void frozen_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev); - wait_event(resync_wait, mddev->sync_thread == NULL && - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + if (mddev_lock(mddev)) { + mutex_unlock(&mddev->sync_mutex); + return; + } + stop_sync_thread(mddev, false, false); mutex_unlock(&mddev->sync_mutex); } @@ -4882,6 +5030,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) /* A write to sync_action is enough to justify * canceling read-auto mode */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } @@ -5138,7 +5287,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t @@ -5153,21 +5303,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_lo = new; + + WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -5175,7 +5318,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t @@ -5190,21 +5334,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_hi = new; + WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -5451,7 +5588,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (value == mddev->serialize_policy) return len; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { @@ -5460,15 +5597,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - mddev_suspend(mddev); if (value) - mddev_create_serial_pool(mddev, NULL, true); + mddev_create_serial_pool(mddev, NULL); else - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; - mddev_resume(mddev); unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -5607,21 +5742,6 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } -static void no_op(struct percpu_ref *r) {} - -int mddev_init_writes_pending(struct mddev *mddev) -{ - if (mddev->writes_pending.percpu_count_ptr) - return 0; - if (percpu_ref_init(&mddev->writes_pending, no_op, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) - return -ENOMEM; - /* We want to start with the refcount at zero */ - percpu_ref_put(&mddev->writes_pending); - return 0; -} -EXPORT_SYMBOL_GPL(mddev_init_writes_pending); - struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5793,12 +5913,6 @@ static void md_safemode_timeout(struct timer_list *t) } static int start_dirty_degraded; -static void active_io_release(struct percpu_ref *ref) -{ - struct mddev *mddev = container_of(ref, struct mddev, active_io); - - wake_up(&mddev->sb_wait); -} int md_run(struct mddev *mddev) { @@ -5879,15 +5993,10 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } - err = percpu_ref_init(&mddev->active_io, active_io_release, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); - if (err) - return err; - if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - goto exit_active_io; + return err; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); @@ -6084,8 +6193,6 @@ exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); -exit_active_io: - percpu_ref_exit(&mddev->active_io); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6233,14 +6340,7 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - + stop_sync_thread(mddev, true, false); del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { @@ -6259,7 +6359,7 @@ static void __md_stop_writes(struct mddev *mddev) } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) @@ -6287,9 +6387,6 @@ static void __md_stop(struct mddev *mddev) struct md_personality *pers = mddev->pers; md_bitmap_destroy(mddev); mddev_detach(mddev); - /* Ensure ->event_work is done */ - if (mddev->event_work.func) - flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -6301,7 +6398,6 @@ static void __md_stop(struct mddev *mddev) module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); bioset_exit(&mddev->io_clone_set); @@ -6316,7 +6412,6 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); - percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(md_stop); @@ -6334,18 +6429,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - /* - * Thread might be blocked waiting for metadata update which will now - * never happen - */ - md_wakeup_thread_directly(mddev->sync_thread); - - mddev_unlock(mddev); - wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, - &mddev->recovery)); + stop_sync_thread(mddev, false, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6399,20 +6484,8 @@ static int do_md_stop(struct mddev *mddev, int mode, set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - - /* - * Thread might be blocked waiting for metadata update which will now - * never happen - */ - md_wakeup_thread_directly(mddev->sync_thread); - mddev_unlock(mddev); - wait_event(resync_wait, (mddev->sync_thread == NULL && - !test_bit(MD_RECOVERY_RUNNING, - &mddev->recovery))); - mddev_lock_nointr(mddev); + stop_sync_thread(mddev, true, false); mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || @@ -6555,13 +6628,13 @@ static void autorun_devices(int part) if (IS_ERR(mddev)) break; - if (mddev_lock(mddev)) + if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; @@ -6571,7 +6644,7 @@ static void autorun_devices(int part) export_rdev(rdev, mddev); } autorun_array(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... @@ -6809,7 +6882,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't @@ -7121,7 +7194,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); @@ -7131,11 +7203,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) md_bitmap_destroy(mddev); fd = -1; } - mddev_resume(mddev); } else if (fd < 0) { - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); } } if (fd < 0) { @@ -7424,7 +7493,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); @@ -7432,7 +7500,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); - mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -7457,9 +7524,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7530,6 +7595,20 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static bool md_ioctl_need_suspend(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case SET_BITMAP_FILE: + case SET_ARRAY_INFO: + return true; + default: + return false; + } +} + static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; @@ -7658,7 +7737,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } - err = mddev_lock(mddev); + + if (!md_is_rdwr(mddev)) + flush_work(&mddev->sync_work); + + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : + mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); @@ -7786,7 +7870,10 @@ unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; - mddev_unlock(mddev); + + md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : + mddev_unlock(mddev); + out: if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); @@ -7898,7 +7985,6 @@ static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; - percpu_ref_exit(&mddev->writes_pending); mddev_free(mddev); } @@ -8075,6 +8161,19 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } +static void status_personalities(struct seq_file *seq) +{ + struct md_personality *pers; + + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); +} + static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; @@ -8214,105 +8313,43 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } static void *md_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&all_mddevs_lock) { - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; - - if (l == 0x10000) { - ++*pos; - return (void *)2; - } - if (l > 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; - + seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - if (!mddev_get(mddev)) - continue; - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; + + return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - struct mddev *to_put = NULL; - - ++*pos; - if (v == (void*)2) - return NULL; - - spin_lock(&all_mddevs_lock); - if (v == (void*)1) { - tmp = all_mddevs.next; - } else { - to_put = mddev; - tmp = mddev->all_mddevs.next; - } - - for (;;) { - if (tmp == &all_mddevs) { - next_mddev = (void*)2; - *pos = 0x10000; - break; - } - next_mddev = list_entry(tmp, struct mddev, all_mddevs); - if (mddev_get(next_mddev)) - break; - mddev = next_mddev; - tmp = mddev->all_mddevs.next; - } - spin_unlock(&all_mddevs_lock); - - if (to_put) - mddev_put(to_put); - return next_mddev; - + return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) + __releases(&all_mddevs_lock) { - struct mddev *mddev = v; - - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); + spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = v; + struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); + if (v == &all_mddevs) { + status_personalities(seq); + if (list_empty(&all_mddevs)) + status_unused(seq); return 0; } - if (v == (void*)2) { - status_unused(seq); + + mddev = list_entry(v, struct mddev, all_mddevs); + if (!mddev_get(mddev)) return 0; - } + spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), @@ -8383,6 +8420,13 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + spin_lock(&all_mddevs_lock); + + if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) + status_unused(seq); + + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); return 0; } @@ -8582,6 +8626,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -8772,12 +8817,16 @@ void md_do_sync(struct md_thread *thread) int ret; /* just incase thread restarts... */ - if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - return; + goto skip; } if (mddev_is_clustered(mddev)) { @@ -9168,12 +9217,90 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); - wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } EXPORT_SYMBOL_GPL(md_do_sync); +static bool rdev_removeable(struct md_rdev *rdev) +{ + /* rdev is not used. */ + if (rdev->raid_disk < 0) + return false; + + /* There are still inflight io, don't remove this rdev. */ + if (atomic_read(&rdev->nr_pending)) + return false; + + /* + * An error occurred but has not yet been acknowledged by the metadata + * handler, don't remove this rdev. + */ + if (test_bit(Blocked, &rdev->flags)) + return false; + + /* Fautly rdev is not used, it's safe to remove it. */ + if (test_bit(Faulty, &rdev->flags)) + return true; + + /* Journal disk can only be removed if it's faulty. */ + if (test_bit(Journal, &rdev->flags)) + return false; + + /* + * 'In_sync' is cleared while 'raid_disk' is valid, which means + * replacement has just become active from pers->spare_active(), and + * then pers->hot_remove_disk() will replace this rdev with replacement. + */ + if (!test_bit(In_sync, &rdev->flags)) + return true; + + return false; +} + +static bool rdev_is_spare(struct md_rdev *rdev) +{ + return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags); +} + +static bool rdev_addable(struct md_rdev *rdev) +{ + /* rdev is already used, don't add it again. */ + if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || + test_bit(Faulty, &rdev->flags)) + return false; + + /* Allow to add journal disk. */ + if (test_bit(Journal, &rdev->flags)) + return true; + + /* Allow to add if array is read-write. */ + if (md_is_rdwr(rdev->mddev)) + return true; + + /* + * For read-only array, only allow to readd a rdev. And if bitmap is + * used, don't allow to readd a rdev that is too old. + */ + if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) + return true; + + return false; +} + +static bool md_spares_need_change(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev_removeable(rdev) || rdev_addable(rdev)) + return true; + return false; +} + static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) { @@ -9206,12 +9333,8 @@ static int remove_and_add_spares(struct mddev *mddev, synchronize_rcu(); rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - ((test_bit(RemoveSynchronized, &rdev->flags) || - (!test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0)) { + (test_bit(RemoveSynchronized, &rdev->flags) || + rdev_removeable(rdev))) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -9233,25 +9356,12 @@ static int remove_and_add_spares(struct mddev *mddev, rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; - if (test_bit(Candidate, &rdev->flags)) - continue; - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + if (rdev_is_spare(rdev)) spares++; - if (rdev->raid_disk >= 0) + if (!rdev_addable(rdev)) continue; - if (test_bit(Faulty, &rdev->flags)) - continue; - if (!test_bit(Journal, &rdev->flags)) { - if (!md_is_rdwr(mddev) && - !(rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; - + if (!test_bit(Journal, &rdev->flags)) rdev->recovery_offset = 0; - } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { /* failure here is OK */ sysfs_link_rdev(mddev, rdev); @@ -9267,30 +9377,152 @@ no_add: return spares; } +static bool md_choose_sync_action(struct mddev *mddev, int *spares) +{ + /* Check if reshape is in progress first. */ + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + return false; + + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* + * Remove any failed drives, then add spares if possible. Spares are + * also removed and re-added, to allow the personality to fail the + * re-add. + */ + *spares = remove_and_add_spares(mddev, NULL); + if (*spares) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + + /* Start new recovery. */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if recovery is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Delay to choose resync/check/repair in md_do_sync(). */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + return true; + + /* Nothing to be done */ + return false; +} + static void md_start_sync(struct work_struct *ws) { - struct mddev *mddev = container_of(ws, struct mddev, del_work); + struct mddev *mddev = container_of(ws, struct mddev, sync_work); + int spares = 0; + bool suspend = false; + char *name; + + /* + * If reshape is still in progress, spares won't be added or removed + * from conf until reshape is done. + */ + if (mddev->reshape_position == MaxSector && + md_spares_need_change(mddev)) { + suspend = true; + mddev_suspend(mddev, false); + } + + mddev_lock_nointr(mddev); + if (!md_is_rdwr(mddev)) { + /* + * On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself is in-sync. + * As we only add devices that are already in-sync, we can + * activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + goto not_running; + } + + if (!md_choose_sync_action(mddev, &spares)) + goto not_running; + + if (!mddev->pers->sync_request) + goto not_running; + /* + * We are adding a device or devices to an array which has the bitmap + * stored on all devices. So make sure all bitmap pages get written. + */ + if (spares) + md_bitmap_write_all(mddev->bitmap); + + name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? + "reshape" : "resync"; rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "resync")); + md_register_thread(md_do_sync, mddev, name)); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); - } else - md_wakeup_thread(mddev->sync_thread); + goto not_running; + } + + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); + return; + +not_running: + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); +} + +static void unregister_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + return; + + md_reap_sync_thread(mddev); } /* @@ -9317,21 +9549,6 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); - } - - if (is_md_suspended(mddev)) - return; - if (mddev->bitmap) md_bitmap_daemon_work(mddev); @@ -9345,7 +9562,8 @@ void md_check_recovery(struct mddev *mddev) } if (!md_is_rdwr(mddev) && - !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || @@ -9358,7 +9576,6 @@ void md_check_recovery(struct mddev *mddev) return; if (mddev_trylock(mddev)) { - int spares = 0; bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) @@ -9366,30 +9583,42 @@ void md_check_recovery(struct mddev *mddev) if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + unregister_sync_thread(mddev); + goto unlock; + } + if (!mddev->external && mddev->in_sync) - /* 'Blocked' flag not needed as failed devices + /* + * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed. */ rdev_for_each(rdev, mddev) clear_bit(Blocked, &rdev->flags); - /* On a read-only array we can: - * - remove failed devices - * - add already-in_sync devices if the array itself - * is in-sync. - * As we only add devices that are already in-sync, - * we can activate the spares immediately. - */ - remove_and_add_spares(mddev, NULL); - /* There is no thread, but we need to call + + /* + * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); + + /* + * Let md_start_sync() to remove and add rdevs to the + * array. + */ + if (md_spares_need_change(mddev)) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + queue_work(md_misc_wq, &mddev->sync_work); + } + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + goto unlock; } @@ -9419,16 +9648,7 @@ void md_check_recovery(struct mddev *mddev) * still set. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - - if (WARN_ON_ONCE(!mddev->sync_thread)) - goto unlock; - - md_reap_sync_thread(mddev); + unregister_sync_thread(mddev); goto unlock; } @@ -9445,56 +9665,14 @@ void md_check_recovery(struct mddev *mddev) clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto not_running; - /* no recovery is running. - * remove any failed drives, then - * add spares if possible. - * Spares are also removed and re-added, to allow - * the personality to fail the re-add. - */ - - if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev) != 0) - /* Cannot proceed */ - goto not_running; - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev, NULL))) { - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - /* nothing to be done ... */ - goto not_running; - - if (mddev->pers->sync_request) { - if (spares) { - /* We are adding a device or devices to an array - * which has the bitmap stored on all devices. - * So make sure all bitmap pages get written - */ - md_bitmap_write_all(mddev->bitmap); - } - INIT_WORK(&mddev->del_work, md_start_sync); - queue_work(md_misc_wq, &mddev->del_work); - goto unlock; - } - not_running: - if (!mddev->sync_thread) { + if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + queue_work(md_misc_wq, &mddev->sync_work); + } else { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); } + unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); |