author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:35:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:39:31 +0000 |
commit | 85c675d0d09a45a135bddd15d7b385f8758c32fb (patch) | |
tree | 76267dbc9b9a130337be3640948fe397b04ac629 /drivers/md | |
parent | Adding upstream version 6.6.15. (diff) | |
Adding upstream version 6.7.7. (upstream/6.7.7)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/md')
47 files changed, 1599 insertions, 1730 deletions
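Most of the bcache hunks in this patch apply one mechanical conversion: the driver's private closure library (drivers/md/bcache/closure.{c,h}) is deleted in favour of the shared <linux/closure.h>, so callbacks of the form `void fn(struct closure *cl)` become `CLOSURE_CALLBACK(fn)` and fetch their container with `closure_type()`, while direct calls pass `&cl->work` instead of `cl`. A minimal before/after sketch of that pattern, using hypothetical names rather than code from the patch:

```c
/*
 * Illustrative sketch only: "struct foo" and the foo_*() callbacks are
 * hypothetical; they mirror the conversion applied to bcache's real
 * closure callbacks in the hunks below.
 */
#include <linux/closure.h>
#include <linux/slab.h>

struct foo {
	struct closure	cl;
};

/* Before: the callback received the closure pointer directly. */
static void foo_done_old(struct closure *cl)
{
	struct foo *f = container_of(cl, struct foo, cl);

	kfree(f);
}

/*
 * After: CLOSURE_CALLBACK() declares the function as taking a
 * struct work_struct *, and closure_type() recovers both "cl" and
 * the enclosing object from it.
 */
static CLOSURE_CALLBACK(foo_done)
{
	closure_type(f, struct foo, cl);

	kfree(f);
}

/* Direct invocations change from foo_done(cl) to foo_done(&cl->work). */
```

The other recurring themes are similar API migrations visible in the hunks themselves: the bcache btree-cache and dm-bufio shrinkers move from an embedded `struct shrinker` with `register_shrinker()`/`unregister_shrinker()` to the allocated-shrinker API (`shrinker_alloc()`, `shrinker_register()`, `shrinker_free()`, with the owner kept in `private_data`), and bcache's device opens switch from `blkdev_get_by_path()`/`blkdev_put()` to the `struct bdev_handle` API (`bdev_open_by_path()`/`bdev_release()`).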
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e9..b2d10063d3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -4,6 +4,7 @@ config BCACHE tristate "Block device as cache" select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 + select CLOSURES help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. @@ -19,15 +20,6 @@ config BCACHE_DEBUG Enables extra debugging tools, allows expensive runtime checks to be turned on. -config BCACHE_CLOSURES_DEBUG - bool "Debug closures" - depends on BCACHE - select DEBUG_FS - help - Keeps all active closures in a linked list and provides a debugfs - interface to list them, which makes it possible to see asynchronous - operations that get stuck. - config BCACHE_ASYNC_REGISTRATION bool "Asynchronous device registration" depends on BCACHE diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 5b87e59676..054e8a33a7 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ +bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 83eb7f27db..6ae2329052 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ #include <linux/bio.h> +#include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mutex.h> @@ -192,7 +193,6 @@ #include "bcache_ondisk.h" #include "bset.h" #include "util.h" -#include "closure.h" struct bucket { atomic_t pin; @@ -300,6 +300,7 @@ struct cached_dev { struct list_head list; struct bcache_device disk; struct block_device *bdev; + struct bdev_handle *bdev_handle; struct cache_sb sb; struct cache_sb_disk *sb_disk; @@ -422,6 +423,7 @@ struct cache { struct kobject kobj; struct block_device *bdev; + struct bdev_handle *bdev_handle; struct task_struct *alloc_thread; @@ -542,7 +544,7 @@ struct cache_set { struct bio_set bio_split; /* For the btree cache */ - struct shrinker shrink; + struct shrinker *shrink; /* For the btree cache and anything allocation related */ struct mutex bucket_lock; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b709c2fde7..196cdacce3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) w->journal = NULL; } -static void btree_node_write_unlock(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_write_unlock) { - struct btree *b = container_of(cl, struct btree, io); + closure_type(b, struct btree, io); up(&b->io_mutex); } -static void __btree_node_write_done(struct closure *cl) +static CLOSURE_CALLBACK(__btree_node_write_done) { - struct btree *b = container_of(cl, struct btree, io); + closure_type(b, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); @@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl) closure_return_with_destructor(cl, btree_node_write_unlock); } -static void btree_node_write_done(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_write_done) { - struct btree *b = 
container_of(cl, struct btree, io); + closure_type(b, struct btree, io); bio_free_pages(b->bio); - __btree_node_write_done(cl); + __btree_node_write_done(&cl->work); } static void btree_node_write_endio(struct bio *bio) @@ -667,7 +667,7 @@ out_unlock: static unsigned long bch_mca_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; @@ -734,7 +734,7 @@ out: static unsigned long bch_mca_count(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; if (c->shrinker_disabled) return 0; @@ -752,8 +752,8 @@ void bch_btree_cache_free(struct cache_set *c) closure_init_stack(&cl); - if (c->shrink.list.next) - unregister_shrinker(&c->shrink); + if (c->shrink) + shrinker_free(c->shrink); mutex_lock(&c->bucket_lock); @@ -828,14 +828,19 @@ int bch_btree_cache_alloc(struct cache_set *c) c->verify_data = NULL; #endif - c->shrink.count_objects = bch_mca_count; - c->shrink.scan_objects = bch_mca_scan; - c->shrink.seeks = 4; - c->shrink.batch = c->btree_pages * 2; + c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid); + if (!c->shrink) { + pr_warn("bcache: %s: could not allocate shrinker\n", __func__); + return 0; + } + + c->shrink->count_objects = bch_mca_count; + c->shrink->scan_objects = bch_mca_scan; + c->shrink->seeks = 4; + c->shrink->batch = c->btree_pages * 2; + c->shrink->private_data = c; - if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) - pr_warn("bcache: %s: could not register shrinker\n", - __func__); + shrinker_register(c->shrink); return 0; } diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c deleted file mode 100644 index d8d9394a6b..0000000000 --- a/drivers/md/bcache/closure.c +++ /dev/null @@ -1,207 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Asynchronous refcounty things - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. 
- */ - -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/sched/debug.h> - -#include "closure.h" - -static inline void closure_put_after_sub(struct closure *cl, int flags) -{ - int r = flags & CLOSURE_REMAINING_MASK; - - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); - - if (!r) { - if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - atomic_set(&cl->remaining, - CLOSURE_REMAINING_INITIALIZER); - closure_queue(cl); - } else { - struct closure *parent = cl->parent; - closure_fn *destructor = cl->fn; - - closure_debug_destroy(cl); - - if (destructor) - destructor(cl); - - if (parent) - closure_put(parent); - } - } -} - -/* For clearing flags with the same atomic op as a put */ -void closure_sub(struct closure *cl, int v) -{ - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -} - -/* - * closure_put - decrement a closure's refcount - */ -void closure_put(struct closure *cl) -{ - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -} - -/* - * closure_wake_up - wake up all closures on a wait list, without memory barrier - */ -void __closure_wake_up(struct closure_waitlist *wait_list) -{ - struct llist_node *list; - struct closure *cl, *t; - struct llist_node *reverse = NULL; - - list = llist_del_all(&wait_list->list); - - /* We first reverse the list to preserve FIFO ordering and fairness */ - reverse = llist_reverse_order(list); - - /* Then do the wakeups */ - llist_for_each_entry_safe(cl, t, reverse, list) { - closure_set_waiting(cl, 0); - closure_sub(cl, CLOSURE_WAITING + 1); - } -} - -/** - * closure_wait - add a closure to a waitlist - * @waitlist: will own a ref on @cl, which will be released when - * closure_wake_up() is called on @waitlist. - * @cl: closure pointer. 
- * - */ -bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -{ - if (atomic_read(&cl->remaining) & CLOSURE_WAITING) - return false; - - closure_set_waiting(cl, _RET_IP_); - atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &waitlist->list); - - return true; -} - -struct closure_syncer { - struct task_struct *task; - int done; -}; - -static void closure_sync_fn(struct closure *cl) -{ - struct closure_syncer *s = cl->s; - struct task_struct *p; - - rcu_read_lock(); - p = READ_ONCE(s->task); - s->done = 1; - wake_up_process(p); - rcu_read_unlock(); -} - -void __sched __closure_sync(struct closure *cl) -{ - struct closure_syncer s = { .task = current }; - - cl->s = &s; - continue_at(cl, closure_sync_fn, NULL); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (s.done) - break; - schedule(); - } - - __set_current_state(TASK_RUNNING); -} - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -static LIST_HEAD(closure_list); -static DEFINE_SPINLOCK(closure_list_lock); - -void closure_debug_create(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_ALIVE; - - spin_lock_irqsave(&closure_list_lock, flags); - list_add(&cl->all, &closure_list); - spin_unlock_irqrestore(&closure_list_lock, flags); -} - -void closure_debug_destroy(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_DEAD; - - spin_lock_irqsave(&closure_list_lock, flags); - list_del(&cl->all); - spin_unlock_irqrestore(&closure_list_lock, flags); -} - -static struct dentry *closure_debug; - -static int debug_show(struct seq_file *f, void *data) -{ - struct closure *cl; - - spin_lock_irq(&closure_list_lock); - - list_for_each_entry(cl, &closure_list, all) { - int r = atomic_read(&cl->remaining); - - seq_printf(f, "%p: %pS -> %pS p %p r %i ", - cl, (void *) cl->ip, cl->fn, cl->parent, - r & CLOSURE_REMAINING_MASK); - - seq_printf(f, "%s%s\n", - test_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(&cl->work)) ? "Q" : "", - r & CLOSURE_RUNNING ? "R" : ""); - - if (r & CLOSURE_WAITING) - seq_printf(f, " W %pS\n", - (void *) cl->waiting_on); - - seq_printf(f, "\n"); - } - - spin_unlock_irq(&closure_list_lock); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(debug); - -void __init closure_debug_init(void) -{ - if (!IS_ERR_OR_NULL(bcache_debug)) - /* - * it is unnecessary to check return value of - * debugfs_create_file(), we should not care - * about this. - */ - closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_fops); -} -#endif - -MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h deleted file mode 100644 index c88cdc4ae4..0000000000 --- a/drivers/md/bcache/closure.h +++ /dev/null @@ -1,378 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CLOSURE_H -#define _LINUX_CLOSURE_H - -#include <linux/llist.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/workqueue.h> - -/* - * Closure is perhaps the most overused and abused term in computer science, but - * since I've been unable to come up with anything better you're stuck with it - * again. - * - * What are closures? - * - * They embed a refcount. The basic idea is they count "things that are in - * progress" - in flight bios, some other thread that's doing something else - - * anything you might want to wait on. 
- * - * The refcount may be manipulated with closure_get() and closure_put(). - * closure_put() is where many of the interesting things happen, when it causes - * the refcount to go to 0. - * - * Closures can be used to wait on things both synchronously and asynchronously, - * and synchronous and asynchronous use can be mixed without restriction. To - * wait synchronously, use closure_sync() - you will sleep until your closure's - * refcount hits 1. - * - * To wait asynchronously, use - * continue_at(cl, next_function, workqueue); - * - * passing it, as you might expect, the function to run when nothing is pending - * and the workqueue to run that function out of. - * - * continue_at() also, critically, requires a 'return' immediately following the - * location where this macro is referenced, to return to the calling function. - * There's good reason for this. - * - * To use safely closures asynchronously, they must always have a refcount while - * they are running owned by the thread that is running them. Otherwise, suppose - * you submit some bios and wish to have a function run when they all complete: - * - * foo_endio(struct bio *bio) - * { - * closure_put(cl); - * } - * - * closure_init(cl); - * - * do_stuff(); - * closure_get(cl); - * bio1->bi_endio = foo_endio; - * bio_submit(bio1); - * - * do_more_stuff(); - * closure_get(cl); - * bio2->bi_endio = foo_endio; - * bio_submit(bio2); - * - * continue_at(cl, complete_some_read, system_wq); - * - * If closure's refcount started at 0, complete_some_read() could run before the - * second bio was submitted - which is almost always not what you want! More - * importantly, it wouldn't be possible to say whether the original thread or - * complete_some_read()'s thread owned the closure - and whatever state it was - * associated with! - * - * So, closure_init() initializes a closure's refcount to 1 - and when a - * closure_fn is run, the refcount will be reset to 1 first. - * - * Then, the rule is - if you got the refcount with closure_get(), release it - * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount - * on a closure because you called closure_init() or you were run out of a - * closure - _always_ use continue_at(). Doing so consistently will help - * eliminate an entire class of particularly pernicious races. - * - * Lastly, you might have a wait list dedicated to a specific event, and have no - * need for specifying the condition - you just want to wait until someone runs - * closure_wake_up() on the appropriate wait list. In that case, just use - * closure_wait(). It will return either true or false, depending on whether the - * closure was already on a wait list or not - a closure can only be on one wait - * list at a time. - * - * Parents: - * - * closure_init() takes two arguments - it takes the closure to initialize, and - * a (possibly null) parent. - * - * If parent is non null, the new closure will have a refcount for its lifetime; - * a closure is considered to be "finished" when its refcount hits 0 and the - * function to run is null. Hence - * - * continue_at(cl, NULL, NULL); - * - * returns up the (spaghetti) stack of closures, precisely like normal return - * returns up the C stack. continue_at() with non null fn is better thought of - * as doing a tail call. - * - * All this implies that a closure should typically be embedded in a particular - * struct (which its refcount will normally control the lifetime of), and that - * struct can very much be thought of as a stack frame. 
- */ - -struct closure; -struct closure_syncer; -typedef void (closure_fn) (struct closure *); -extern struct dentry *bcache_debug; - -struct closure_waitlist { - struct llist_head list; -}; - -enum closure_state { - /* - * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by - * the thread that owns the closure, and cleared by the thread that's - * waking up the closure. - * - * The rest are for debugging and don't affect behaviour: - * - * CLOSURE_RUNNING: Set when a closure is running (i.e. by - * closure_init() and when closure_put() runs then next function), and - * must be cleared before remaining hits 0. Primarily to help guard - * against incorrect usage and accidentally transferring references. - * continue_at() and closure_return() clear it for you, if you're doing - * something unusual you can use closure_set_dead() which also helps - * annotate where references are being transferred. - */ - - CLOSURE_BITS_START = (1U << 26), - CLOSURE_DESTRUCTOR = (1U << 26), - CLOSURE_WAITING = (1U << 28), - CLOSURE_RUNNING = (1U << 30), -}; - -#define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) - -#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) - -struct closure { - union { - struct { - struct workqueue_struct *wq; - struct closure_syncer *s; - struct llist_node list; - closure_fn *fn; - }; - struct work_struct work; - }; - - struct closure *parent; - - atomic_t remaining; - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -#define CLOSURE_MAGIC_DEAD 0xc054dead -#define CLOSURE_MAGIC_ALIVE 0xc054a11e - - unsigned int magic; - struct list_head all; - unsigned long ip; - unsigned long waiting_on; -#endif -}; - -void closure_sub(struct closure *cl, int v); -void closure_put(struct closure *cl); -void __closure_wake_up(struct closure_waitlist *list); -bool closure_wait(struct closure_waitlist *list, struct closure *cl); -void __closure_sync(struct closure *cl); - -/** - * closure_sync - sleep until a closure a closure has nothing left to wait on - * - * Sleeps until the refcount hits 1 - the thread that's running the closure owns - * the last refcount. 
- */ -static inline void closure_sync(struct closure *cl) -{ - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) - __closure_sync(cl); -} - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -void closure_debug_init(void); -void closure_debug_create(struct closure *cl); -void closure_debug_destroy(struct closure *cl); - -#else - -static inline void closure_debug_init(void) {} -static inline void closure_debug_create(struct closure *cl) {} -static inline void closure_debug_destroy(struct closure *cl) {} - -#endif - -static inline void closure_set_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _THIS_IP_; -#endif -} - -static inline void closure_set_ret_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _RET_IP_; -#endif -} - -static inline void closure_set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} - -static inline void closure_set_stopped(struct closure *cl) -{ - atomic_sub(CLOSURE_RUNNING, &cl->remaining); -} - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic(); -} - -static inline void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - /** - * Changes made to closure, work_struct, or a couple of other structs - * may cause work.func not pointing to the right location. - */ - BUILD_BUG_ON(offsetof(struct closure, fn) - != offsetof(struct work_struct, func)); - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} - -/** - * closure_get - increment a closure's refcount - */ -static inline void closure_get(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); -#endif -} - -/** - * closure_init - Initialize a closure, setting the refcount to 1 - * @cl: closure to initialize - * @parent: parent of the new closure. cl will take a refcount on it for its - * lifetime; may be NULL. - */ -static inline void closure_init(struct closure *cl, struct closure *parent) -{ - memset(cl, 0, sizeof(struct closure)); - cl->parent = parent; - if (parent) - closure_get(parent); - - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - - closure_debug_create(cl); - closure_set_ip(cl); -} - -static inline void closure_init_stack(struct closure *cl) -{ - memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -} - -/** - * closure_wake_up - wake up all closures on a wait list, - * with memory barrier - */ -static inline void closure_wake_up(struct closure_waitlist *list) -{ - /* Memory barrier for the wait list */ - smp_mb(); - __closure_wake_up(list); -} - -/** - * continue_at - jump to another function with barrier - * - * After @cl is no longer waiting on anything (i.e. all outstanding refs have - * been dropped with closure_put()), it will resume execution at @fn running out - * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). - * - * This is because after calling continue_at() you no longer have a ref on @cl, - * and whatever @cl owns may be freed out from under you - a running closure fn - * has a ref on its own closure which continue_at() drops. - * - * Note you are expected to immediately return after using this macro. 
- */ -#define continue_at(_cl, _fn, _wq) \ -do { \ - set_closure_fn(_cl, _fn, _wq); \ - closure_sub(_cl, CLOSURE_RUNNING + 1); \ -} while (0) - -/** - * closure_return - finish execution of a closure - * - * This is used to indicate that @cl is finished: when all outstanding refs on - * @cl have been dropped @cl's ref on its parent closure (as passed to - * closure_init()) will be dropped, if one was specified - thus this can be - * thought of as returning to the parent closure. - */ -#define closure_return(_cl) continue_at((_cl), NULL, NULL) - -/** - * continue_at_nobarrier - jump to another function without barrier - * - * Causes @fn to be executed out of @cl, in @wq context (or called directly if - * @wq is NULL). - * - * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, - * thus it's not safe to touch anything protected by @cl after a - * continue_at_nobarrier(). - */ -#define continue_at_nobarrier(_cl, _fn, _wq) \ -do { \ - set_closure_fn(_cl, _fn, _wq); \ - closure_queue(_cl); \ -} while (0) - -/** - * closure_return_with_destructor - finish execution of a closure, - * with destructor - * - * Works like closure_return(), except @destructor will be called when all - * outstanding refs on @cl have been dropped; @destructor may be used to safely - * free the memory occupied by @cl, and it is called with the ref on the parent - * closure still held - so @destructor could safely return an item to a - * freelist protected by @cl's parent. - */ -#define closure_return_with_destructor(_cl, _destructor) \ -do { \ - set_closure_fn(_cl, _destructor, NULL); \ - closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -} while (0) - -/** - * closure_call - execute @fn out of a new, uninitialized closure - * - * Typically used when running out of one closure, and we want to run @fn - * asynchronously out of a new closure - @parent will then wait for @cl to - * finish. - */ -static inline void closure_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - closure_init(cl, parent); - continue_at_nobarrier(cl, fn, wq); -} - -#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c182c21de2..7ff14bd2fe 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -723,11 +723,11 @@ static void journal_write_endio(struct bio *bio) closure_put(&w->c->journal.io); } -static void journal_write(struct closure *cl); +static CLOSURE_CALLBACK(journal_write); -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct journal_write *w = (j->cur == j->w) ? 
&j->w[1] : &j->w[0]; @@ -736,19 +736,19 @@ static void journal_write_done(struct closure *cl) continue_at_nobarrier(cl, journal_write, bch_journal_wq); } -static void journal_write_unlock(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlock) __releases(&c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); c->journal.io_in_flight = 0; spin_unlock(&c->journal.lock); } -static void journal_write_unlocked(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlocked) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; @@ -823,12 +823,12 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write_done, NULL); } -static void journal_write(struct closure *cl) +static CLOSURE_CALLBACK(journal_write) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); spin_lock(&c->journal.lock); - journal_write_unlocked(cl); + journal_write_unlocked(&cl->work); } static void journal_try_write(struct cache_set *c) diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 9f32901fda..ebd500bdf0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) /* Moving GC - IO loop */ -static void moving_io_destructor(struct closure *cl) +static CLOSURE_CALLBACK(moving_io_destructor) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); kfree(io); } -static void write_moving_finish(struct closure *cl) +static CLOSURE_CALLBACK(write_moving_finish) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct bio *bio = &io->bio.bio; bio_free_pages(bio); @@ -89,9 +89,9 @@ static void moving_init(struct moving_io *io) bch_bio_map(bio, NULL); } -static void write_moving(struct closure *cl) +static CLOSURE_CALLBACK(write_moving) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct data_insert_op *op = &io->op; if (!op->status) { @@ -113,9 +113,9 @@ static void write_moving(struct closure *cl) continue_at(cl, write_moving_finish, op->wq); } -static void read_moving_submit(struct closure *cl) +static CLOSURE_CALLBACK(read_moving_submit) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct bio *bio = &io->bio.bio; bch_submit_bbio(bio, io->op.c, &io->w->key, 0); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a9b1f38962..83d112bd2b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,7 +25,7 @@ struct kmem_cache *bch_search_cache; -static void bch_data_insert_start(struct closure *cl); +static CLOSURE_CALLBACK(bch_data_insert_start); static unsigned int cache_mode(struct cached_dev *dc) { @@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bch_data_insert_keys(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_keys) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); atomic_t 
*journal_ref = NULL; struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; @@ -136,9 +136,9 @@ out: continue_at(cl, bch_data_insert_keys, op->wq); } -static void bch_data_insert_error(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_error) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl) op->insert_keys.top = dst; - bch_data_insert_keys(cl); + bch_data_insert_keys(&cl->work); } static void bch_data_insert_endio(struct bio *bio) @@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio) bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); } -static void bch_data_insert_start(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_start) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); struct bio *bio = op->bio, *n; if (op->bypass) @@ -305,16 +305,16 @@ err: * If op->bypass is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ -void bch_data_insert(struct closure *cl) +CLOSURE_CALLBACK(bch_data_insert) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass); bch_keylist_init(&op->insert_keys); bio_get(op->bio); - bch_data_insert_start(cl); + bch_data_insert_start(&cl->work); } /* @@ -575,9 +575,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return n == bio ? 
MAP_DONE : MAP_CONTINUE; } -static void cache_lookup(struct closure *cl) +static CLOSURE_CALLBACK(cache_lookup) { - struct search *s = container_of(cl, struct search, iop.cl); + closure_type(s, struct search, iop.cl); struct bio *bio = &s->bio.bio; struct cached_dev *dc; int ret; @@ -698,9 +698,9 @@ static void do_bio_hook(struct search *s, bio_cnt_set(bio, 3); } -static void search_free(struct closure *cl) +static CLOSURE_CALLBACK(search_free) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); atomic_dec(&s->iop.c->search_inflight); @@ -749,20 +749,20 @@ static inline struct search *search_alloc(struct bio *bio, /* Cached devices */ -static void cached_dev_bio_complete(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_bio_complete) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); cached_dev_put(dc); - search_free(cl); + search_free(&cl->work); } /* Process reads */ -static void cached_dev_read_error_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_error_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); @@ -770,12 +770,12 @@ static void cached_dev_read_error_done(struct closure *cl) if (s->iop.bio) bio_free_pages(s->iop.bio); - cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); } -static void cached_dev_read_error(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_error) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bio *bio = &s->bio.bio; /* @@ -801,9 +801,9 @@ static void cached_dev_read_error(struct closure *cl) continue_at(cl, cached_dev_read_error_done, NULL); } -static void cached_dev_cache_miss_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_cache_miss_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bcache_device *d = s->d; if (s->iop.replace_collision) @@ -812,13 +812,13 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.bio) bio_free_pages(s->iop.bio); - cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); closure_put(&d->cl); } -static void cached_dev_read_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* @@ -858,9 +858,9 @@ static void cached_dev_read_done(struct closure *cl) continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void cached_dev_read_done_bh(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_done_bh) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, @@ -955,13 +955,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) /* Process writes */ -static void cached_dev_write_complete(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_write_complete) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); up_read_non_owner(&dc->writeback_lock); - 
cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); } static void cached_dev_write(struct cached_dev *dc, struct search *s) @@ -1048,9 +1048,9 @@ insert_data: continue_at(cl, cached_dev_write_complete, NULL); } -static void cached_dev_nodata(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_nodata) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bio *bio = &s->bio.bio; if (s->iop.flush_journal) @@ -1265,9 +1265,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, return MAP_CONTINUE; } -static void flash_dev_nodata(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_nodata) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); if (s->iop.flush_journal) bch_journal_meta(s->iop.c, cl); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 38ab4856ea..46bbef00ae 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -34,7 +34,7 @@ struct data_insert_op { }; unsigned int bch_get_congested(const struct cache_set *c); -void bch_data_insert(struct closure *cl); +CLOSURE_CALLBACK(bch_data_insert); void bch_cached_dev_request_init(struct cached_dev *dc); void cached_dev_submit_bio(struct bio *bio); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1e677af385..1402096b80 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -327,9 +327,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, submit_bio(bio); } -static void bch_write_bdev_super_unlock(struct closure *cl) +static CLOSURE_CALLBACK(bch_write_bdev_super_unlock) { - struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + closure_type(dc, struct cached_dev, sb_write); up(&dc->sb_write_mutex); } @@ -363,9 +363,9 @@ static void write_super_endio(struct bio *bio) closure_put(&ca->set->sb_write); } -static void bcache_write_super_unlock(struct closure *cl) +static CLOSURE_CALLBACK(bcache_write_super_unlock) { - struct cache_set *c = container_of(cl, struct cache_set, sb_write); + closure_type(c, struct cache_set, sb_write); up(&c->sb_write_mutex); } @@ -407,9 +407,9 @@ static void uuid_endio(struct bio *bio) closure_put(cl); } -static void uuid_io_unlock(struct closure *cl) +static CLOSURE_CALLBACK(uuid_io_unlock) { - struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + closure_type(c, struct cache_set, uuid_write); up(&c->uuid_write_mutex); } @@ -1344,9 +1344,9 @@ void bch_cached_dev_release(struct kobject *kobj) module_put(THIS_MODULE); } -static void cached_dev_free(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_free) { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + closure_type(dc, struct cached_dev, disk.cl); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1370,17 +1370,17 @@ static void cached_dev_free(struct closure *cl) if (dc->sb_disk) put_page(virt_to_page(dc->sb_disk)); - if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, dc); + if (dc->bdev_handle) + bdev_release(dc->bdev_handle); wake_up(&unregister_wait); kobject_put(&dc->disk.kobj); } -static void cached_dev_flush(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_flush) { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + closure_type(dc, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); @@ -1446,7 +1446,7 @@ static int 
cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct block_device *bdev, + struct bdev_handle *bdev_handle, struct cached_dev *dc) { const char *err = "cannot allocate memory"; @@ -1454,14 +1454,15 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, int ret = -ENOMEM; memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->bdev = bdev; + dc->bdev_handle = bdev_handle; + dc->bdev = bdev_handle->bdev; dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(dc->bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; @@ -1498,9 +1499,9 @@ void bch_flash_dev_release(struct kobject *kobj) kfree(d); } -static void flash_dev_free(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_free) { - struct bcache_device *d = container_of(cl, struct bcache_device, cl); + closure_type(d, struct bcache_device, cl); mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), @@ -1511,9 +1512,9 @@ static void flash_dev_free(struct closure *cl) kobject_put(&d->kobj); } -static void flash_dev_flush(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_flush) { - struct bcache_device *d = container_of(cl, struct bcache_device, cl); + closure_type(d, struct bcache_device, cl); mutex_lock(&bch_register_lock); bcache_device_unlink(d); @@ -1669,9 +1670,9 @@ void bch_cache_set_release(struct kobject *kobj) module_put(THIS_MODULE); } -static void cache_set_free(struct closure *cl) +static CLOSURE_CALLBACK(cache_set_free) { - struct cache_set *c = container_of(cl, struct cache_set, cl); + closure_type(c, struct cache_set, cl); struct cache *ca; debugfs_remove(c->debug); @@ -1710,9 +1711,9 @@ static void cache_set_free(struct closure *cl) kobject_put(&c->kobj); } -static void cache_set_flush(struct closure *cl) +static CLOSURE_CALLBACK(cache_set_flush) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + closure_type(c, struct cache_set, caching); struct cache *ca = c->cache; struct btree *b; @@ -1807,9 +1808,9 @@ static void conditional_stop_bcache_device(struct cache_set *c, } } -static void __cache_set_unregister(struct closure *cl) +static CLOSURE_CALLBACK(__cache_set_unregister) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + closure_type(c, struct cache_set, caching); struct cached_dev *dc; struct bcache_device *d; size_t i; @@ -2218,8 +2219,8 @@ void bch_cache_release(struct kobject *kobj) if (ca->sb_disk) put_page(virt_to_page(ca->sb_disk)); - if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, ca); + if (ca->bdev_handle) + bdev_release(ca->bdev_handle); kfree(ca); module_put(THIS_MODULE); @@ -2339,38 +2340,42 @@ err_free: } static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct block_device *bdev, struct cache *ca) + struct bdev_handle *bdev_handle, + struct cache *ca) { const char *err = NULL; /* must be set for any error case */ int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - ca->bdev = bdev; + ca->bdev_handle = bdev_handle; + ca->bdev = bdev_handle->bdev; ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors((bdev))) + if (bdev_max_discard_sectors((bdev_handle->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); ret = cache_alloc(ca); if 
(ret != 0) { - /* - * If we failed here, it means ca->kobj is not initialized yet, - * kobject_put() won't be called and there is no chance to - * call blkdev_put() to bdev in bch_cache_release(). So we - * explicitly call blkdev_put() here. - */ - blkdev_put(bdev, ca); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) err = "cache_alloc(): cache device is too small"; else err = "cache_alloc(): unknown error"; - goto err; + pr_notice("error %pg: %s\n", bdev_handle->bdev, err); + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call bdev_release() to bdev in bch_cache_release(). So + * we explicitly call bdev_release() here. + */ + bdev_release(bdev_handle); + return ret; } - if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { - err = "error calling kobject_add"; + if (kobject_add(&ca->kobj, bdev_kobj(bdev_handle->bdev), "bcache")) { + pr_notice("error %pg: error calling kobject_add\n", + bdev_handle->bdev); ret = -ENOMEM; goto out; } @@ -2384,15 +2389,10 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto out; } - pr_info("registered cache device %pg\n", ca->bdev); + pr_info("registered cache device %pg\n", ca->bdev_handle->bdev); out: kobject_put(&ca->kobj); - -err: - if (err) - pr_notice("error %pg: %s\n", ca->bdev, err); - return ret; } @@ -2447,7 +2447,7 @@ struct async_reg_args { char *path; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev; + struct bdev_handle *bdev_handle; void *holder; }; @@ -2458,8 +2458,8 @@ static void register_bdev_worker(struct work_struct *work) container_of(work, struct async_reg_args, reg_work.work); mutex_lock(&bch_register_lock); - if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder) - < 0) + if (register_bdev(args->sb, args->sb_disk, args->bdev_handle, + args->holder) < 0) fail = true; mutex_unlock(&bch_register_lock); @@ -2479,7 +2479,8 @@ static void register_cache_worker(struct work_struct *work) container_of(work, struct async_reg_args, reg_work.work); /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder)) + if (register_cache(args->sb, args->sb_disk, args->bdev_handle, + args->holder)) fail = true; if (fail) @@ -2516,7 +2517,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, char *path = NULL; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev, *bdev2; + struct bdev_handle *bdev_handle, *bdev_handle2; void *holder = NULL; ssize_t ret; bool async_registration = false; @@ -2549,15 +2550,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(bdev)) + bdev_handle = bdev_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(bdev_handle)) goto out_free_sb; err = "failed to set blocksize"; - if (set_blocksize(bdev, 4096)) + if (set_blocksize(bdev_handle->bdev, 4096)) goto out_blkdev_put; - err = read_super(sb, bdev, &sb_disk); + err = read_super(sb, bdev_handle->bdev, &sb_disk); if (err) goto out_blkdev_put; @@ -2569,13 +2570,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, } /* Now reopen in exclusive mode with proper holder */ - bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, - holder, NULL); - 
blkdev_put(bdev, NULL); - bdev = bdev2; - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - bdev = NULL; + bdev_handle2 = bdev_open_by_dev(bdev_handle->bdev->bd_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL); + bdev_release(bdev_handle); + bdev_handle = bdev_handle2; + if (IS_ERR(bdev_handle)) { + ret = PTR_ERR(bdev_handle); + bdev_handle = NULL; if (ret == -EBUSY) { dev_t dev; @@ -2610,7 +2611,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, args->path = path; args->sb = sb; args->sb_disk = sb_disk; - args->bdev = bdev; + args->bdev_handle = bdev_handle; args->holder = holder; register_device_async(args); /* No wait and returns to user space */ @@ -2619,14 +2620,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_disk, bdev, holder); + ret = register_bdev(sb, sb_disk, bdev_handle, holder); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto out_free_sb; } else { /* blkdev_put() will be called in bch_cache_release() */ - ret = register_cache(sb, sb_disk, bdev, holder); + ret = register_cache(sb, sb_disk, bdev_handle, holder); if (ret) goto out_free_sb; } @@ -2642,8 +2643,8 @@ out_free_holder: out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out_free_sb: kfree(sb); out_free_path: @@ -2907,7 +2908,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); - closure_debug_init(); bcache_is_reboot = false; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 18ac98dc89..a438efb660 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -866,7 +866,8 @@ STORE(__bch_cache_set) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->shrink.scan_objects(&c->shrink, &sc); + if (c->shrink) + c->shrink->scan_objects(c->shrink, &sc); } sysfs_strtoul_clamp(congested_read_threshold_us, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 6f3cb7c921..f61ab1bada 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ #define _BCACHE_UTIL_H #include <linux/blkdev.h> +#include <linux/closure.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/clock.h> @@ -13,8 +14,6 @@ #include <linux/workqueue.h> #include <linux/crc64.h> -#include "closure.h" - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3accfdaee6..8827a6f130 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -341,16 +341,16 @@ static void dirty_init(struct keybuf_key *w) bch_bio_map(bio, NULL); } -static void dirty_io_destructor(struct closure *cl) +static CLOSURE_CALLBACK(dirty_io_destructor) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); kfree(io); } -static void write_dirty_finish(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty_finish) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio) closure_put(&io->cl); } -static void write_dirty(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty) { - struct dirty_io *io = container_of(cl, struct dirty_io, 
cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio) dirty_endio(bio); } -static void read_dirty_submit(struct closure *cl) +static CLOSURE_CALLBACK(read_dirty_submit) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); closure_bio_submit(io->dc->disk.c, &io->bio, cl); diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index 92afdca760..9ab32abe5e 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -26,7 +26,7 @@ struct prison_region { struct dm_bio_prison { mempool_t cell_pool; unsigned int num_locks; - struct prison_region regions[]; + struct prison_region regions[] __counted_by(num_locks); }; static struct kmem_cache *_cell_cache; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 486e1180cc..f03d7dba27 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1001,7 +1001,7 @@ struct dm_bufio_client { sector_t start; - struct shrinker shrinker; + struct shrinker *shrinker; struct work_struct shrink_work; atomic_long_t need_shrink; @@ -2405,7 +2405,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink { struct dm_bufio_client *c; - c = container_of(shrink, struct dm_bufio_client, shrinker); + c = shrink->private_data; atomic_long_add(sc->nr_to_scan, &c->need_shrink); queue_work(dm_bufio_wq, &c->shrink_work); @@ -2414,7 +2414,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); + struct dm_bufio_client *c = shrink->private_data; unsigned long count = cache_total(&c->cache); unsigned long retain_target = get_retain_buffers(c); unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); @@ -2527,14 +2527,20 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_WORK(&c->shrink_work, shrink_work); atomic_long_set(&c->need_shrink, 0); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (r) + c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (!c->shrinker) { + r = -ENOMEM; goto bad; + } + + c->shrinker->count_objects = dm_bufio_shrink_count; + c->shrinker->scan_objects = dm_bufio_shrink_scan; + c->shrinker->seeks = 1; + c->shrinker->batch = 0; + c->shrinker->private_data = c; + + shrinker_register(c->shrinker); mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; @@ -2574,7 +2580,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); - unregister_shrinker(&c->shrinker); + shrinker_free(c->shrinker); flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index acffed750e..96751cd3d1 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -597,7 +597,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); cmd->data_block_size = 
le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); - strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + strscpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); @@ -707,7 +707,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); - strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + strscpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); @@ -1726,7 +1726,7 @@ static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + strscpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); hint_size = dm_cache_policy_get_hint_size(policy); @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). 
- * - must take shrinker_rwsem without holding cmd->root_lock + * - must take shrinker_mutex without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 095b9b49aa..e6757a30dc 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -22,6 +22,8 @@ #include "dm-ima.h" #define DM_RESERVED_MAX_IOS 1024 +#define DM_MAX_TARGETS 1048576 +#define DM_MAX_TARGET_PARAMS 1024 struct dm_io; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 17ffbf7fbe..4ab4e8dcfd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -62,6 +62,8 @@ struct convert_context { struct skcipher_request *req; struct aead_request *req_aead; } r; + bool aead_recheck; + bool aead_failed; }; @@ -73,10 +75,8 @@ struct dm_crypt_io { struct bio *base_bio; u8 *integrity_metadata; bool integrity_metadata_from_pool:1; - bool in_tasklet:1; struct work_struct work; - struct tasklet_struct tasklet; struct convert_context ctx; @@ -84,6 +84,8 @@ struct dm_crypt_io { blk_status_t error; sector_t sector; + struct bvec_iter saved_bi_iter; + struct rb_node rb_node; } CRYPTO_MINALIGN_ATTR; @@ -224,7 +226,7 @@ struct crypt_config { struct mutex bio_alloc_lock; u8 *authenc_key; /* space for keys in authenc() format (if used) */ - u8 key[]; + u8 key[] __counted_by(key_size); }; #define MIN_IOS 64 @@ -652,13 +654,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, /* calculate crc32 for every 32bit part and xor it */ desc->tfm = tcw->crc32_tfm; for (i = 0; i < 4; i++) { - r = crypto_shash_init(desc); - if (r) - goto out; - r = crypto_shash_update(desc, &buf[i * 4], 4); - if (r) - goto out; - r = crypto_shash_final(desc, &buf[i * 4]); + r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); if (r) goto out; } @@ -1378,10 +1374,13 @@ static int crypt_convert_block_aead(struct crypt_config *cc, if (r == -EBADMSG) { sector_t s = le64_to_cpu(*sector); - DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", - ctx->bio_in->bi_bdev, s); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", - ctx->bio_in, s, 0); + ctx->aead_failed = true; + if (ctx->aead_recheck) { + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + } } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) @@ -1765,10 +1764,11 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->base_bio = bio; io->sector = sector; io->error = 0; + io->ctx.aead_recheck = false; + io->ctx.aead_failed = false; io->ctx.r.req = NULL; io->integrity_metadata = NULL; io->integrity_metadata_from_pool = false; - io->in_tasklet = false; atomic_set(&io->io_pending, 0); } @@ -1777,12 +1777,7 @@ static void crypt_inc_pending(struct dm_crypt_io *io) atomic_inc(&io->io_pending); } -static void kcryptd_io_bio_endio(struct work_struct *work) -{ - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - - bio_endio(io->base_bio); -} +static void kcryptd_queue_read(struct dm_crypt_io *io); /* * One of the bios was finished. 
Check for completion of @@ -1797,6 +1792,15 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; + if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) && + cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) { + io->ctx.aead_recheck = true; + io->ctx.aead_failed = false; + io->error = 0; + kcryptd_queue_read(io); + return; + } + if (io->ctx.r.req) crypt_free_req(cc, io->ctx.r.req, base_bio); @@ -1807,20 +1811,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io) base_bio->bi_status = error; - /* - * If we are running this function from our tasklet, - * we can't call bio_endio() here, because it will call - * clone_endio() from dm.c, which in turn will - * free the current struct dm_crypt_io structure with - * our tasklet. In this case we need to delay bio_endio() - * execution to after the tasklet is done and dequeued. - */ - if (io->in_tasklet) { - INIT_WORK(&io->work, kcryptd_io_bio_endio); - queue_work(cc->io_queue, &io->work); - return; - } - bio_endio(base_bio); } @@ -1846,15 +1836,19 @@ static void crypt_endio(struct bio *clone) struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned int rw = bio_data_dir(clone); - blk_status_t error; + blk_status_t error = clone->bi_status; + + if (io->ctx.aead_recheck && !error) { + kcryptd_queue_crypt(io); + return; + } /* * free the processed pages */ - if (rw == WRITE) + if (rw == WRITE || io->ctx.aead_recheck) crypt_free_buffer_pages(cc, clone); - error = clone->bi_status; bio_put(clone); if (rw == READ && !error) { @@ -1875,6 +1869,22 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) struct crypt_config *cc = io->cc; struct bio *clone; + if (io->ctx.aead_recheck) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) + return 1; + crypt_inc_pending(io); + clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); + if (unlikely(!clone)) { + crypt_dec_pending(io); + return 1; + } + clone->bi_iter.bi_sector = cc->start + io->sector; + crypt_convert_init(cc, &io->ctx, clone, clone, io->sector); + io->saved_bi_iter = clone->bi_iter; + dm_submit_bio_remap(io->base_bio, clone); + return 0; + } + /* * We need the original biovec array in order to decrypt the whole bio * data *afterwards* -- thanks to immutable biovecs we don't need to @@ -2101,6 +2111,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) io->ctx.bio_out = clone; io->ctx.iter_out = clone->bi_iter; + if (crypt_integrity_aead(cc)) { + bio_copy_data(clone, io->base_bio); + io->ctx.bio_in = clone; + io->ctx.iter_in = clone->bi_iter; + } + sector += bio_sectors(clone); crypt_inc_pending(io); @@ -2137,6 +2153,14 @@ dec: static void kcryptd_crypt_read_done(struct dm_crypt_io *io) { + if (io->ctx.aead_recheck) { + if (!io->error) { + io->ctx.bio_in->bi_iter = io->saved_bi_iter; + bio_copy_data(io->base_bio, io->ctx.bio_in); + } + crypt_free_buffer_pages(io->cc, io->ctx.bio_in); + bio_put(io->ctx.bio_in); + } crypt_dec_pending(io); } @@ -2166,11 +2190,17 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, - io->sector); + if (io->ctx.aead_recheck) { + io->ctx.cc_sector = io->sector + cc->iv_offset; + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + } else { + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); - r = crypt_convert(cc, &io->ctx, - test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); 
+ r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + } /* * Crypto API backlogged the request, because its queue was full * and we're in softirq context, so continue from a workqueue @@ -2212,10 +2242,13 @@ static void kcryptd_async_done(void *data, int error) if (error == -EBADMSG) { sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); - DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", - ctx->bio_in->bi_bdev, s); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", - ctx->bio_in, s, 0); + ctx->aead_failed = true; + if (ctx->aead_recheck) { + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + } io->error = BLK_STS_PROTECTION; } else if (error < 0) io->error = BLK_STS_IOERR; @@ -2252,11 +2285,6 @@ static void kcryptd_crypt(struct work_struct *work) kcryptd_crypt_write_convert(io); } -static void kcryptd_crypt_tasklet(unsigned long work) -{ - kcryptd_crypt((struct work_struct *)work); -} - static void kcryptd_queue_crypt(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; @@ -2268,15 +2296,10 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) * irqs_disabled(): the kernel may run some IO completion from the idle thread, but * it is being executed with irqs disabled. */ - if (in_hardirq() || irqs_disabled()) { - io->in_tasklet = true; - tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); - tasklet_schedule(&io->tasklet); + if (!(in_hardirq() || irqs_disabled())) { + kcryptd_crypt(&io->work); return; } - - kcryptd_crypt(&io->work); - return; } INIT_WORK(&io->work, kcryptd_crypt); @@ -2868,10 +2891,9 @@ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) if (!start || !end || ++start > end) return -EINVAL; - mac_alg = kzalloc(end - start + 1, GFP_KERNEL); + mac_alg = kmemdup_nul(start, end - start, GFP_KERNEL); if (!mac_alg) return -ENOMEM; - strncpy(mac_alg, start, end - start); mac = crypto_alloc_ahash(mac_alg, 0, CRYPTO_ALG_ALLOCATES_MEMORY); kfree(mac_alg); @@ -3151,7 +3173,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar sval = strchr(opt_string + strlen("integrity:"), ':') + 1; if (!strcasecmp(sval, "aead")) { set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); - } else if (strcasecmp(sval, "none")) { + } else if (strcasecmp(sval, "none")) { ti->error = "Unknown integrity profile"; return -EINVAL; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 3726fae300..5eabdb06c6 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/slab.h> +#include <linux/kthread.h> #include <linux/device-mapper.h> @@ -31,6 +32,7 @@ struct delay_c { struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; + struct task_struct *worker; bool may_delay; struct delay_class read; @@ -66,6 +68,11 @@ static void queue_timeout(struct delay_c *dc, unsigned long expires) mutex_unlock(&dc->timer_lock); } +static inline bool delay_is_fast(struct delay_c *dc) +{ + return !!dc->worker; +} + static void flush_bios(struct bio *bio) { struct bio *n; @@ -78,36 +85,61 @@ static void flush_bios(struct bio *bio) } } -static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; + struct bio_list 
flush_bio_list; unsigned long next_expires = 0; - unsigned long start_timer = 0; - struct bio_list flush_bios = { }; + bool start_timer = false; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + cond_resched(); if (flush_all || time_after_eq(jiffies, delayed->expires)) { struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, bio); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; continue; } - if (!start_timer) { - start_timer = 1; - next_expires = delayed->expires; - } else - next_expires = min(next_expires, delayed->expires); + if (!delay_is_fast(dc)) { + if (!start_timer) { + start_timer = true; + next_expires = delayed->expires; + } else { + next_expires = min(next_expires, delayed->expires); + } + } } mutex_unlock(&delayed_bios_lock); if (start_timer) queue_timeout(dc, next_expires); - return bio_list_get(&flush_bios); + flush_bios(bio_list_get(&flush_bio_list)); +} + +static int flush_worker_fn(void *data) +{ + struct delay_c *dc = data; + + while (!kthread_should_stop()) { + flush_delayed_bios(dc, false); + mutex_lock(&delayed_bios_lock); + if (unlikely(list_empty(&dc->delayed_bios))) { + set_current_state(TASK_INTERRUPTIBLE); + mutex_unlock(&delayed_bios_lock); + schedule(); + } else { + mutex_unlock(&delayed_bios_lock); + cond_resched(); + } + } + + return 0; } static void flush_expired_bios(struct work_struct *work) @@ -115,7 +147,7 @@ static void flush_expired_bios(struct work_struct *work) struct delay_c *dc; dc = container_of(work, struct delay_c, flush_expired_bios); - flush_bios(flush_delayed_bios(dc, 0)); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -131,6 +163,8 @@ static void delay_dtr(struct dm_target *ti) dm_put_device(ti, dc->write.dev); if (dc->flush.dev) dm_put_device(ti, dc->flush.dev); + if (dc->worker) + kthread_stop(dc->worker); mutex_destroy(&dc->timer_lock); @@ -175,6 +209,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; int ret; + unsigned int max_delay; if (argc != 3 && argc != 6 && argc != 9) { ti->error = "Requires exactly 3, 6 or 9 arguments"; @@ -188,8 +223,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = dc; - timer_setup(&dc->delay_timer, handle_delayed_timer, 0); - INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); INIT_LIST_HEAD(&dc->delayed_bios); mutex_init(&dc->timer_lock); dc->may_delay = true; @@ -198,6 +231,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->read, argv); if (ret) goto bad; + max_delay = dc->read.delay; if (argc == 3) { ret = delay_class_ctr(ti, &dc->write, argv); @@ -206,6 +240,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->flush, argv); if (ret) goto bad; + max_delay = max(max_delay, dc->write.delay); + max_delay = max(max_delay, dc->flush.delay); goto out; } @@ -216,19 +252,37 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->flush, argv + 3); if (ret) goto bad; + max_delay = max(max_delay, dc->flush.delay); goto out; } ret = delay_class_ctr(ti, &dc->flush, argv + 6); if (ret) goto bad; + max_delay = max(max_delay, dc->flush.delay); out: - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!dc->kdelayd_wq) { - ret = 
-EINVAL; - DMERR("Couldn't start kdelayd"); - goto bad; + if (max_delay < 50) { + /* + * In case of small requested delays, use kthread instead of + * timers and workqueue to achieve better latency. + */ + dc->worker = kthread_create(&flush_worker_fn, dc, + "dm-delay-flush-worker"); + if (IS_ERR(dc->worker)) { + ret = PTR_ERR(dc->worker); + dc->worker = NULL; + goto bad; + } + } else { + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + ret = -EINVAL; + DMERR("Couldn't start kdelayd"); + goto bad; + } } ti->num_flush_bios = 1; @@ -264,7 +318,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) list_add_tail(&delayed->list, &dc->delayed_bios); mutex_unlock(&delayed_bios_lock); - queue_timeout(dc, expires); + if (delay_is_fast(dc)) + wake_up_process(dc->worker); + else + queue_timeout(dc, expires); return DM_MAPIO_SUBMITTED; } @@ -277,8 +334,9 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - del_timer_sync(&dc->delay_timer); - flush_bios(flush_delayed_bios(dc, 1)); + if (!delay_is_fast(dc)) + del_timer_sync(&dc->delay_timer); + flush_delayed_bios(dc, true); } static void delay_resume(struct dm_target *ti) @@ -363,7 +421,7 @@ out: static struct target_type delay_target = { .name = "delay", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = delay_ctr, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 120153e44a..f57fb82152 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b remaining_size = size; - order = MAX_ORDER - 1; + order = MAX_ORDER; while (remaining_size) { struct page *pages; unsigned size_to_add, to_copy; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 9261bbebd6..e8e8fc33d3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -278,6 +278,8 @@ struct dm_integrity_c { atomic64_t number_of_mismatches; + mempool_t recheck_pool; + struct notifier_block reboot_notifier; }; @@ -493,42 +495,32 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) { SHASH_DESC_ON_STACK(desc, ic->journal_mac); int r; - unsigned int size = crypto_shash_digestsize(ic->journal_mac); + unsigned int mac_size = crypto_shash_digestsize(ic->journal_mac); + __u8 *sb = (__u8 *)ic->sb; + __u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size; - if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) { + if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT) { dm_integrity_io_error(ic, "digest is too long", -EINVAL); return -EINVAL; } desc->tfm = ic->journal_mac; - r = crypto_shash_init(desc); - if (unlikely(r < 0)) { - dm_integrity_io_error(ic, "crypto_shash_init", r); - return r; - } - - r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size); - if (unlikely(r < 0)) { - dm_integrity_io_error(ic, "crypto_shash_update", r); - return r; - } - if (likely(wr)) { - r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size); + r = crypto_shash_digest(desc, sb, mac - sb, mac); if (unlikely(r < 0)) { - dm_integrity_io_error(ic, "crypto_shash_final", r); + dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } } else { - __u8 result[HASH_MAX_DIGESTSIZE]; + __u8 
actual_mac[HASH_MAX_DIGESTSIZE]; - r = crypto_shash_final(desc, result); + r = crypto_shash_digest(desc, sb, mac - sb, actual_mac); if (unlikely(r < 0)) { - dm_integrity_io_error(ic, "crypto_shash_final", r); + dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) { + if (memcmp(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -1699,6 +1691,77 @@ failed: get_random_bytes(result, ic->tag_size); } +static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + struct dm_integrity_c *ic = dio->ic; + struct bvec_iter iter; + struct bio_vec bv; + sector_t sector, logical_sector, area, offset; + struct page *page; + void *buffer; + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, + &dio->metadata_offset); + sector = get_data_sector(ic, area, offset); + logical_sector = dio->range.logical_sector; + + page = mempool_alloc(&ic->recheck_pool, GFP_NOIO); + buffer = page_to_virt(page); + + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { + unsigned pos = 0; + + do { + char *mem; + int r; + struct dm_io_request io_req; + struct dm_io_region io_loc; + io_req.bi_opf = REQ_OP_READ; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = sector; + io_loc.count = ic->sectors_per_block; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dio->bi_status = errno_to_blk_status(r); + goto free_ret; + } + + integrity_sector_checksum(ic, logical_sector, buffer, checksum); + r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, + &dio->metadata_offset, ic->tag_size, TAG_CMP); + if (r) { + if (r > 0) { + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + bio->bi_bdev, logical_sector); + atomic64_inc(&ic->number_of_mismatches); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", + bio, logical_sector, 0); + r = -EILSEQ; + } + dio->bi_status = errno_to_blk_status(r); + goto free_ret; + } + + mem = bvec_kmap_local(&bv); + memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + kunmap_local(mem); + + pos += ic->sectors_per_block << SECTOR_SHIFT; + sector += ic->sectors_per_block; + logical_sector += ic->sectors_per_block; + } while (pos < bv.bv_len); + } +free_ret: + mempool_free(page, &ic->recheck_pool); +} + static void integrity_metadata(struct work_struct *w) { struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); @@ -1786,15 +1849,8 @@ again: checksums_ptr - checksums, dio->op == REQ_OP_READ ? 
TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - sector_t s; - - s = sector - ((r + ic->tag_size - 1) / ic->tag_size); - DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", - bio->bi_bdev, s); - r = -EILSEQ; - atomic64_inc(&ic->number_of_mismatches); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", - bio, s, 0); + integrity_recheck(dio, checksums); + goto skip_io; } if (likely(checksums != checksums_onstack)) kfree(checksums); @@ -4271,6 +4327,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } + r = mempool_init_page_pool(&ic->recheck_pool, 1, 0); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->metadata_wq) { @@ -4619,6 +4681,7 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_exit(&ic->recheck_pool); mempool_exit(&ic->journal_io_mempool); if (ic->io) dm_io_client_destroy(ic->io); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 21ebb6c393..3b1ad7127c 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1295,8 +1295,8 @@ static void retrieve_status(struct dm_table *table, spec->status = 0; spec->sector_start = ti->begin; spec->length = ti->len; - strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type) - 1); + strscpy_pad(spec->target_type, ti->type->name, + sizeof(spec->target_type)); outptr += sizeof(struct dm_target_spec); remaining = len - (outptr - outbuf); @@ -1941,7 +1941,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern minimum_data_size - sizeof(param_kernel->version))) return -EFAULT; - if (param_kernel->data_size < minimum_data_size) { + if (unlikely(param_kernel->data_size < minimum_data_size) || + unlikely(param_kernel->data_size > DM_MAX_TARGETS * DM_MAX_TARGET_PARAMS)) { DMERR("Invalid data size in the ioctl structure: %u", param_kernel->data_size); return -EINVAL; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index f4448d520e..2d3e186ca8 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -85,7 +85,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) return lc->start + dm_target_offset(ti, bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int linear_map(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 5aace6ee6d..7e4f27e861 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -224,7 +224,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, lc->usr_argc = argc; - strncpy(lc->uuid, argv[0], DM_UUID_LEN); + strscpy(lc->uuid, argv[0], sizeof(lc->uuid)); argc--; argv++; spin_lock_init(&lc->flush_lock); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5f9991765f..eb009d6bb0 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -254,7 +254,7 @@ struct raid_set { int mode; } journal_dev; - struct raid_dev dev[]; + struct raid_dev dev[] __counted_by(raid_disks); }; static void rs_config_backup(struct raid_set *rs, struct rs_layout *l) @@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return ERR_PTR(-ENOMEM); } - mddev_init(&rs->md); + if (mddev_init(&rs->md)) { + kfree(rs); + ti->error = 
"Cannot initialize raid context"; + return ERR_PTR(-ENOMEM); + } rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + mddev_destroy(&rs->md); kfree(rs); } @@ -3239,7 +3244,7 @@ size_check: set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); /* Has to be held on running the array */ - mddev_lock_nointr(&rs->md); + mddev_suspend_and_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3263,7 +3268,6 @@ size_check: } } - mddev_suspend(&rs->md); set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3313,6 +3317,9 @@ static void raid_dtr(struct dm_target *ti) mddev_lock_nointr(&rs->md); md_stop(&rs->md); mddev_unlock(&rs->md); + + if (work_pending(&rs->md.event_work)) + flush_work(&rs->md.event_work); raid_set_free(rs); } @@ -3793,9 +3800,7 @@ static void raid_postsuspend(struct dm_target *ti) if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) md_stop_writes(&rs->md); - mddev_lock_nointr(&rs->md); - mddev_suspend(&rs->md); - mddev_unlock(&rs->md); + mddev_suspend(&rs->md, false); } } @@ -4054,8 +4059,7 @@ static void raid_resume(struct dm_target *ti) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; - mddev_resume(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } } diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index db2d997a6c..bdc14ec998 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -56,7 +56,7 @@ struct dm_stat { size_t percpu_alloc_size; size_t histogram_alloc_size; struct dm_stat_percpu *stat_percpu[NR_CPUS]; - struct dm_stat_shared stat_shared[]; + struct dm_stat_shared stat_shared[] __counted_by(n_entries); }; #define STAT_PRECISE_TIMESTAMPS 1 diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e2854a3cbd..16b93ae51d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -44,7 +44,7 @@ struct stripe_c { /* Work struct used for triggering events*/ struct work_struct trigger_event; - struct stripe stripe[]; + struct stripe stripe[] __counted_by(stripes); }; /* @@ -268,7 +268,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, return DM_MAPIO_SUBMITTED; } -static int stripe_map(struct dm_target *ti, struct bio *bio) +int stripe_map(struct dm_target *ti, struct bio *bio) { struct stripe_c *sc = ti->private; uint32_t stripe; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 37b48f63ae..08c9c20f9c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -129,7 +129,12 @@ static int alloc_targets(struct dm_table *t, unsigned int num) int dm_table_create(struct dm_table **result, blk_mode_t mode, unsigned int num_targets, struct mapped_device *md) { - struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); + struct dm_table *t; + + if (num_targets > DM_MAX_TARGETS) + return -EOVERFLOW; + + t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) return -ENOMEM; @@ -144,7 +149,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode, if (!num_targets) { kfree(t); - return -ENOMEM; + return -EOVERFLOW; } if (alloc_targets(t, num_targets)) { @@ -844,7 +849,8 @@ static bool dm_table_supports_dax(struct dm_table *t, if (!ti->type->direct_access) return false; - if (!ti->type->iterate_devices || + if (dm_target_is_wildcard(ti->type) || + !ti->type->iterate_devices || ti->type->iterate_devices(ti, iterate_fn, NULL)) return 
false; } @@ -1587,6 +1593,14 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, return blk_queue_zoned_model(q) != *zoned_model; } +static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return blk_queue_zoned_model(q) != BLK_ZONED_NONE; +} + /* * Check the device zoned model based on the target feature flag. If the target * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are @@ -1600,6 +1614,18 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); + /* + * For the wildcard target (dm-error), if we do not have a + * backing device, we must always return false. If we have a + * backing device, the result must depend on checking zoned + * model, like for any other target. So for this, check directly + * if the target backing device is zoned as we get "false" when + * dm-error was set without a backing device. + */ + if (dm_target_is_wildcard(ti->type) && + !ti->type->iterate_devices(ti, device_is_zoned_model, NULL)) + return false; + if (dm_target_supports_zoned_hm(ti->type)) { if (!ti->type->iterate_devices || ti->type->iterate_devices(ti, device_not_zoned_model, diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 27e2992ff2..0c4efb0bef 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -116,9 +116,63 @@ EXPORT_SYMBOL(dm_unregister_target); * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ +struct io_err_c { + struct dm_dev *dev; + sector_t start; +}; + +static int io_err_get_args(struct dm_target *tt, unsigned int argc, char **args) +{ + unsigned long long start; + struct io_err_c *ioec; + char dummy; + int ret; + + ioec = kmalloc(sizeof(*ioec), GFP_KERNEL); + if (!ioec) { + tt->error = "Cannot allocate io_err context"; + return -ENOMEM; + } + + ret = -EINVAL; + if (sscanf(args[1], "%llu%c", &start, &dummy) != 1 || + start != (sector_t)start) { + tt->error = "Invalid device sector"; + goto bad; + } + ioec->start = start; + + ret = dm_get_device(tt, args[0], dm_table_get_mode(tt->table), &ioec->dev); + if (ret) { + tt->error = "Device lookup failed"; + goto bad; + } + + tt->private = ioec; + + return 0; + +bad: + kfree(ioec); + + return ret; +} + static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { /* + * If we have arguments, assume it is the path to the backing + * block device and its mapping start sector (same as dm-linear). + * In this case, get the device so that we can get its limits. 
+ */ + if (argc == 2) { + int ret = io_err_get_args(tt, argc, args); + + if (ret) + return ret; + } + + /* * Return error for discards instead of -EOPNOTSUPP */ tt->num_discard_bios = 1; @@ -129,7 +183,12 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) static void io_err_dtr(struct dm_target *tt) { - /* empty */ + struct io_err_c *ioec = tt->private; + + if (ioec) { + dm_put_device(tt, ioec->dev); + kfree(ioec); + } } static int io_err_map(struct dm_target *tt, struct bio *bio) @@ -149,6 +208,45 @@ static void io_err_release_clone_rq(struct request *clone, { } +#ifdef CONFIG_BLK_DEV_ZONED +static sector_t io_err_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct io_err_c *ioec = ti->private; + + return ioec->start + dm_target_offset(ti, bi_sector); +} + +static int io_err_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct io_err_c *ioec = ti->private; + + /* + * This should never be called when we do not have a backing device + * as that mean the target is not a zoned one. + */ + if (WARN_ON_ONCE(!ioec)) + return -EIO; + + return dm_report_zones(ioec->dev->bdev, ioec->start, + io_err_map_sector(ti, args->next_sector), + args, nr_zones); +} +#else +#define io_err_report_zones NULL +#endif + +static int io_err_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct io_err_c *ioec = ti->private; + + if (!ioec) + return 0; + + return fn(ti, ioec->dev, ioec->start, ti->len, data); +} + static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits) { limits->max_discard_sectors = UINT_MAX; @@ -165,15 +263,17 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, static struct target_type error_target = { .name = "error", - .version = {1, 6, 0}, - .features = DM_TARGET_WILDCARD, + .version = {1, 7, 0}, + .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, + .iterate_devices = io_err_iterate_devices, .io_hints = io_err_io_hints, .direct_access = io_err_dax_direct_access, + .report_zones = io_err_report_zones, }; int __init dm_target_init(void) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 14e58ae705..7b620b187d 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -482,6 +482,63 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } +static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memcpy(data, io->recheck_buffer, len); + io->recheck_buffer += len; + + return 0; +} + +static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter start, sector_t cur_block) +{ + struct page *page; + void *buffer; + int r; + struct dm_io_request io_req; + struct dm_io_region io_loc; + + page = mempool_alloc(&v->recheck_pool, GFP_NOIO); + buffer = page_to_virt(page); + + io_req.bi_opf = REQ_OP_READ; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = buffer; + io_req.notify.fn = NULL; + io_req.client = v->io; + io_loc.bdev = v->data_dev->bdev; + io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT); + io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT); + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) + goto free_ret; + + r = verity_hash(v, verity_io_hash_req(v, io), buffer, + 
1 << v->data_dev_block_bits, + verity_io_real_digest(v, io), true); + if (unlikely(r)) + goto free_ret; + + if (memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size)) { + r = -EIO; + goto free_ret; + } + + io->recheck_buffer = buffer; + r = verity_for_bv_block(v, io, &start, verity_recheck_copy); + if (unlikely(r)) + goto free_ret; + + r = 0; +free_ret: + mempool_free(page, &v->recheck_pool); + + return r; +} + static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len) { @@ -508,9 +565,7 @@ static int verity_verify_io(struct dm_verity_io *io) { bool is_zero; struct dm_verity *v = io->v; -#if defined(CONFIG_DM_VERITY_FEC) struct bvec_iter start; -#endif struct bvec_iter iter_copy; struct bvec_iter *iter; struct crypto_wait wait; @@ -561,10 +616,7 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; -#if defined(CONFIG_DM_VERITY_FEC) - if (verity_fec_is_enabled(v)) - start = *iter; -#endif + start = *iter; r = verity_for_io_block(v, io, iter, &wait); if (unlikely(r < 0)) return r; @@ -586,6 +638,10 @@ static int verity_verify_io(struct dm_verity_io *io) * tasklet since it may sleep, so fallback to work-queue. */ return -EAGAIN; + } else if (verity_recheck(v, io, start, cur_block) == 0) { + if (v->validated_blocks) + set_bit(cur_block, v->validated_blocks); + continue; #if defined(CONFIG_DM_VERITY_FEC) } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, cur_block, NULL, &start) == 0) { @@ -645,23 +701,6 @@ static void verity_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } -static void verity_tasklet(unsigned long data) -{ - struct dm_verity_io *io = (struct dm_verity_io *)data; - int err; - - io->in_tasklet = true; - err = verity_verify_io(io); - if (err == -EAGAIN || err == -ENOMEM) { - /* fallback to retrying with work-queue */ - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); - return; - } - - verity_finish_io(io, errno_to_blk_status(err)); -} - static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; @@ -674,13 +713,8 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) { - tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io); - tasklet_schedule(&io->tasklet); - } else { - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); - } + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); } /* @@ -963,6 +997,10 @@ static void verity_dtr(struct dm_target *ti) if (v->verify_wq) destroy_workqueue(v->verify_wq); + mempool_exit(&v->recheck_pool); + if (v->io) + dm_io_client_destroy(v->io); + if (v->bufio) dm_bufio_client_destroy(v->bufio); @@ -1401,6 +1439,20 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->hash_blocks = hash_position; + r = mempool_init_page_pool(&v->recheck_pool, 1, 0); + if (unlikely(r)) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + + v->io = dm_io_client_create(); + if (IS_ERR(v->io)) { + r = PTR_ERR(v->io); + v->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), dm_bufio_alloc_callback, NULL, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f9d522c870..4620a98c99 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -11,6 
+11,7 @@ #ifndef DM_VERITY_H #define DM_VERITY_H +#include <linux/dm-io.h> #include <linux/dm-bufio.h> #include <linux/device-mapper.h> #include <linux/interrupt.h> @@ -68,6 +69,9 @@ struct dm_verity { unsigned long *validated_blocks; /* bitset blocks validated */ char *signature_key_desc; /* signature keyring reference */ + + struct dm_io_client *io; + mempool_t recheck_pool; }; struct dm_verity_io { @@ -83,7 +87,8 @@ struct dm_verity_io { struct bvec_iter iter; struct work_struct work; - struct tasklet_struct tasklet; + + char *recheck_buffer; /* * Three variably-size fields follow this struct: diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 9d3cca8e3d..60a4dc01ea 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -187,7 +187,7 @@ struct dmz_metadata { struct rb_root mblk_rbtree; struct list_head mblk_lru_list; struct list_head mblk_dirty_list; - struct shrinker mblk_shrinker; + struct shrinker *mblk_shrinker; /* Zone allocation management */ struct mutex map_lock; @@ -615,7 +615,7 @@ static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; return atomic_read(&zmd->nr_mblks); } @@ -626,7 +626,7 @@ static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; unsigned long count; spin_lock(&zmd->mblk_lock); @@ -2936,19 +2936,23 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, */ zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; zmd->max_nr_mblks = zmd->min_nr_mblks + 512; - zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; - zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; - zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker, "dm-zoned-meta:(%u:%u)", - MAJOR(dev->bdev->bd_dev), - MINOR(dev->bdev->bd_dev)); - if (ret) { - dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); + zmd->mblk_shrinker = shrinker_alloc(0, "dm-zoned-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); + if (!zmd->mblk_shrinker) { + ret = -ENOMEM; + dmz_zmd_err(zmd, "Allocate metadata cache shrinker failed"); goto err; } + zmd->mblk_shrinker->count_objects = dmz_mblock_shrinker_count; + zmd->mblk_shrinker->scan_objects = dmz_mblock_shrinker_scan; + zmd->mblk_shrinker->private_data = zmd; + + shrinker_register(zmd->mblk_shrinker); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); for (i = 0; i < zmd->nr_devs; i++) dmz_print_dev(zmd, i); @@ -2995,7 +2999,7 @@ err: */ void dmz_dtr_metadata(struct dmz_metadata *zmd) { - unregister_shrinker(&zmd->mblk_shrinker); + shrinker_free(zmd->mblk_shrinker); dmz_cleanup_metadata(zmd); kfree(zmd); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 64a1f306c9..23c32cd1f1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -570,13 +570,15 @@ static void dm_end_io_acct(struct dm_io *io) dm_io_acct(io, true); } -static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) +static struct dm_io *alloc_io(struct mapped_device *md, struct bio 
*bio, gfp_t gfp_mask) { struct dm_io *io; struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); + clone = bio_alloc_clone(NULL, bio, gfp_mask, &md->mempools->io_bs); + if (unlikely(!clone)) + return NULL; tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); @@ -724,7 +726,7 @@ static struct table_device *open_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode) { struct table_device *td; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 part_off; int r; @@ -733,9 +735,9 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL); - if (IS_ERR(bdev)) { - r = PTR_ERR(bdev); + bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL); + if (IS_ERR(bdev_handle)) { + r = PTR_ERR(bdev_handle); goto out_free_td; } @@ -745,20 +747,22 @@ static struct table_device *open_table_device(struct mapped_device *md, * called. */ if (md->disk->slave_dir) { - r = bd_link_disk_holder(bdev, md->disk); + r = bd_link_disk_holder(bdev_handle->bdev, md->disk); if (r) goto out_blkdev_put; } td->dm_dev.mode = mode; - td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); + td->dm_dev.bdev = bdev_handle->bdev; + td->dm_dev.bdev_handle = bdev_handle; + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off, + NULL, NULL); format_dev_t(td->dm_dev.name, dev); list_add(&td->list, &md->table_devices); return td; out_blkdev_put: - blkdev_put(bdev, _dm_claim_ptr); + bdev_release(bdev_handle); out_free_td: kfree(td); return ERR_PTR(r); @@ -771,7 +775,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - blkdev_put(td->dm_dev.bdev, _dm_claim_ptr); + bdev_release(td->dm_dev.bdev_handle); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); @@ -1424,9 +1428,16 @@ static void __map_bio(struct bio *clone) if (unlikely(dm_emulate_zone_append(md))) r = dm_zone_map_bio(tio); else + goto do_map; + } else { +do_map: + if (likely(ti->type->map == linear_map)) + r = linear_map(ti, clone); + else if (ti->type->map == stripe_map) + r = stripe_map(ti, clone); + else r = ti->type->map(ti, clone); - } else - r = ti->type->map(ti, clone); + } switch (r) { case DM_MAPIO_SUBMITTED: @@ -1471,15 +1482,15 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len) static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, - unsigned *len) + unsigned *len, gfp_t gfp_flag) { struct bio *bio; - int try; + int try = (gfp_flag & GFP_NOWAIT) ? 
0 : 1; - for (try = 0; try < 2; try++) { + for (; try < 2; try++) { int bio_nr; - if (try) + if (try && num_bios > 1) mutex_lock(&ci->io->md->table_devices_lock); for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { bio = alloc_tio(ci, ti, bio_nr, len, @@ -1489,7 +1500,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, bio_list_add(blist, bio); } - if (try) + if (try && num_bios > 1) mutex_unlock(&ci->io->md->table_devices_lock); if (bio_nr == num_bios) return; @@ -1499,34 +1510,31 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned int num_bios, unsigned int *len) +static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, + unsigned int num_bios, unsigned int *len, + gfp_t gfp_flag) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; unsigned int ret = 0; - switch (num_bios) { - case 0: - break; - case 1: - if (len) - setup_split_accounting(ci, *len); - clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); - __map_bio(clone); - ret = 1; - break; - default: - if (len) - setup_split_accounting(ci, *len); - /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ - alloc_multiple_bios(&blist, ci, ti, num_bios, len); - while ((clone = bio_list_pop(&blist))) { + if (WARN_ON_ONCE(num_bios == 0)) /* num_bios = 0 is a bug in caller */ + return 0; + + /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ + if (len) + setup_split_accounting(ci, *len); + + /* + * Using alloc_multiple_bios(), even if num_bios is 1, to consistently + * support allocating using GFP_NOWAIT with GFP_NOIO fallback. + */ + alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag); + while ((clone = bio_list_pop(&blist))) { + if (num_bios > 1) dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); - __map_bio(clone); - ret += 1; - } - break; + __map_bio(clone); + ret += 1; } return ret; @@ -1553,8 +1561,12 @@ static void __send_empty_flush(struct clone_info *ci) unsigned int bios; struct dm_target *ti = dm_table_get_target(t, i); + if (unlikely(ti->num_flush_bios == 0)) + continue; + atomic_add(ti->num_flush_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, + NULL, GFP_NOWAIT); atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); } @@ -1567,10 +1579,9 @@ static void __send_empty_flush(struct clone_info *ci) bio_uninit(ci->bio); } -static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - unsigned int num_bios, - unsigned int max_granularity, - unsigned int max_sectors) +static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, + unsigned int num_bios, unsigned int max_granularity, + unsigned int max_sectors) { unsigned int len, bios; @@ -1578,7 +1589,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, num_bios, &len); + bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO); /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following (+1) subtraction @@ -1647,8 +1658,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, if (unlikely(!num_bios)) return BLK_STS_NOTSUPP; - __send_changing_extent_only(ci, ti, 
num_bios, - max_granularity, max_sectors); + __send_abnormal_io(ci, ti, num_bios, max_granularity, max_sectors); + return BLK_STS_OK; } @@ -1707,10 +1718,6 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) if (unlikely(!ti)) return BLK_STS_IOERR; - if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) && - unlikely(!dm_target_supports_nowait(ti->type))) - return BLK_STS_NOTSUPP; - if (unlikely(ci->is_abnormal_io)) return __process_abnormal_io(ci, ti); @@ -1722,7 +1729,17 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); setup_split_accounting(ci, len); - clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); + + if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { + if (unlikely(!dm_target_supports_nowait(ti->type))) + return BLK_STS_NOTSUPP; + + clone = alloc_tio(ci, ti, 0, &len, GFP_NOWAIT); + if (unlikely(!clone)) + return BLK_STS_AGAIN; + } else { + clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); + } __map_bio(clone); ci->sector += len; @@ -1731,11 +1748,11 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) return BLK_STS_OK; } -static void init_clone_info(struct clone_info *ci, struct mapped_device *md, +static void init_clone_info(struct clone_info *ci, struct dm_io *io, struct dm_table *map, struct bio *bio, bool is_abnormal) { ci->map = map; - ci->io = alloc_io(md, bio); + ci->io = io; ci->bio = bio; ci->is_abnormal_io = is_abnormal; ci->submit_as_polled = false; @@ -1770,8 +1787,18 @@ static void dm_split_and_process_bio(struct mapped_device *md, return; } - init_clone_info(&ci, md, map, bio, is_abnormal); - io = ci.io; + /* Only support nowait for normal IO */ + if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { + io = alloc_io(md, bio, GFP_NOWAIT); + if (unlikely(!io)) { + /* Unable to do anything without dm_io. 
*/ + bio_wouldblock_error(bio); + return; + } + } else { + io = alloc_io(md, bio, GFP_NOIO); + } + init_clone_info(&ci, io, map, bio, is_abnormal); if (bio->bi_opf & REQ_PREFLUSH) { __send_empty_flush(&ci); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f682295af9..7f1acbf6bd 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -188,9 +188,11 @@ void dm_kobject_release(struct kobject *kobj); /* * Targets for linear and striped mappings */ +int linear_map(struct dm_target *ti, struct bio *bio); int dm_linear_init(void); void dm_linear_exit(void); +int stripe_map(struct dm_target *ti, struct bio *bio); int dm_stripe_init(void); void dm_stripe_exit(void); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 6eaa0eab40..4b80165afd 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args) return; } - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); goto out_mddev_put; @@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args) if (err) pr_warn("md: starting %s failed\n", name); out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); out_mddev_put: mddev_put(mddev); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 6f9ff14971..9672f75c30 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev) md_bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2348,14 +2348,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len) { int rv; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; + if (mddev->pers) { - if (!mddev->pers->quiesce) { - rv = -EBUSY; - goto out; - } if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; goto out; @@ -2369,11 +2366,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) rv = -EBUSY; goto out; } - if (mddev->pers) { - mddev_suspend(mddev); - md_bitmap_destroy(mddev); - mddev_resume(mddev); - } + + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; @@ -2383,6 +2377,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { /* No bitmap, OK to set a location */ long long offset; + struct bitmap *bitmap; + if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; else if (strncmp(buf, "file:", 5) == 0) { @@ -2406,25 +2402,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; goto out; } + mddev->bitmap_info.offset = offset; - if (mddev->pers) { - struct bitmap *bitmap; - bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); - if (IS_ERR(bitmap)) - rv = PTR_ERR(bitmap); - else { - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); - if (rv) - mddev->bitmap_info.offset = 0; - } - if (rv) { - md_bitmap_destroy(mddev); - mddev_resume(mddev); - goto out; - } - mddev_resume(mddev); + bitmap = md_bitmap_create(mddev, 
-1); + if (IS_ERR(bitmap)) { + rv = PTR_ERR(bitmap); + goto out; + } + + mddev->bitmap = bitmap; + rv = md_bitmap_load(mddev); + if (rv) { + mddev->bitmap_info.offset = 0; + md_bitmap_destroy(mddev); + goto out; } } } @@ -2437,7 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } rv = 0; out: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (rv) return rv; return len; @@ -2546,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; @@ -2571,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, false); + mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return len; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1e26eb2233..8e36a0feec 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -501,7 +501,7 @@ static void process_suspend_info(struct mddev *mddev, mddev->pers->quiesce(mddev, 0); } -static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) { char disk_uuid[64]; struct md_cluster_info *cinfo = mddev->cluster_info; @@ -509,6 +509,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) char raid_slot[16]; char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; int len; + int res = 0; len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); @@ -517,9 +518,14 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); - wait_for_completion_timeout(&cinfo->newdisk_completion, - NEW_DEV_TIMEOUT); + if (!wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT)) { + pr_err("md-cluster(%s:%d): timeout on a new disk adding\n", + __func__, __LINE__); + res = -1; + } clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + return res; } @@ -594,7 +600,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) le64_to_cpu(msg->high)); break; case NEWDISK: - process_add_new_disk(mddev, msg); + if (process_add_new_disk(mddev, msg)) + ret = -1; break; case REMOVE: process_remove_disk(mddev, msg); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 71ac996468..8eca7693b7 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) if (!conf) return NULL; + /* + * conf->raid_disks is copy of mddev->raid_disks. 
The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + cnt = 0; conf->array_sectors = 0; @@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; - /* - * conf->raid_disks is copy of mddev->raid_disks. The reason to - * keep a copy of mddev->raid_disks in struct linear_conf is, - * mddev->raid_disks may not be consistent with pointers number of - * conf->disks[] when it is updated in linear_add() and used to - * iterate old conf->disks[] earray in linear_congested(). - * Here conf->raid_disks is always consitent with number of - * pointers in conf->disks[] array, and mddev->private is updated - * with rcu_assign_pointer() in linear_addr(), such race can be - * avoided. - */ - conf->raid_disks = raid_disks; - return conf; out: @@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) * in linear_congested(), therefore kfree_rcu() is used to free * oldconf until no one uses it anymore. */ - mddev_suspend(mddev); oldconf = rcu_dereference_protected(mddev->private, lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; @@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - mddev_resume(mddev); kfree_rcu(oldconf, rcu); return 0; } diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h index 24e97db50e..5587eeedb8 100644 --- a/drivers/md/md-linear.h +++ b/drivers/md/md-linear.h @@ -12,6 +12,6 @@ struct linear_conf struct rcu_head rcu; sector_t array_sectors; int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[]; + struct dev_info disks[] __counted_by(raid_disks); }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 8c40c1c395..58889bc726 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -82,6 +82,14 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; + +/* + * This workqueue is used for sync_work to register new sync_thread, and for + * del_work to remove rdev, and for event_work that is only set by dm-raid. + * + * Noted that sync_work will grab reconfig_mutex, hence never flush this + * workqueue whith reconfig_mutex grabbed. + */ static struct workqueue_struct *md_misc_wq; struct workqueue_struct *md_bitmap_wq; @@ -91,6 +99,18 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -206,8 +226,7 @@ static int rdev_need_serial(struct md_rdev *rdev) * 1. 
rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ -void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; @@ -215,15 +234,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, !test_bit(CollisionCheck, &rdev->flags)) return; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) - goto abort; + return; if (mddev->serial_info_pool == NULL) { /* @@ -238,10 +254,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, pr_err("can't alloc memory pool for serialization\n"); } } - -abort: - if (!is_suspend) - mddev_resume(mddev); } /* @@ -250,8 +262,7 @@ abort: * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -260,8 +271,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ - if (!is_suspend) - mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || @@ -283,8 +292,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } - if (!is_suspend) - mddev_resume(mddev); } } @@ -305,7 +312,6 @@ static struct ctl_table raid_table[] = { .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, - { } }; static int start_readonly; @@ -346,6 +352,10 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -359,11 +369,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; if (bio_data_dir(bio) != WRITE) return false; - if (mddev->suspend_lo >= mddev->suspend_hi) + if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio_end_sector(bio) < mddev->suspend_lo) + if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } @@ -431,42 +441,73 @@ static void md_submit_bio(struct bio *bio) md_handle_request(mddev, bio); } -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once mddev_detach() is called and completes, the module will be - * completely unused. +/* + * Make sure no new requests are submitted to the device, and any requests that + * have been submitted are completely handled. 
*/ -void mddev_suspend(struct mddev *mddev) +int mddev_suspend(struct mddev *mddev, bool interruptible) { - struct md_thread *thread = rcu_dereference_protected(mddev->thread, - lockdep_is_held(&mddev->reconfig_mutex)); + int err = 0; - WARN_ON_ONCE(thread && current == thread->tsk); - if (mddev->suspended++) - return; - wake_up(&mddev->sb_wait); - set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - percpu_ref_kill(&mddev->active_io); + /* + * hold reconfig_mutex to wait for normal io will deadlock, because + * other context can't update super_block, and normal io can rely on + * updating super_block. + */ + lockdep_assert_not_held(&mddev->reconfig_mutex); - if (mddev->pers && mddev->pers->prepare_suspend) - mddev->pers->prepare_suspend(mddev); + if (interruptible) + err = mutex_lock_interruptible(&mddev->suspend_mutex); + else + mutex_lock(&mddev->suspend_mutex); + if (err) + return err; - wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); - wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); + if (mddev->suspended) { + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + mutex_unlock(&mddev->suspend_mutex); + return 0; + } + + percpu_ref_kill(&mddev->active_io); + if (interruptible) + err = wait_event_interruptible(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + else + wait_event(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + if (err) { + percpu_ref_resurrect(&mddev->active_io); + mutex_unlock(&mddev->suspend_mutex); + return err; + } + + /* + * For raid456, io might be waiting for reshape to make progress, + * allow new reshape to start while waiting for io to be done to + * prevent deadlock. + */ + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); + + mutex_unlock(&mddev->suspend_mutex); + return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); -void mddev_resume(struct mddev *mddev) +static void __mddev_resume(struct mddev *mddev, bool recovery_needed) { - lockdep_assert_held(&mddev->reconfig_mutex); - if (--mddev->suspended) + lockdep_assert_not_held(&mddev->reconfig_mutex); + + mutex_lock(&mddev->suspend_mutex); + WRITE_ONCE(mddev->suspended, mddev->suspended - 1); + if (mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); return; + } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); @@ -474,9 +515,17 @@ void mddev_resume(struct mddev *mddev) percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (recovery_needed) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); +} + +void mddev_resume(struct mddev *mddev) +{ + return __mddev_resume(mddev, true); } EXPORT_SYMBOL_GPL(mddev_resume); @@ -530,8 +579,12 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); + queue_work(md_wq, &mddev->flush_work); + } } static void md_submit_flush_data(struct work_struct *ws) @@ -626,34 +679,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev) static void 
mddev_delayed_delete(struct work_struct *ws); +static void __mddev_put(struct mddev *mddev) +{ + if (mddev->raid_disks || !list_empty(&mddev->disks) || + mddev->ctime || mddev->hold_active) + return; + + /* Array is not configured at all, and not held active, so destroy it */ + set_bit(MD_DELETED, &mddev->flags); + + /* + * Call queue_work inside the spinlock so that flush_workqueue() after + * mddev_find will succeed in waiting for the work to be done. + */ + queue_work(md_misc_wq, &mddev->del_work); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - set_bit(MD_DELETED, &mddev->flags); - /* - * Call queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will succeed in waiting - * for the work to be done. - */ - INIT_WORK(&mddev->del_work, mddev_delayed_delete); - queue_work(md_misc_wq, &mddev->del_work); - } + __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); +static void md_start_sync(struct work_struct *ws); + +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} + +static void no_op(struct percpu_ref *r) {} -void mddev_init(struct mddev *mddev) +int mddev_init(struct mddev *mddev) { + + if (percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + return -ENOMEM; + + if (percpu_ref_init(&mddev->writes_pending, no_op, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + percpu_ref_exit(&mddev->active_io); + return -ENOMEM; + } + + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); + mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -672,9 +754,21 @@ void mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + + INIT_WORK(&mddev->sync_work, md_start_sync); + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + + return 0; } EXPORT_SYMBOL_GPL(mddev_init); +void mddev_destroy(struct mddev *mddev) +{ + percpu_ref_exit(&mddev->active_io); + percpu_ref_exit(&mddev->writes_pending); +} +EXPORT_SYMBOL_GPL(mddev_destroy); + static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; @@ -718,13 +812,16 @@ static struct mddev *mddev_alloc(dev_t unit) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); - mddev_init(new); + + error = mddev_init(new); + if (error) + goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) - goto out_free_new; + goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -735,7 +832,7 @@ static struct mddev *mddev_alloc(dev_t unit) error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) - goto out_free_new; + goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } @@ -743,8 +840,11 @@ static struct mddev *mddev_alloc(dev_t unit) list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; -out_free_new: + +out_destroy_new: 
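
A sketch of the new allocate/init/teardown pairing, assuming a hypothetical caller that allocates struct mddev directly; it mirrors what mddev_alloc() and mddev_free() do now that mddev_init() can fail and both percpu refs live for the whole mddev lifetime.

static struct mddev *example_alloc_mddev(void)
{
        struct mddev *mddev = kzalloc(sizeof(*mddev), GFP_KERNEL);

        if (!mddev)
                return NULL;

        /* sets up active_io and writes_pending percpu refs, mutexes, works */
        if (mddev_init(mddev)) {
                kfree(mddev);
                return NULL;
        }
        return mddev;
}

static void example_free_mddev(struct mddev *mddev)
{
        mddev_destroy(mddev);   /* percpu_ref_exit() for both refs */
        kfree(mddev);
}
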
spin_unlock(&all_mddevs_lock); + mddev_destroy(new); +out_free_new: kfree(new); return ERR_PTR(error); } @@ -755,6 +855,7 @@ static void mddev_free(struct mddev *mddev) list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); + mddev_destroy(mddev); kfree(mddev); } @@ -940,9 +1041,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, - 1, - REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, - GFP_NOIO, &mddev->sync_set); + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META + | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); @@ -1122,6 +1224,7 @@ struct super_type { struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, + struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); @@ -1259,8 +1362,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor /* * validate_super for 0.90.0 + * note: we are not using "freshest" for 0.9 superblock */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); @@ -1772,7 +1876,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return ret; } -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); @@ -1868,13 +1972,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; + * spares (which don't need an event count). + * Similar to mdadm, we allow event counter difference of 1 + * from the freshest device. + */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) - if (ev1 < mddev->events) + if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an @@ -1895,8 +2001,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; - } else + } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { + /* + * If we are assembling, and our event counter is smaller than the + * highest event counter, we cannot trust our superblock about the role. + * It could happen that our rdev was marked as Faulty, and all other + * superblocks were updated with +1 event counter. + * Then, before the next superblock update, which typically happens when + * remove_and_add_spares() removes the device from the array, there was + * a crash or reboot. + * If we allow current rdev without consulting the freshest superblock, + * we could cause data corruption. + * Note that in this case our event counter is smaller by 1 than the + * highest, otherwise, this rdev would not be allowed into array; + * both kernel and mdadm allow event counter difference of 1. 
+ */ + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); + + if (rdev->desc_nr >= freshest_max_dev) { + /* this is unexpected, better not proceed */ + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", + mdname(mddev), rdev->bdev, rdev->desc_nr, + freshest->bdev, freshest_max_dev); + return -EUCLEAN; + } + + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", + mdname(mddev), rdev->bdev, role, role, freshest->bdev); + } else { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + } switch(role) { case MD_DISK_ROLE_SPARE: /* spare */ break; @@ -2422,7 +2558,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2462,8 +2598,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, - test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); + bdev_release(rdev->bdev_handle); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -2475,7 +2610,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2804,12 +2939,8 @@ static int add_bound_rdev(struct md_rdev *rdev) * and should be added immediately. */ super_types[mddev->major_version]. 
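
A condensed sketch of the assembly-time rule added above, with the desc_nr bounds checks and the spare/journal special cases stripped out: a member may trail the freshest device by at most one event, and when it does trail, its role is taken from the freshest superblock instead of its own possibly stale copy. The helper name is hypothetical.

/* returns a MD_DISK_ROLE_* value, or -EINVAL if the member is too stale */
static int example_pick_role(struct mddev *mddev, struct md_rdev *rdev,
                             struct mdp_superblock_1 *sb,
                             struct mdp_superblock_1 *freshest_sb, u64 ev1)
{
        if (ev1 + 1 < mddev->events)
                return -EINVAL;         /* more than one event behind: kick it */

        if (freshest_sb && ev1 < mddev->events) /* exactly one event behind */
                return le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);

        return le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
}
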
- validate_super(mddev, rdev); - if (add_journal) - mddev_suspend(mddev); + validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); - if (add_journal) - mddev_resume(mddev); if (err) { md_kick_rdev_from_array(rdev); return err; @@ -2946,11 +3077,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_serial_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; @@ -3562,6 +3693,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; + bool suspend = false; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3569,17 +3701,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!mddev) + return -ENODEV; - if (entry->store == state_store && cmd_match(page, "remove")) - kn = sysfs_break_active_protection(kobj, attr); + if (entry->store == state_store) { + if (cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + if (cmd_match(page, "remove") || cmd_match(page, "re-add") || + cmd_match(page, "writemostly") || + cmd_match(page, "-writemostly")) + suspend = true; + } - rv = mddev ? mddev_lock(mddev) : -ENODEV; + rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) @@ -3643,7 +3783,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init); static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; - struct md_rdev *holder; sector_t size; int err; @@ -3658,21 +3797,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - if (super_format == -2) { - holder = &claim_rdev; - } else { - holder = rdev; - set_bit(Holder, &rdev->flags); - } - - rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - holder, NULL); - if (IS_ERR(rdev->bdev)) { + rdev->bdev_handle = bdev_open_by_dev(newdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, + super_format == -2 ? &claim_rdev : rdev, NULL); + if (IS_ERR(rdev->bdev_handle)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); - err = PTR_ERR(rdev->bdev); + err = PTR_ERR(rdev->bdev_handle); goto out_clear_rdev; } + rdev->bdev = rdev->bdev_handle->bdev; kobject_init(&rdev->kobj, &rdev_ktype); @@ -3703,7 +3837,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - blkdev_put(rdev->bdev, holder); + bdev_release(rdev->bdev_handle); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: @@ -3742,7 +3876,7 @@ static int analyze_sbs(struct mddev *mddev) } super_types[mddev->major_version]. 
- validate_super(mddev, freshest); + validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { @@ -3757,7 +3891,7 @@ static int analyze_sbs(struct mddev *mddev) } if (rdev != freshest) { if (super_types[mddev->major_version]. - validate_super(mddev, rdev)) { + validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); @@ -3884,12 +4018,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { - strncpy(mddev->clevel, buf, slen); + memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; @@ -3922,7 +4056,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Now find the new personality */ - strncpy(clevel, buf, slen); + memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; @@ -3977,7 +4111,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Looks like we have a winner */ - mddev_suspend(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); @@ -4063,14 +4196,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len) blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return rv; } @@ -4378,6 +4510,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); + /* No lock dependent actions */ + switch (st) { + case suspended: /* not supported yet */ + case write_pending: /* cannot be set */ + case active_idle: /* cannot be set */ + case broken: /* cannot be set */ + case bad_word: + return -EINVAL; + default: + break; + } + if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between @@ -4402,23 +4546,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_lock(mddev); if (err) return err; - err = -EINVAL; - switch(st) { - case bad_word: - break; - case clear: - /* stopping an active array */ - err = do_md_stop(mddev, 0, NULL); - break; + + switch (st) { case inactive: - /* stopping an active array */ + /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2, NULL); - else - err = 0; /* already inactive */ break; - case suspended: - break; /* not supported yet */ + case clear: + err = do_md_stop(mddev, 0, NULL); + break; case readonly: if (mddev->pers) err = md_set_readonly(mddev, NULL); @@ -4469,10 +4606,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = do_md_run(mddev); } break; - case write_pending: - case active_idle: - case broken: - /* these cannot be set */ + default: + err = -EINVAL; break; } @@ -4545,7 +4680,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { @@ -4566,14 +4701,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - 
mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; @@ -4708,7 +4843,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; - strncpy(mddev->metadata_type, buf+9, namelen); + memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; @@ -4768,25 +4903,29 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } -static void stop_sync_thread(struct mddev *mddev) +/** + * stop_sync_thread() - wait for sync_thread to stop if it's running. + * @mddev: the array. + * @locked: if set, reconfig_mutex will still be held after this function + * return; if not set, reconfig_mutex will be released after this + * function return. + * @check_seq: if set, only wait for curent running sync_thread to stop, noted + * that new sync_thread can still start. + */ +static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) { - if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return; + int sync_seq; - if (mddev_lock(mddev)) - return; + if (check_seq) + sync_seq = atomic_read(&mddev->sync_seq); - /* - * Check again in case MD_RECOVERY_RUNNING is cleared before lock is - * held. - */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - mddev_unlock(mddev); + if (!locked) + mddev_unlock(mddev); return; } - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); + mddev_unlock(mddev); set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* @@ -4794,21 +4933,28 @@ static void stop_sync_thread(struct mddev *mddev) * never happen */ md_wakeup_thread_directly(mddev->sync_thread); + if (work_pending(&mddev->sync_work)) + flush_work(&mddev->sync_work); - mddev_unlock(mddev); + wait_event(resync_wait, + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + + if (locked) + mddev_lock_nointr(mddev); } static void idle_sync_thread(struct mddev *mddev) { - int sync_seq = atomic_read(&mddev->sync_seq); - mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev); - wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + if (mddev_lock(mddev)) { + mutex_unlock(&mddev->sync_mutex); + return; + } + stop_sync_thread(mddev, false, true); mutex_unlock(&mddev->sync_mutex); } @@ -4816,11 +4962,13 @@ static void frozen_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev); - wait_event(resync_wait, mddev->sync_thread == NULL && - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + if (mddev_lock(mddev)) { + mutex_unlock(&mddev->sync_mutex); + return; + } + stop_sync_thread(mddev, false, false); mutex_unlock(&mddev->sync_mutex); } @@ -4882,6 +5030,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) /* A write to sync_action is enough to justify * canceling read-auto mode */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } @@ -5138,7 +5287,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 
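
For quick reference, the three call forms of the reworked stop_sync_thread() pulled together from the hunks above; this is a simplification, and the surrounding locking and MD_RECOVERY_FROZEN handling is only noted in comments.

/* "idle": wait for the pass that is currently running; a new one may start */
stop_sync_thread(mddev, false, true);

/* "frozen": caller has already set MD_RECOVERY_FROZEN, so nothing restarts */
stop_sync_thread(mddev, false, false);

/* teardown paths that enter with reconfig_mutex held and must keep it */
stop_sync_thread(mddev, true, false);
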
static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t @@ -5153,21 +5303,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_lo = new; + + WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -5175,7 +5318,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t @@ -5190,21 +5334,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_hi = new; + WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -5451,7 +5588,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (value == mddev->serialize_policy) return len; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { @@ -5460,15 +5597,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - mddev_suspend(mddev); if (value) - mddev_create_serial_pool(mddev, NULL, true); + mddev_create_serial_pool(mddev, NULL); else - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; - mddev_resume(mddev); unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -5607,21 +5742,6 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } -static void no_op(struct percpu_ref *r) {} - -int mddev_init_writes_pending(struct mddev *mddev) -{ - if (mddev->writes_pending.percpu_count_ptr) - return 0; - if (percpu_ref_init(&mddev->writes_pending, no_op, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) - return -ENOMEM; - /* We want to start with the refcount at zero */ - percpu_ref_put(&mddev->writes_pending); - return 0; -} -EXPORT_SYMBOL_GPL(mddev_init_writes_pending); - struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5793,12 +5913,6 @@ static void md_safemode_timeout(struct timer_list *t) } static int start_dirty_degraded; -static void active_io_release(struct percpu_ref *ref) -{ - struct mddev *mddev = container_of(ref, struct mddev, active_io); - - wake_up(&mddev->sb_wait); -} int md_run(struct mddev *mddev) { @@ -5879,15 +5993,10 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } - err = 
percpu_ref_init(&mddev->active_io, active_io_release, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); - if (err) - return err; - if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - goto exit_active_io; + return err; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); @@ -6084,8 +6193,6 @@ exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); -exit_active_io: - percpu_ref_exit(&mddev->active_io); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6233,14 +6340,7 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - + stop_sync_thread(mddev, true, false); del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { @@ -6259,7 +6359,7 @@ static void __md_stop_writes(struct mddev *mddev) } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) @@ -6287,9 +6387,6 @@ static void __md_stop(struct mddev *mddev) struct md_personality *pers = mddev->pers; md_bitmap_destroy(mddev); mddev_detach(mddev); - /* Ensure ->event_work is done */ - if (mddev->event_work.func) - flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -6301,7 +6398,6 @@ static void __md_stop(struct mddev *mddev) module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); bioset_exit(&mddev->io_clone_set); @@ -6316,7 +6412,6 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); - percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(md_stop); @@ -6334,18 +6429,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - /* - * Thread might be blocked waiting for metadata update which will now - * never happen - */ - md_wakeup_thread_directly(mddev->sync_thread); - - mddev_unlock(mddev); - wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, - &mddev->recovery)); + stop_sync_thread(mddev, false, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6399,20 +6484,8 @@ static int do_md_stop(struct mddev *mddev, int mode, set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - - /* - * Thread might be blocked waiting for metadata update which will now - * never happen - */ - md_wakeup_thread_directly(mddev->sync_thread); - mddev_unlock(mddev); - wait_event(resync_wait, (mddev->sync_thread == NULL && - !test_bit(MD_RECOVERY_RUNNING, - &mddev->recovery))); - mddev_lock_nointr(mddev); + stop_sync_thread(mddev, true, false); mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || @@ 
-6555,13 +6628,13 @@ static void autorun_devices(int part) if (IS_ERR(mddev)) break; - if (mddev_lock(mddev)) + if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; @@ -6571,7 +6644,7 @@ static void autorun_devices(int part) export_rdev(rdev, mddev); } autorun_array(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... @@ -6809,7 +6882,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't @@ -7121,7 +7194,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); @@ -7131,11 +7203,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) md_bitmap_destroy(mddev); fd = -1; } - mddev_resume(mddev); } else if (fd < 0) { - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); } } if (fd < 0) { @@ -7424,7 +7493,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); @@ -7432,7 +7500,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); - mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -7457,9 +7524,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7530,6 +7595,20 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static bool md_ioctl_need_suspend(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case SET_BITMAP_FILE: + case SET_ARRAY_INFO: + return true; + default: + return false; + } +} + static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; @@ -7658,7 +7737,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } - err = mddev_lock(mddev); + + if (!md_is_rdwr(mddev)) + flush_work(&mddev->sync_work); + + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : + mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); @@ -7786,7 +7870,10 @@ unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; - mddev_unlock(mddev); + + md_ioctl_need_suspend(cmd) ? 
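
A simplified fragment showing how the new md_ioctl_need_suspend() helper is consumed in md_ioctl(), assuming cmd and mddev from that function's scope: configuration-changing ioctls suspend the array around the whole critical section, everything else only takes reconfig_mutex.

int err;

err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev)
                                 : mddev_lock(mddev);
if (err)
        return err;

/* ... process the ioctl ... */

if (md_ioctl_need_suspend(cmd))
        mddev_unlock_and_resume(mddev);
else
        mddev_unlock(mddev);
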
mddev_unlock_and_resume(mddev) : + mddev_unlock(mddev); + out: if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); @@ -7898,7 +7985,6 @@ static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; - percpu_ref_exit(&mddev->writes_pending); mddev_free(mddev); } @@ -8075,6 +8161,19 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } +static void status_personalities(struct seq_file *seq) +{ + struct md_personality *pers; + + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); +} + static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; @@ -8214,105 +8313,43 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } static void *md_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&all_mddevs_lock) { - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; - - if (l == 0x10000) { - ++*pos; - return (void *)2; - } - if (l > 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; - + seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - if (!mddev_get(mddev)) - continue; - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; + + return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - struct mddev *to_put = NULL; - - ++*pos; - if (v == (void*)2) - return NULL; - - spin_lock(&all_mddevs_lock); - if (v == (void*)1) { - tmp = all_mddevs.next; - } else { - to_put = mddev; - tmp = mddev->all_mddevs.next; - } - - for (;;) { - if (tmp == &all_mddevs) { - next_mddev = (void*)2; - *pos = 0x10000; - break; - } - next_mddev = list_entry(tmp, struct mddev, all_mddevs); - if (mddev_get(next_mddev)) - break; - mddev = next_mddev; - tmp = mddev->all_mddevs.next; - } - spin_unlock(&all_mddevs_lock); - - if (to_put) - mddev_put(to_put); - return next_mddev; - + return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) + __releases(&all_mddevs_lock) { - struct mddev *mddev = v; - - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); + spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = v; + struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); + if (v == &all_mddevs) { + status_personalities(seq); + if (list_empty(&all_mddevs)) + status_unused(seq); return 0; } - if (v == (void*)2) { - status_unused(seq); + + mddev = list_entry(v, struct mddev, all_mddevs); + if (!mddev_get(mddev)) return 0; - } + spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), @@ -8383,6 +8420,13 @@ static int md_seq_show(struct seq_file *seq, void *v) 
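
A sketch of the reference-count dance the md_seq_* hunks converge on: all_mddevs_lock is held from ->start to ->stop, and ->show takes its own reference, drops the lock while printing, then retakes it and releases the reference without going through the normal mddev_put() path. The function name is illustrative.

static int example_seq_show(struct seq_file *seq, void *v)
{
        struct mddev *mddev;

        if (v == &all_mddevs)           /* list head doubles as the header row */
                return 0;

        mddev = list_entry(v, struct mddev, all_mddevs);
        if (!mddev_get(mddev))          /* array is being torn down, skip it */
                return 0;

        spin_unlock(&all_mddevs_lock);
        /* ... emit the per-array status lines ... */
        spin_lock(&all_mddevs_lock);

        /* all_mddevs_lock already held, so skip atomic_dec_and_lock() */
        if (atomic_dec_and_test(&mddev->active))
                __mddev_put(mddev);
        return 0;
}
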
seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + spin_lock(&all_mddevs_lock); + + if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) + status_unused(seq); + + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); return 0; } @@ -8582,6 +8626,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -8772,12 +8817,16 @@ void md_do_sync(struct md_thread *thread) int ret; /* just incase thread restarts... */ - if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - return; + goto skip; } if (mddev_is_clustered(mddev)) { @@ -9168,12 +9217,90 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); - wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } EXPORT_SYMBOL_GPL(md_do_sync); +static bool rdev_removeable(struct md_rdev *rdev) +{ + /* rdev is not used. */ + if (rdev->raid_disk < 0) + return false; + + /* There are still inflight io, don't remove this rdev. */ + if (atomic_read(&rdev->nr_pending)) + return false; + + /* + * An error occurred but has not yet been acknowledged by the metadata + * handler, don't remove this rdev. + */ + if (test_bit(Blocked, &rdev->flags)) + return false; + + /* Fautly rdev is not used, it's safe to remove it. */ + if (test_bit(Faulty, &rdev->flags)) + return true; + + /* Journal disk can only be removed if it's faulty. */ + if (test_bit(Journal, &rdev->flags)) + return false; + + /* + * 'In_sync' is cleared while 'raid_disk' is valid, which means + * replacement has just become active from pers->spare_active(), and + * then pers->hot_remove_disk() will replace this rdev with replacement. + */ + if (!test_bit(In_sync, &rdev->flags)) + return true; + + return false; +} + +static bool rdev_is_spare(struct md_rdev *rdev) +{ + return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags); +} + +static bool rdev_addable(struct md_rdev *rdev) +{ + /* rdev is already used, don't add it again. */ + if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || + test_bit(Faulty, &rdev->flags)) + return false; + + /* Allow to add journal disk. */ + if (test_bit(Journal, &rdev->flags)) + return true; + + /* Allow to add if array is read-write. */ + if (md_is_rdwr(rdev->mddev)) + return true; + + /* + * For read-only array, only allow to readd a rdev. And if bitmap is + * used, don't allow to readd a rdev that is too old. 
+ */ + if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) + return true; + + return false; +} + +static bool md_spares_need_change(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev_removeable(rdev) || rdev_addable(rdev)) + return true; + return false; +} + static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) { @@ -9206,12 +9333,8 @@ static int remove_and_add_spares(struct mddev *mddev, synchronize_rcu(); rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - ((test_bit(RemoveSynchronized, &rdev->flags) || - (!test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0)) { + (test_bit(RemoveSynchronized, &rdev->flags) || + rdev_removeable(rdev))) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -9233,25 +9356,12 @@ static int remove_and_add_spares(struct mddev *mddev, rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; - if (test_bit(Candidate, &rdev->flags)) - continue; - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + if (rdev_is_spare(rdev)) spares++; - if (rdev->raid_disk >= 0) + if (!rdev_addable(rdev)) continue; - if (test_bit(Faulty, &rdev->flags)) - continue; - if (!test_bit(Journal, &rdev->flags)) { - if (!md_is_rdwr(mddev) && - !(rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; - + if (!test_bit(Journal, &rdev->flags)) rdev->recovery_offset = 0; - } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { /* failure here is OK */ sysfs_link_rdev(mddev, rdev); @@ -9267,30 +9377,152 @@ no_add: return spares; } +static bool md_choose_sync_action(struct mddev *mddev, int *spares) +{ + /* Check if reshape is in progress first. */ + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + return false; + + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* + * Remove any failed drives, then add spares if possible. Spares are + * also removed and re-added, to allow the personality to fail the + * re-add. + */ + *spares = remove_and_add_spares(mddev, NULL); + if (*spares) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + + /* Start new recovery. */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if recovery is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Delay to choose resync/check/repair in md_do_sync(). */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + return true; + + /* Nothing to be done */ + return false; +} + static void md_start_sync(struct work_struct *ws) { - struct mddev *mddev = container_of(ws, struct mddev, del_work); + struct mddev *mddev = container_of(ws, struct mddev, sync_work); + int spares = 0; + bool suspend = false; + char *name; + + /* + * If reshape is still in progress, spares won't be added or removed + * from conf until reshape is done. 
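
A heavily simplified sketch of how the three new predicates compose inside remove_and_add_spares(); the 'this' filter, RemoveSynchronized handling, synchronize_rcu() and the recovery_offset bookkeeping from the real hunk are omitted, and the function name is hypothetical.

static int example_remove_and_add(struct mddev *mddev)
{
        struct md_rdev *rdev;
        int spares = 0;

        /* pass 1: detach whatever rdev_removeable() says is safe to drop */
        rdev_for_each(rdev, mddev)
                if (rdev_removeable(rdev) &&
                    mddev->pers->hot_remove_disk(mddev, rdev) == 0) {
                        sysfs_unlink_rdev(mddev, rdev);
                        rdev->raid_disk = -1;
                }

        /* pass 2: count spares and offer rdev_addable() devices back */
        rdev_for_each(rdev, mddev) {
                if (rdev_is_spare(rdev))
                        spares++;
                if (rdev_addable(rdev) &&
                    mddev->pers->hot_add_disk(mddev, rdev) == 0)
                        sysfs_link_rdev(mddev, rdev);
        }
        return spares;
}
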
+ */ + if (mddev->reshape_position == MaxSector && + md_spares_need_change(mddev)) { + suspend = true; + mddev_suspend(mddev, false); + } + + mddev_lock_nointr(mddev); + if (!md_is_rdwr(mddev)) { + /* + * On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself is in-sync. + * As we only add devices that are already in-sync, we can + * activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + goto not_running; + } + + if (!md_choose_sync_action(mddev, &spares)) + goto not_running; + + if (!mddev->pers->sync_request) + goto not_running; + /* + * We are adding a device or devices to an array which has the bitmap + * stored on all devices. So make sure all bitmap pages get written. + */ + if (spares) + md_bitmap_write_all(mddev->bitmap); + + name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? + "reshape" : "resync"; rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "resync")); + md_register_thread(md_do_sync, mddev, name)); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); - } else - md_wakeup_thread(mddev->sync_thread); + goto not_running; + } + + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); + return; + +not_running: + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev_unlock(mddev); + /* + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should + * not set it again. Otherwise, we may cause issue like this one: + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 + * Therefore, use __mddev_resume(mddev, false). + */ + if (suspend) + __mddev_resume(mddev, false); + + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); +} + +static void unregister_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + return; + + md_reap_sync_thread(mddev); } /* @@ -9317,21 +9549,6 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. 
- */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); - } - - if (is_md_suspended(mddev)) - return; - if (mddev->bitmap) md_bitmap_daemon_work(mddev); @@ -9345,7 +9562,8 @@ void md_check_recovery(struct mddev *mddev) } if (!md_is_rdwr(mddev) && - !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || @@ -9358,7 +9576,6 @@ void md_check_recovery(struct mddev *mddev) return; if (mddev_trylock(mddev)) { - int spares = 0; bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) @@ -9366,30 +9583,42 @@ void md_check_recovery(struct mddev *mddev) if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + unregister_sync_thread(mddev); + goto unlock; + } + if (!mddev->external && mddev->in_sync) - /* 'Blocked' flag not needed as failed devices + /* + * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed. */ rdev_for_each(rdev, mddev) clear_bit(Blocked, &rdev->flags); - /* On a read-only array we can: - * - remove failed devices - * - add already-in_sync devices if the array itself - * is in-sync. - * As we only add devices that are already in-sync, - * we can activate the spares immediately. - */ - remove_and_add_spares(mddev, NULL); - /* There is no thread, but we need to call + + /* + * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); + + /* + * Let md_start_sync() to remove and add rdevs to the + * array. + */ + if (md_spares_need_change(mddev)) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + queue_work(md_misc_wq, &mddev->sync_work); + } + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + goto unlock; } @@ -9419,16 +9648,7 @@ void md_check_recovery(struct mddev *mddev) * still set. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - - if (WARN_ON_ONCE(!mddev->sync_thread)) - goto unlock; - - md_reap_sync_thread(mddev); + unregister_sync_thread(mddev); goto unlock; } @@ -9445,56 +9665,14 @@ void md_check_recovery(struct mddev *mddev) clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto not_running; - /* no recovery is running. - * remove any failed drives, then - * add spares if possible. - * Spares are also removed and re-added, to allow - * the personality to fail the re-add. 
- */ - - if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev) != 0) - /* Cannot proceed */ - goto not_running; - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev, NULL))) { - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - /* nothing to be done ... */ - goto not_running; - - if (mddev->pers->sync_request) { - if (spares) { - /* We are adding a device or devices to an array - * which has the bitmap stored on all devices. - * So make sure all bitmap pages get written - */ - md_bitmap_write_all(mddev->bitmap); - } - INIT_WORK(&mddev->del_work, md_start_sync); - queue_work(md_misc_wq, &mddev->del_work); - goto unlock; - } - not_running: - if (!mddev->sync_thread) { + if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + queue_work(md_misc_wq, &mddev->sync_work); + } else { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); } + unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c9c13abd7..ade83af123 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -59,6 +59,7 @@ struct md_rdev { */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ + struct bdev_handle *bdev_handle; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; @@ -211,9 +212,6 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ - Holder, /* rdev is used as holder while opening - * underlying disk exclusively. - */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -248,10 +246,6 @@ struct md_cluster_info; * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. - * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata - * without taking reconfig_mutex. - * @MD_UPDATING_SB: md_check_recovery is updating the metadata without - * explicitly holding reconfig_mutex. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. 
@@ -268,8 +262,6 @@ enum mddev_flags { MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, - MD_ALLOW_SB_UPDATE, - MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, MD_DELETED, @@ -316,6 +308,7 @@ struct mddev { unsigned long sb_flags; int suspended; + struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes @@ -453,7 +446,10 @@ struct mddev { struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_level; /*handle for 'level' */ - struct work_struct del_work; /* used for delayed sysfs removal */ + /* used for delayed sysfs removal */ + struct work_struct del_work; + /* used for register new sync thread */ + struct work_struct sync_work; /* "lock" protects: * flush_bio transition from NULL to !NULL @@ -567,23 +563,6 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static inline bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - -static inline bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} - static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -643,7 +622,6 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); - void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -768,7 +746,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); -extern int mddev_init_writes_pending(struct mddev *mddev); extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); @@ -795,7 +772,8 @@ extern int md_integrity_register(struct mddev *mddev); extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); -extern void mddev_init(struct mddev *mddev); +extern int mddev_init(struct mddev *mddev); +extern void mddev_destroy(struct mddev *mddev); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); @@ -806,15 +784,14 @@ extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); -extern void mddev_suspend(struct mddev *mddev); +extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); -extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); -extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); +extern void mddev_destroy_serial_pool(struct mddev *mddev, + struct md_rdev *rdev); struct md_rdev 
*md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); @@ -852,6 +829,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio mddev->queue->limits.max_write_zeroes_sectors = 0; } +static inline int mddev_suspend_and_lock(struct mddev *mddev) +{ + int ret; + + ret = mddev_suspend(mddev, true); + if (ret) + return ret; + + ret = mddev_lock(mddev); + if (ret) + mddev_resume(mddev); + + return ret; +} + +static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) +{ + mddev_suspend(mddev, false); + mutex_lock(&mddev->reconfig_mutex); +} + +static inline void mddev_unlock_and_resume(struct mddev *mddev) +{ + mddev_unlock(mddev); + mddev_resume(mddev); +} + struct mdu_array_info_s; struct mdu_disk_info_s; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 911670273d..e138922d51 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; bool write_behind = false; + bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * write-mostly, which means we could allocate write behind * bio later. */ - if (rdev && test_bit(WriteMostly, &rdev->flags)) + if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { @@ -3122,8 +3123,7 @@ static int raid1_run(struct mddev *mddev) mdname(mddev)); return -EIO; } - if (mddev_init_writes_pending(mddev) < 0) - return -ENOMEM; + /* * copy the already verified devices into our private RAID1 * bookkeeping area. 
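
A sketch of the intended call pattern for the three helpers above, matching how the sysfs store methods and the ioctl path were converted earlier in this patch; the store method itself is a placeholder name.

static ssize_t example_store(struct mddev *mddev, const char *buf, size_t len)
{
        int err = mddev_suspend_and_lock(mddev);        /* suspend, then lock */

        if (err)
                return err;

        /* ... reconfigure while IO is quiesced and reconfig_mutex is held ... */

        mddev_unlock_and_resume(mddev);                 /* unlock, then resume */
        return len;
}
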
[whatever we allocate in run(), diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0234131208..b7b0a573e7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev) sector_t min_offset_diff = 0; int first = 1; - if (mddev_init_writes_pending(mddev) < 0) - return -ENOMEM; - if (mddev->private == NULL) { conf = setup_conf(mddev); if (IS_ERR(conf)) @@ -4310,11 +4307,7 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) - goto out_free_conf; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } return 0; @@ -4710,16 +4703,8 @@ out: clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) { - ret = -EAGAIN; - goto abort; - } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); conf->reshape_checkpoint = jiffies; - md_wakeup_thread(mddev->sync_thread); md_new_event(); return 0; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 518b7cfa78..6157f5beb9 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space); void r5c_check_stripe_cache_usage(struct r5conf *conf) { int total_cached; + struct r5l_log *log = READ_ONCE(conf->log); - if (!r5c_is_writeback(conf->log)) + if (!r5c_is_writeback(log)) return; total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + @@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ if (total_cached > conf->min_nr_stripes * 1 / 2 || atomic_read(&conf->empty_inactive_list_nr) > 0) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ void r5c_check_cached_full_stripe(struct r5conf *conf) { - if (!r5c_is_writeback(conf->log)) + struct r5l_log *log = READ_ONCE(conf->log); + + if (!r5c_is_writeback(log)) return; /* @@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!r5c_is_writeback(log)) return 0; @@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log) void r5c_make_stripe_write_out(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); BUG_ON(!r5c_is_writeback(log)); @@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh) */ static void r5c_finish_cache_stripe(struct stripe_head *sh) { - struct r5l_log *log = sh->raid_conf->log; + struct r5l_log *log = READ_ONCE(sh->raid_conf->log); if (log->r5c_journal_mode == 
@@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh)
  */
 static void r5c_finish_cache_stripe(struct stripe_head *sh)
 {
-    struct r5l_log *log = sh->raid_conf->log;
+    struct r5l_log *log = READ_ONCE(sh->raid_conf->log);
     if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
         BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
@@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
                        disable_writeback_work);
     struct mddev *mddev = log->rdev->mddev;
     struct r5conf *conf = mddev->private;
-    int locked = 0;
     if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
         return;
@@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work)
     /* wait superblock change before suspend */
     wait_event(mddev->sb_wait,
-           conf->log == NULL ||
-           (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
-            (locked = mddev_trylock(mddev))));
-    if (locked) {
-        mddev_suspend(mddev);
+           !READ_ONCE(conf->log) ||
+           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
+    log = READ_ONCE(conf->log);
+    if (log) {
+        mddev_suspend(mddev, false);
         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
         mddev_resume(mddev);
-        mddev_unlock(mddev);
     }
 }
@@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
 {
     struct stripe_head *sh;
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     sector_t new_cp;
     unsigned long flags;
@@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
         return log->next_checkpoint;
     spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
-    if (list_empty(&conf->log->stripe_in_journal_list)) {
+    if (list_empty(&log->stripe_in_journal_list)) {
         /* all stripes flushed */
         spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
         return log->next_checkpoint;
     }
-    sh = list_first_entry(&conf->log->stripe_in_journal_list,
+    sh = list_first_entry(&log->stripe_in_journal_list,
                   struct stripe_head, r5c);
     new_cp = sh->log_start;
     spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
     struct stripe_head *sh, *next;
     lockdep_assert_held(&conf->device_lock);
-    if (!conf->log)
+    if (!READ_ONCE(conf->log))
         return;
     count = 0;
@@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
 static void r5c_do_reclaim(struct r5conf *conf)
 {
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     struct stripe_head *sh;
     int count = 0;
     unsigned long flags;
@@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 {
     struct mddev *mddev = thread->mddev;
     struct r5conf *conf = mddev->private;
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     if (!log)
         return;
@@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
 bool r5l_log_disk_error(struct r5conf *conf)
 {
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     /* don't allow write if journal disk is missing */
     if (!log)
@@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode)
         mode == R5C_JOURNAL_MODE_WRITE_BACK)
         return -EINVAL;
-    mddev_suspend(mddev);
     conf->log->r5c_journal_mode = mode;
-    mddev_resume(mddev);
     pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
          mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
         if (strlen(r5c_journal_mode_str[mode]) == len &&
             !strncmp(page, r5c_journal_mode_str[mode], len))
             break;
-    ret = mddev_lock(mddev);
+    ret = mddev_suspend_and_lock(mddev);
     if (ret)
         return ret;
     ret = r5c_journal_mode_set(mddev, mode);
-    mddev_unlock(mddev);
+    mddev_unlock_and_resume(mddev);
     return ret ?: length;
 }
@@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf,
               struct stripe_head_state *s,
               int disks)
 {
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     int i;
     struct r5dev *dev;
     int to_cache = 0;
@@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
                  struct stripe_head *sh,
                  struct stripe_head_state *s)
 {
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     int i;
     int do_wakeup = 0;
     sector_t tree_index;
@@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
 /* check whether this big stripe is in write back cache. */
 bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
 {
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     sector_t tree_index;
     void *slot;
@@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log)
 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
     struct r5conf *conf = mddev->private;
-    struct r5l_log *log = conf->log;
+    struct r5l_log *log = READ_ONCE(conf->log);
     if (!log)
         return;
     if ((raid5_calc_degraded(conf) > 0 ||
          test_bit(Journal, &rdev->flags)) &&
-        conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+        log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
         schedule_work(&log->disable_writeback_work);
 }
@@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
     spin_lock_init(&log->stripe_in_journal_lock);
     atomic_set(&log->stripe_in_journal_count, 0);
-    conf->log = log;
+    WRITE_ONCE(conf->log, log);
     set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
     return 0;
@@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf)
      * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
      * ensure disable_writeback_work wakes up and exits.
      */
-    conf->log = NULL;
+    WRITE_ONCE(conf->log, NULL);
     wake_up(&conf->mddev->sb_wait);
     flush_work(&log->disable_writeback_work);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 68d86dbecb..6fe334bb95 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely,
          "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
+static void raid5_quiesce(struct mddev *mddev, int quiesce);
+
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
     int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
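The hunk above adds a forward declaration of raid5_quiesce() so that resize_chunks() in the next hunk can quiesce the array itself instead of calling mddev_suspend()/mddev_resume(), which is what the removed comment warned could deadlock when invoked from raid5d. A condensed sketch of the resulting shape of the function, assuming only what the hunk itself shows (the per-CPU buffer reallocation is elided and the _sketch suffix marks it as not the literal source):

/* Condensed sketch of resize_chunks() after this change; reallocation elided. */
static int resize_chunks_sketch(struct r5conf *conf, int new_disks, int new_sectors)
{
    int err = 0;

    /* Never shrink. */
    if (conf->scribble_disks >= new_disks &&
        conf->scribble_sectors >= new_sectors)
        return 0;

    raid5_quiesce(conf->mddev, true);    /* was: mddev_suspend(conf->mddev) */

    /* ... reallocate the scribble buffers for each present CPU under cpus_read_lock() ... */

    raid5_quiesce(conf->mddev, false);   /* was: mddev_resume(conf->mddev) */

    if (!err) {
        conf->scribble_disks = new_disks;
        conf->scribble_sectors = new_sectors;
    }
    return err;
}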
@@ -2499,15 +2501,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
     unsigned long cpu;
     int err = 0;
-    /*
-     * Never shrink. And mddev_suspend() could deadlock if this is called
-     * from raid5d. In that case, scribble_disks and scribble_sectors
-     * should equal to new_disks and new_sectors
-     */
+    /* Never shrink. */
     if (conf->scribble_disks >= new_disks &&
         conf->scribble_sectors >= new_sectors)
         return 0;
-    mddev_suspend(conf->mddev);
+
+    raid5_quiesce(conf->mddev, true);
     cpus_read_lock();
     for_each_present_cpu(cpu) {
@@ -2521,7 +2520,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
     }
     cpus_read_unlock();
-    mddev_resume(conf->mddev);
+    raid5_quiesce(conf->mddev, false);
+
     if (!err) {
         conf->scribble_disks = new_disks;
         conf->scribble_sectors = new_sectors;
@@ -5960,19 +5960,6 @@ out:
     return ret;
 }
-static bool reshape_inprogress(struct mddev *mddev)
-{
-    return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
-        !test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
-        !test_bit(MD_RECOVERY_INTR, &mddev->recovery);
-}
-
-static bool reshape_disabled(struct mddev *mddev)
-{
-    return is_md_suspended(mddev) || !md_is_rdwr(mddev);
-}
-
 static enum stripe_result make_stripe_request(struct mddev *mddev,
         struct r5conf *conf, struct stripe_request_ctx *ctx,
         sector_t logical_sector, struct bio *bi)
@@ -6004,8 +5991,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
         if (ahead_of_reshape(mddev, logical_sector,
                      conf->reshape_safe)) {
             spin_unlock_irq(&conf->device_lock);
-            ret = STRIPE_SCHEDULE_AND_RETRY;
-            goto out;
+            return STRIPE_SCHEDULE_AND_RETRY;
         }
     }
     spin_unlock_irq(&conf->device_lock);
@@ -6084,15 +6070,6 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 out_release:
     raid5_release_stripe(sh);
-out:
-    if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) &&
-        reshape_disabled(mddev)) {
-        bi->bi_status = BLK_STS_IOERR;
-        ret = STRIPE_FAIL;
-        pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n",
-            mdname(mddev));
-    }
-
     return ret;
 }
@@ -7032,7 +7009,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
         new != roundup_pow_of_two(new))
         return -EINVAL;
-    err = mddev_lock(mddev);
+    err = mddev_suspend_and_lock(mddev);
     if (err)
         return err;
@@ -7056,7 +7033,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
         goto out_unlock;
     }
-    mddev_suspend(mddev);
     mutex_lock(&conf->cache_size_mutex);
     size = conf->max_nr_stripes;
@@ -7071,10 +7047,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
             err = -ENOMEM;
     }
     mutex_unlock(&conf->cache_size_mutex);
-    mddev_resume(mddev);
 out_unlock:
-    mddev_unlock(mddev);
+    mddev_unlock_and_resume(mddev);
     return err ?: len;
 }
@@ -7160,7 +7135,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
         return -EINVAL;
     new = !!new;
-    err = mddev_lock(mddev);
+    err = mddev_suspend_and_lock(mddev);
     if (err)
         return err;
     conf = mddev->private;
@@ -7169,15 +7144,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
     else if (new != conf->skip_copy) {
         struct request_queue *q = mddev->queue;
-        mddev_suspend(mddev);
         conf->skip_copy = new;
         if (new)
             blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
         else
             blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
-        mddev_resume(mddev);
     }
-    mddev_unlock(mddev);
+    mddev_unlock_and_resume(mddev);
     return err ?: len;
 }
@@ -7232,15 +7205,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
     if (new > 8192)
         return -EINVAL;
-    err = mddev_lock(mddev);
+    err = mddev_suspend_and_lock(mddev);
     if (err)
         return err;
     conf = mddev->private;
     if (!conf)
         err = -ENODEV;
     else if (new != conf->worker_cnt_per_group) {
-        mddev_suspend(mddev);
-
         old_groups = conf->worker_groups;
         if (old_groups)
             flush_workqueue(raid5_wq);
@@ -7257,9 +7228,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
             kfree(old_groups[0].workers);
             kfree(old_groups);
         }
-        mddev_resume(mddev);
     }
-    mddev_unlock(mddev);
+    mddev_unlock_and_resume(mddev);
     return err ?: len;
 }
@@ -7408,7 +7378,7 @@ static void free_conf(struct r5conf *conf)
     log_exit(conf);
-    unregister_shrinker(&conf->shrinker);
+    shrinker_free(conf->shrinker);
     free_thread_groups(conf);
     shrink_stripes(conf);
     raid5_free_percpu(conf);
@@ -7456,7 +7426,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 static unsigned long raid5_cache_scan(struct shrinker *shrink,
                       struct shrink_control *sc)
 {
-    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+    struct r5conf *conf = shrink->private_data;
     unsigned long ret = SHRINK_STOP;
     if (mutex_trylock(&conf->cache_size_mutex)) {
@@ -7477,7 +7447,7 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
 static unsigned long raid5_cache_count(struct shrinker *shrink,
                        struct shrink_control *sc)
 {
-    struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+    struct r5conf *conf = shrink->private_data;
     if (conf->max_nr_stripes < conf->min_nr_stripes)
         /* unlikely, but not impossible */
@@ -7712,18 +7682,22 @@ static struct r5conf *setup_conf(struct mddev *mddev)
      * it reduces the queue depth and so can hurt throughput.
      * So set it rather large, scaled by number of devices.
      */
-    conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
-    conf->shrinker.scan_objects = raid5_cache_scan;
-    conf->shrinker.count_objects = raid5_cache_count;
-    conf->shrinker.batch = 128;
-    conf->shrinker.flags = 0;
-    ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
-    if (ret) {
-        pr_warn("md/raid:%s: couldn't register shrinker.\n",
+    conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev));
+    if (!conf->shrinker) {
+        ret = -ENOMEM;
+        pr_warn("md/raid:%s: couldn't allocate shrinker.\n",
             mdname(mddev));
         goto abort;
     }
+    conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+    conf->shrinker->scan_objects = raid5_cache_scan;
+    conf->shrinker->count_objects = raid5_cache_count;
+    conf->shrinker->batch = 128;
+    conf->shrinker->private_data = conf;
+
+    shrinker_register(conf->shrinker);
+
     sprintf(pers_name, "raid%d", mddev->new_level);
     rcu_assign_pointer(conf->thread,
                md_register_thread(raid5d, mddev, pers_name));
@@ -7785,9 +7759,6 @@ static int raid5_run(struct mddev *mddev)
     long long min_offset_diff = 0;
     int first = 1;
-    if (mddev_init_writes_pending(mddev) < 0)
-        return -ENOMEM;
-
     if (mddev->recovery_cp != MaxSector)
         pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
               mdname(mddev));
@@ -8031,11 +8002,7 @@ static int raid5_run(struct mddev *mddev)
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-        rcu_assign_pointer(mddev->sync_thread,
-            md_register_thread(md_do_sync, mddev, "reshape"));
-        if (!mddev->sync_thread)
-            goto abort;
+        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
     }
     /* Ok, everything is just fine now */
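A few hunks up, setup_conf() and free_conf() are converted from the embedded struct shrinker (register_shrinker()/unregister_shrinker()) to the allocated API: shrinker_alloc() returns a struct shrinker *, the callbacks recover conf from shrink->private_data instead of container_of(), shrinker_register() makes the shrinker visible, and shrinker_free() replaces unregister_shrinker() on teardown. Reduced to a skeleton under the assumption of a made-up wrapper name (demo_setup_shrinker() is not a function in this patch):

/* Skeleton of the shrinker setup used by setup_conf() above; demo name only. */
static int demo_setup_shrinker(struct r5conf *conf, struct mddev *mddev)
{
    conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev));
    if (!conf->shrinker)
        return -ENOMEM;

    conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
    conf->shrinker->scan_objects = raid5_cache_scan;
    conf->shrinker->count_objects = raid5_cache_count;
    conf->shrinker->batch = 128;
    conf->shrinker->private_data = conf;    /* read back via shrink->private_data */

    shrinker_register(conf->shrinker);
    return 0;
}

/* Teardown mirrors free_conf(): shrinker_free(conf->shrinker); */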
@@ -8568,8 +8535,8 @@ static int raid5_start_reshape(struct mddev *mddev)
      * the reshape wasn't running - like Discard or Read - have
      * completed.
      */
-    mddev_suspend(mddev);
-    mddev_resume(mddev);
+    raid5_quiesce(mddev, true);
+    raid5_quiesce(mddev, false);
     /* Add some new drives, as many as will fit.
      * We know there are enough to make the newly sized array work.
@@ -8614,29 +8581,8 @@ static int raid5_start_reshape(struct mddev *mddev)
     clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
     clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
     set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-    set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-    rcu_assign_pointer(mddev->sync_thread,
-        md_register_thread(md_do_sync, mddev, "reshape"));
-    if (!mddev->sync_thread) {
-        mddev->recovery = 0;
-        spin_lock_irq(&conf->device_lock);
-        write_seqcount_begin(&conf->gen_lock);
-        mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-        mddev->new_chunk_sectors =
-            conf->chunk_sectors = conf->prev_chunk_sectors;
-        mddev->new_layout = conf->algorithm = conf->prev_algo;
-        rdev_for_each(rdev, mddev)
-            rdev->new_data_offset = rdev->data_offset;
-        smp_wmb();
-        conf->generation --;
-        conf->reshape_progress = MaxSector;
-        mddev->reshape_position = MaxSector;
-        write_seqcount_end(&conf->gen_lock);
-        spin_unlock_irq(&conf->device_lock);
-        return -EAGAIN;
-    }
+    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
     conf->reshape_checkpoint = jiffies;
-    md_wakeup_thread(mddev->sync_thread);
     md_new_event();
     return 0;
 }
@@ -8984,12 +8930,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
     struct r5conf *conf;
     int err;
-    err = mddev_lock(mddev);
+    err = mddev_suspend_and_lock(mddev);
     if (err)
         return err;
     conf = mddev->private;
     if (!conf) {
-        mddev_unlock(mddev);
+        mddev_unlock_and_resume(mddev);
         return -ENODEV;
     }
@@ -8999,19 +8945,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
             err = log_init(conf, NULL, true);
             if (!err) {
                 err = resize_stripes(conf, conf->pool_size);
-                if (err) {
-                    mddev_suspend(mddev);
+                if (err)
                     log_exit(conf);
-                    mddev_resume(mddev);
-                }
             }
         } else
             err = -EINVAL;
     } else if (strncmp(buf, "resync", 6) == 0) {
         if (raid5_has_ppl(conf)) {
-            mddev_suspend(mddev);
             log_exit(conf);
-            mddev_resume(mddev);
             err = resize_stripes(conf, conf->pool_size);
         } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
                r5l_log_disk_error(conf)) {
@@ -9024,11 +8965,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
                     break;
                 }
-            if (!journal_dev_exists) {
-                mddev_suspend(mddev);
+            if (!journal_dev_exists)
                 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
-                mddev_resume(mddev);
-            } else /* need remove journal device first */
+            else /* need remove journal device first */
                 err = -EBUSY;
         } else
             err = -EINVAL;
@@ -9039,7 +8978,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
     if (!err)
         md_update_sb(mddev, 1);
-    mddev_unlock(mddev);
+    mddev_unlock_and_resume(mddev);
     return err;
 }
@@ -9051,22 +8990,6 @@ static int raid5_start(struct mddev *mddev)
     return r5l_start(conf->log);
 }
-static void raid5_prepare_suspend(struct mddev *mddev)
-{
-    struct r5conf *conf = mddev->private;
-
-    wait_event(mddev->sb_wait, !reshape_inprogress(mddev) ||
-            percpu_ref_is_zero(&mddev->active_io));
-    if (percpu_ref_is_zero(&mddev->active_io))
-        return;
-
-    /*
-     * Reshape is not in progress, and array is suspended, io that is
-     * waiting for reshpape can never be done.
-     */
-    wake_up(&conf->wait_for_overlap);
-}
-
 static struct md_personality raid6_personality =
 {
     .name = "raid6",
@@ -9087,7 +9010,6 @@ static struct md_personality raid6_personality =
     .check_reshape = raid6_check_reshape,
     .start_reshape = raid5_start_reshape,
     .finish_reshape = raid5_finish_reshape,
-    .prepare_suspend = raid5_prepare_suspend,
     .quiesce = raid5_quiesce,
     .takeover = raid6_takeover,
     .change_consistency_policy = raid5_change_consistency_policy,
@@ -9112,7 +9034,6 @@ static struct md_personality raid5_personality =
     .check_reshape = raid5_check_reshape,
     .start_reshape = raid5_start_reshape,
     .finish_reshape = raid5_finish_reshape,
-    .prepare_suspend = raid5_prepare_suspend,
     .quiesce = raid5_quiesce,
     .takeover = raid5_takeover,
     .change_consistency_policy = raid5_change_consistency_policy,
@@ -9138,7 +9059,6 @@ static struct md_personality raid4_personality =
     .check_reshape = raid5_check_reshape,
     .start_reshape = raid5_start_reshape,
     .finish_reshape = raid5_finish_reshape,
-    .prepare_suspend = raid5_prepare_suspend,
     .quiesce = raid5_quiesce,
     .takeover = raid4_takeover,
     .change_consistency_policy = raid5_change_consistency_policy,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 97a795979a..22bea20ecc 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -670,7 +670,7 @@ struct r5conf {
     wait_queue_head_t wait_for_stripe;
     wait_queue_head_t wait_for_overlap;
     unsigned long cache_state;
-    struct shrinker shrinker;
+    struct shrinker *shrinker;
     int pool_size; /* number of disks in stripeheads in pool */
     spinlock_t device_lock;
     struct disk_info *disks;
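One theme runs through raid1_run(), raid10_run(), raid5_run() and the two start_reshape() paths in this diff: instead of setting MD_RECOVERY_RUNNING and registering mddev->sync_thread inline (with a long unwind branch when md_register_thread() failed), the personalities now only set MD_RECOVERY_NEEDED and leave thread creation to the md core's recovery handling. A condensed illustration of the new call-site shape, with a made-up function name (demo_request_reshape() is not a literal excerpt from the patch):

/* Condensed form of the reshape kick-off after this series; demo name only. */
static void demo_request_reshape(struct mddev *mddev)
{
    clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
    clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
    clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
    set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
    /* Previously: set_bit(MD_RECOVERY_RUNNING, ...) plus md_register_thread(md_do_sync, ...). */
    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

    /* The md core's recovery path (md_check_recovery()) sees MD_RECOVERY_NEEDED and
     * starts the sync thread, so the explicit md_wakeup_thread(mddev->sync_thread)
     * that used to follow here is gone as well. */
    md_new_event();
}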