summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:40:19 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:40:19 +0000
commit9f0fc191371843c4fc000a226b0a26b6c059aacd (patch)
tree35f8be3ef04506ac891ad001e8c41e535ae8d01d /drivers/md
parentReleasing progress-linux version 6.6.15-2~progress7.99u1. (diff)
downloadlinux-9f0fc191371843c4fc000a226b0a26b6c059aacd.tar.xz
linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.zip
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/Kconfig10
-rw-r--r--drivers/md/bcache/Makefile4
-rw-r--r--drivers/md/bcache/bcache.h6
-rw-r--r--drivers/md/bcache/btree.c41
-rw-r--r--drivers/md/bcache/closure.c207
-rw-r--r--drivers/md/bcache/closure.h378
-rw-r--r--drivers/md/bcache/journal.c20
-rw-r--r--drivers/md/bcache/movinggc.c16
-rw-r--r--drivers/md/bcache/request.c74
-rw-r--r--drivers/md/bcache/request.h2
-rw-r--r--drivers/md/bcache/super.c136
-rw-r--r--drivers/md/bcache/sysfs.c3
-rw-r--r--drivers/md/bcache/util.h3
-rw-r--r--drivers/md/bcache/writeback.c16
-rw-r--r--drivers/md/dm-bio-prison-v1.c2
-rw-r--r--drivers/md/dm-bufio.c28
-rw-r--r--drivers/md/dm-cache-metadata.c8
-rw-r--r--drivers/md/dm-core.h2
-rw-r--r--drivers/md/dm-crypt.c144
-rw-r--r--drivers/md/dm-delay.c102
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-integrity.c121
-rw-r--r--drivers/md/dm-ioctl.c7
-rw-r--r--drivers/md/dm-linear.c2
-rw-r--r--drivers/md/dm-log-userspace-base.c2
-rw-r--r--drivers/md/dm-raid.c22
-rw-r--r--drivers/md/dm-stats.c2
-rw-r--r--drivers/md/dm-stripe.c4
-rw-r--r--drivers/md/dm-table.c32
-rw-r--r--drivers/md/dm-target.c106
-rw-r--r--drivers/md/dm-verity-target.c112
-rw-r--r--drivers/md/dm-verity.h7
-rw-r--r--drivers/md/dm-zoned-metadata.c28
-rw-r--r--drivers/md/dm.c141
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/md-autodetect.c4
-rw-r--r--drivers/md/md-bitmap.c61
-rw-r--r--drivers/md/md-cluster.c15
-rw-r--r--drivers/md/md-linear.c28
-rw-r--r--drivers/md/md-linear.h2
-rw-r--r--drivers/md/md.c1104
-rw-r--r--drivers/md/md.h74
-rw-r--r--drivers/md/raid1.c6
-rw-r--r--drivers/md/raid10.c19
-rw-r--r--drivers/md/raid5-cache.c64
-rw-r--r--drivers/md/raid5.c158
-rw-r--r--drivers/md/raid5.h2
47 files changed, 1599 insertions, 1730 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e..b2d10063d 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
tristate "Block device as cache"
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
+ select CLOSURES
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
@@ -19,15 +20,6 @@ config BCACHE_DEBUG
Enables extra debugging tools, allows expensive runtime checks to be
turned on.
-config BCACHE_CLOSURES_DEBUG
- bool "Debug closures"
- depends on BCACHE
- select DEBUG_FS
- help
- Keeps all active closures in a linked list and provides a debugfs
- interface to list them, which makes it possible to see asynchronous
- operations that get stuck.
-
config BCACHE_ASYNC_REGISTRATION
bool "Asynchronous device registration"
depends on BCACHE
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e5967..054e8a33a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,6 +2,6 @@
obj-$(CONFIG_BCACHE) += bcache.o
-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\
+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
util.o writeback.o features.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 83eb7f27d..6ae232905 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -179,6 +179,7 @@
#define pr_fmt(fmt) "bcache: %s() " fmt, __func__
#include <linux/bio.h>
+#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -192,7 +193,6 @@
#include "bcache_ondisk.h"
#include "bset.h"
#include "util.h"
-#include "closure.h"
struct bucket {
atomic_t pin;
@@ -300,6 +300,7 @@ struct cached_dev {
struct list_head list;
struct bcache_device disk;
struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct cache_sb sb;
struct cache_sb_disk *sb_disk;
@@ -422,6 +423,7 @@ struct cache {
struct kobject kobj;
struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct task_struct *alloc_thread;
@@ -542,7 +544,7 @@ struct cache_set {
struct bio_set bio_split;
/* For the btree cache */
- struct shrinker shrink;
+ struct shrinker *shrink;
/* For the btree cache and anything allocation related */
struct mutex bucket_lock;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index b709c2fde..196cdacce 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
w->journal = NULL;
}
-static void btree_node_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_unlock)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
up(&b->io_mutex);
}
-static void __btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(__btree_node_write_done)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
struct btree_write *w = btree_prev_write(b);
bch_bbio_free(b->bio, b->c);
@@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl)
closure_return_with_destructor(cl, btree_node_write_unlock);
}
-static void btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_done)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
bio_free_pages(b->bio);
- __btree_node_write_done(cl);
+ __btree_node_write_done(&cl->work);
}
static void btree_node_write_endio(struct bio *bio)
@@ -667,7 +667,7 @@ out_unlock:
static unsigned long bch_mca_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct cache_set *c = shrink->private_data;
struct btree *b, *t;
unsigned long i, nr = sc->nr_to_scan;
unsigned long freed = 0;
@@ -734,7 +734,7 @@ out:
static unsigned long bch_mca_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct cache_set *c = shrink->private_data;
if (c->shrinker_disabled)
return 0;
@@ -752,8 +752,8 @@ void bch_btree_cache_free(struct cache_set *c)
closure_init_stack(&cl);
- if (c->shrink.list.next)
- unregister_shrinker(&c->shrink);
+ if (c->shrink)
+ shrinker_free(c->shrink);
mutex_lock(&c->bucket_lock);
@@ -828,14 +828,19 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->verify_data = NULL;
#endif
- c->shrink.count_objects = bch_mca_count;
- c->shrink.scan_objects = bch_mca_scan;
- c->shrink.seeks = 4;
- c->shrink.batch = c->btree_pages * 2;
+ c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid);
+ if (!c->shrink) {
+ pr_warn("bcache: %s: could not allocate shrinker\n", __func__);
+ return 0;
+ }
+
+ c->shrink->count_objects = bch_mca_count;
+ c->shrink->scan_objects = bch_mca_scan;
+ c->shrink->seeks = 4;
+ c->shrink->batch = c->btree_pages * 2;
+ c->shrink->private_data = c;
- if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
- pr_warn("bcache: %s: could not register shrinker\n",
- __func__);
+ shrinker_register(c->shrink);
return 0;
}
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
deleted file mode 100644
index d8d9394a6..000000000
--- a/drivers/md/bcache/closure.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Asynchronous refcounty things
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/sched/debug.h>
-
-#include "closure.h"
-
-static inline void closure_put_after_sub(struct closure *cl, int flags)
-{
- int r = flags & CLOSURE_REMAINING_MASK;
-
- BUG_ON(flags & CLOSURE_GUARD_MASK);
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
-
- if (!r) {
- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
- atomic_set(&cl->remaining,
- CLOSURE_REMAINING_INITIALIZER);
- closure_queue(cl);
- } else {
- struct closure *parent = cl->parent;
- closure_fn *destructor = cl->fn;
-
- closure_debug_destroy(cl);
-
- if (destructor)
- destructor(cl);
-
- if (parent)
- closure_put(parent);
- }
- }
-}
-
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
-}
-
-/*
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
-}
-
-/*
- * closure_wake_up - wake up all closures on a wait list, without memory barrier
- */
-void __closure_wake_up(struct closure_waitlist *wait_list)
-{
- struct llist_node *list;
- struct closure *cl, *t;
- struct llist_node *reverse = NULL;
-
- list = llist_del_all(&wait_list->list);
-
- /* We first reverse the list to preserve FIFO ordering and fairness */
- reverse = llist_reverse_order(list);
-
- /* Then do the wakeups */
- llist_for_each_entry_safe(cl, t, reverse, list) {
- closure_set_waiting(cl, 0);
- closure_sub(cl, CLOSURE_WAITING + 1);
- }
-}
-
-/**
- * closure_wait - add a closure to a waitlist
- * @waitlist: will own a ref on @cl, which will be released when
- * closure_wake_up() is called on @waitlist.
- * @cl: closure pointer.
- *
- */
-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
-{
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- return false;
-
- closure_set_waiting(cl, _RET_IP_);
- atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
- llist_add(&cl->list, &waitlist->list);
-
- return true;
-}
-
-struct closure_syncer {
- struct task_struct *task;
- int done;
-};
-
-static void closure_sync_fn(struct closure *cl)
-{
- struct closure_syncer *s = cl->s;
- struct task_struct *p;
-
- rcu_read_lock();
- p = READ_ONCE(s->task);
- s->done = 1;
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-void __sched __closure_sync(struct closure *cl)
-{
- struct closure_syncer s = { .task = current };
-
- cl->s = &s;
- continue_at(cl, closure_sync_fn, NULL);
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (s.done)
- break;
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-static LIST_HEAD(closure_list);
-static DEFINE_SPINLOCK(closure_list_lock);
-
-void closure_debug_create(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_ALIVE;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_add(&cl->all, &closure_list);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-void closure_debug_destroy(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_DEAD;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_del(&cl->all);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-static struct dentry *closure_debug;
-
-static int debug_show(struct seq_file *f, void *data)
-{
- struct closure *cl;
-
- spin_lock_irq(&closure_list_lock);
-
- list_for_each_entry(cl, &closure_list, all) {
- int r = atomic_read(&cl->remaining);
-
- seq_printf(f, "%p: %pS -> %pS p %p r %i ",
- cl, (void *) cl->ip, cl->fn, cl->parent,
- r & CLOSURE_REMAINING_MASK);
-
- seq_printf(f, "%s%s\n",
- test_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "");
-
- if (r & CLOSURE_WAITING)
- seq_printf(f, " W %pS\n",
- (void *) cl->waiting_on);
-
- seq_printf(f, "\n");
- }
-
- spin_unlock_irq(&closure_list_lock);
- return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(debug);
-
-void __init closure_debug_init(void)
-{
- if (!IS_ERR_OR_NULL(bcache_debug))
- /*
- * it is unnecessary to check return value of
- * debugfs_create_file(), we should not care
- * about this.
- */
- closure_debug = debugfs_create_file(
- "closures", 0400, bcache_debug, NULL, &debug_fops);
-}
-#endif
-
-MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
deleted file mode 100644
index c88cdc4ae..000000000
--- a/drivers/md/bcache/closure.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CLOSURE_H
-#define _LINUX_CLOSURE_H
-
-#include <linux/llist.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/workqueue.h>
-
-/*
- * Closure is perhaps the most overused and abused term in computer science, but
- * since I've been unable to come up with anything better you're stuck with it
- * again.
- *
- * What are closures?
- *
- * They embed a refcount. The basic idea is they count "things that are in
- * progress" - in flight bios, some other thread that's doing something else -
- * anything you might want to wait on.
- *
- * The refcount may be manipulated with closure_get() and closure_put().
- * closure_put() is where many of the interesting things happen, when it causes
- * the refcount to go to 0.
- *
- * Closures can be used to wait on things both synchronously and asynchronously,
- * and synchronous and asynchronous use can be mixed without restriction. To
- * wait synchronously, use closure_sync() - you will sleep until your closure's
- * refcount hits 1.
- *
- * To wait asynchronously, use
- * continue_at(cl, next_function, workqueue);
- *
- * passing it, as you might expect, the function to run when nothing is pending
- * and the workqueue to run that function out of.
- *
- * continue_at() also, critically, requires a 'return' immediately following the
- * location where this macro is referenced, to return to the calling function.
- * There's good reason for this.
- *
- * To use safely closures asynchronously, they must always have a refcount while
- * they are running owned by the thread that is running them. Otherwise, suppose
- * you submit some bios and wish to have a function run when they all complete:
- *
- * foo_endio(struct bio *bio)
- * {
- * closure_put(cl);
- * }
- *
- * closure_init(cl);
- *
- * do_stuff();
- * closure_get(cl);
- * bio1->bi_endio = foo_endio;
- * bio_submit(bio1);
- *
- * do_more_stuff();
- * closure_get(cl);
- * bio2->bi_endio = foo_endio;
- * bio_submit(bio2);
- *
- * continue_at(cl, complete_some_read, system_wq);
- *
- * If closure's refcount started at 0, complete_some_read() could run before the
- * second bio was submitted - which is almost always not what you want! More
- * importantly, it wouldn't be possible to say whether the original thread or
- * complete_some_read()'s thread owned the closure - and whatever state it was
- * associated with!
- *
- * So, closure_init() initializes a closure's refcount to 1 - and when a
- * closure_fn is run, the refcount will be reset to 1 first.
- *
- * Then, the rule is - if you got the refcount with closure_get(), release it
- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
- * on a closure because you called closure_init() or you were run out of a
- * closure - _always_ use continue_at(). Doing so consistently will help
- * eliminate an entire class of particularly pernicious races.
- *
- * Lastly, you might have a wait list dedicated to a specific event, and have no
- * need for specifying the condition - you just want to wait until someone runs
- * closure_wake_up() on the appropriate wait list. In that case, just use
- * closure_wait(). It will return either true or false, depending on whether the
- * closure was already on a wait list or not - a closure can only be on one wait
- * list at a time.
- *
- * Parents:
- *
- * closure_init() takes two arguments - it takes the closure to initialize, and
- * a (possibly null) parent.
- *
- * If parent is non null, the new closure will have a refcount for its lifetime;
- * a closure is considered to be "finished" when its refcount hits 0 and the
- * function to run is null. Hence
- *
- * continue_at(cl, NULL, NULL);
- *
- * returns up the (spaghetti) stack of closures, precisely like normal return
- * returns up the C stack. continue_at() with non null fn is better thought of
- * as doing a tail call.
- *
- * All this implies that a closure should typically be embedded in a particular
- * struct (which its refcount will normally control the lifetime of), and that
- * struct can very much be thought of as a stack frame.
- */
-
-struct closure;
-struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
-extern struct dentry *bcache_debug;
-
-struct closure_waitlist {
- struct llist_head list;
-};
-
-enum closure_state {
- /*
- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
- * the thread that owns the closure, and cleared by the thread that's
- * waking up the closure.
- *
- * The rest are for debugging and don't affect behaviour:
- *
- * CLOSURE_RUNNING: Set when a closure is running (i.e. by
- * closure_init() and when closure_put() runs then next function), and
- * must be cleared before remaining hits 0. Primarily to help guard
- * against incorrect usage and accidentally transferring references.
- * continue_at() and closure_return() clear it for you, if you're doing
- * something unusual you can use closure_set_dead() which also helps
- * annotate where references are being transferred.
- */
-
- CLOSURE_BITS_START = (1U << 26),
- CLOSURE_DESTRUCTOR = (1U << 26),
- CLOSURE_WAITING = (1U << 28),
- CLOSURE_RUNNING = (1U << 30),
-};
-
-#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
-
-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
-
-struct closure {
- union {
- struct {
- struct workqueue_struct *wq;
- struct closure_syncer *s;
- struct llist_node list;
- closure_fn *fn;
- };
- struct work_struct work;
- };
-
- struct closure *parent;
-
- atomic_t remaining;
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-#define CLOSURE_MAGIC_DEAD 0xc054dead
-#define CLOSURE_MAGIC_ALIVE 0xc054a11e
-
- unsigned int magic;
- struct list_head all;
- unsigned long ip;
- unsigned long waiting_on;
-#endif
-};
-
-void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
-void __closure_wake_up(struct closure_waitlist *list);
-bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void __closure_sync(struct closure *cl);
-
-/**
- * closure_sync - sleep until a closure a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-static inline void closure_sync(struct closure *cl)
-{
- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
- __closure_sync(cl);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-void closure_debug_init(void);
-void closure_debug_create(struct closure *cl);
-void closure_debug_destroy(struct closure *cl);
-
-#else
-
-static inline void closure_debug_init(void) {}
-static inline void closure_debug_create(struct closure *cl) {}
-static inline void closure_debug_destroy(struct closure *cl) {}
-
-#endif
-
-static inline void closure_set_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _THIS_IP_;
-#endif
-}
-
-static inline void closure_set_ret_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _RET_IP_;
-#endif
-}
-
-static inline void closure_set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->waiting_on = f;
-#endif
-}
-
-static inline void closure_set_stopped(struct closure *cl)
-{
- atomic_sub(CLOSURE_RUNNING, &cl->remaining);
-}
-
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
- struct workqueue_struct *wq)
-{
- closure_set_ip(cl);
- cl->fn = fn;
- cl->wq = wq;
- /* between atomic_dec() in closure_put() */
- smp_mb__before_atomic();
-}
-
-static inline void closure_queue(struct closure *cl)
-{
- struct workqueue_struct *wq = cl->wq;
- /**
- * Changes made to closure, work_struct, or a couple of other structs
- * may cause work.func not pointing to the right location.
- */
- BUILD_BUG_ON(offsetof(struct closure, fn)
- != offsetof(struct work_struct, func));
- if (wq) {
- INIT_WORK(&cl->work, cl->work.func);
- BUG_ON(!queue_work(wq, &cl->work));
- } else
- cl->fn(cl);
-}
-
-/**
- * closure_get - increment a closure's refcount
- */
-static inline void closure_get(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- BUG_ON((atomic_inc_return(&cl->remaining) &
- CLOSURE_REMAINING_MASK) <= 1);
-#else
- atomic_inc(&cl->remaining);
-#endif
-}
-
-/**
- * closure_init - Initialize a closure, setting the refcount to 1
- * @cl: closure to initialize
- * @parent: parent of the new closure. cl will take a refcount on it for its
- * lifetime; may be NULL.
- */
-static inline void closure_init(struct closure *cl, struct closure *parent)
-{
- memset(cl, 0, sizeof(struct closure));
- cl->parent = parent;
- if (parent)
- closure_get(parent);
-
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-
- closure_debug_create(cl);
- closure_set_ip(cl);
-}
-
-static inline void closure_init_stack(struct closure *cl)
-{
- memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-}
-
-/**
- * closure_wake_up - wake up all closures on a wait list,
- * with memory barrier
- */
-static inline void closure_wake_up(struct closure_waitlist *list)
-{
- /* Memory barrier for the wait list */
- smp_mb();
- __closure_wake_up(list);
-}
-
-/**
- * continue_at - jump to another function with barrier
- *
- * After @cl is no longer waiting on anything (i.e. all outstanding refs have
- * been dropped with closure_put()), it will resume execution at @fn running out
- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
- *
- * This is because after calling continue_at() you no longer have a ref on @cl,
- * and whatever @cl owns may be freed out from under you - a running closure fn
- * has a ref on its own closure which continue_at() drops.
- *
- * Note you are expected to immediately return after using this macro.
- */
-#define continue_at(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_sub(_cl, CLOSURE_RUNNING + 1); \
-} while (0)
-
-/**
- * closure_return - finish execution of a closure
- *
- * This is used to indicate that @cl is finished: when all outstanding refs on
- * @cl have been dropped @cl's ref on its parent closure (as passed to
- * closure_init()) will be dropped, if one was specified - thus this can be
- * thought of as returning to the parent closure.
- */
-#define closure_return(_cl) continue_at((_cl), NULL, NULL)
-
-/**
- * continue_at_nobarrier - jump to another function without barrier
- *
- * Causes @fn to be executed out of @cl, in @wq context (or called directly if
- * @wq is NULL).
- *
- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
- * thus it's not safe to touch anything protected by @cl after a
- * continue_at_nobarrier().
- */
-#define continue_at_nobarrier(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_queue(_cl); \
-} while (0)
-
-/**
- * closure_return_with_destructor - finish execution of a closure,
- * with destructor
- *
- * Works like closure_return(), except @destructor will be called when all
- * outstanding refs on @cl have been dropped; @destructor may be used to safely
- * free the memory occupied by @cl, and it is called with the ref on the parent
- * closure still held - so @destructor could safely return an item to a
- * freelist protected by @cl's parent.
- */
-#define closure_return_with_destructor(_cl, _destructor) \
-do { \
- set_closure_fn(_cl, _destructor, NULL); \
- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
-} while (0)
-
-/**
- * closure_call - execute @fn out of a new, uninitialized closure
- *
- * Typically used when running out of one closure, and we want to run @fn
- * asynchronously out of a new closure - @parent will then wait for @cl to
- * finish.
- */
-static inline void closure_call(struct closure *cl, closure_fn fn,
- struct workqueue_struct *wq,
- struct closure *parent)
-{
- closure_init(cl, parent);
- continue_at_nobarrier(cl, fn, wq);
-}
-
-#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c182c21de..7ff14bd2f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -723,11 +723,11 @@ static void journal_write_endio(struct bio *bio)
closure_put(&w->c->journal.io);
}
-static void journal_write(struct closure *cl);
+static CLOSURE_CALLBACK(journal_write);
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(j, struct journal, io);
struct journal_write *w = (j->cur == j->w)
? &j->w[1]
: &j->w[0];
@@ -736,19 +736,19 @@ static void journal_write_done(struct closure *cl)
continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}
-static void journal_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlock)
__releases(&c->journal.lock)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
c->journal.io_in_flight = 0;
spin_unlock(&c->journal.lock);
}
-static void journal_write_unlocked(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlocked)
__releases(c->journal.lock)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
struct cache *ca = c->cache;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
@@ -823,12 +823,12 @@ static void journal_write_unlocked(struct closure *cl)
continue_at(cl, journal_write_done, NULL);
}
-static void journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
spin_lock(&c->journal.lock);
- journal_write_unlocked(cl);
+ journal_write_unlocked(&cl->work);
}
static void journal_try_write(struct cache_set *c)
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9f32901fd..ebd500bdf 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
/* Moving GC - IO loop */
-static void moving_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(moving_io_destructor)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
kfree(io);
}
-static void write_moving_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving_finish)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bio_free_pages(bio);
@@ -89,9 +89,9 @@ static void moving_init(struct moving_io *io)
bch_bio_map(bio, NULL);
}
-static void write_moving(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct data_insert_op *op = &io->op;
if (!op->status) {
@@ -113,9 +113,9 @@ static void write_moving(struct closure *cl)
continue_at(cl, write_moving_finish, op->wq);
}
-static void read_moving_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_moving_submit)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a9b1f3896..83d112bd2 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@
struct kmem_cache *bch_search_cache;
-static void bch_data_insert_start(struct closure *cl);
+static CLOSURE_CALLBACK(bch_data_insert_start);
static unsigned int cache_mode(struct cached_dev *dc)
{
@@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)
/* Insert data into cache */
-static void bch_data_insert_keys(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_keys)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
atomic_t *journal_ref = NULL;
struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
int ret;
@@ -136,9 +136,9 @@ out:
continue_at(cl, bch_data_insert_keys, op->wq);
}
-static void bch_data_insert_error(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_error)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
/*
* Our data write just errored, which means we've got a bunch of keys to
@@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl)
op->insert_keys.top = dst;
- bch_data_insert_keys(cl);
+ bch_data_insert_keys(&cl->work);
}
static void bch_data_insert_endio(struct bio *bio)
@@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio)
bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}
-static void bch_data_insert_start(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_start)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
if (op->bypass)
@@ -305,16 +305,16 @@ err:
* If op->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
-void bch_data_insert(struct closure *cl)
+CLOSURE_CALLBACK(bch_data_insert)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
trace_bcache_write(op->c, op->inode, op->bio,
op->writeback, op->bypass);
bch_keylist_init(&op->insert_keys);
bio_get(op->bio);
- bch_data_insert_start(cl);
+ bch_data_insert_start(&cl->work);
}
/*
@@ -575,9 +575,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
return n == bio ? MAP_DONE : MAP_CONTINUE;
}
-static void cache_lookup(struct closure *cl)
+static CLOSURE_CALLBACK(cache_lookup)
{
- struct search *s = container_of(cl, struct search, iop.cl);
+ closure_type(s, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
struct cached_dev *dc;
int ret;
@@ -698,9 +698,9 @@ static void do_bio_hook(struct search *s,
bio_cnt_set(bio, 3);
}
-static void search_free(struct closure *cl)
+static CLOSURE_CALLBACK(search_free)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
atomic_dec(&s->iop.c->search_inflight);
@@ -749,20 +749,20 @@ static inline struct search *search_alloc(struct bio *bio,
/* Cached devices */
-static void cached_dev_bio_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_bio_complete)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
cached_dev_put(dc);
- search_free(cl);
+ search_free(&cl->work);
}
/* Process reads */
-static void cached_dev_read_error_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
if (s->iop.replace_collision)
bch_mark_cache_miss_collision(s->iop.c, s->d);
@@ -770,12 +770,12 @@ static void cached_dev_read_error_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
}
-static void cached_dev_read_error(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
/*
@@ -801,9 +801,9 @@ static void cached_dev_read_error(struct closure *cl)
continue_at(cl, cached_dev_read_error_done, NULL);
}
-static void cached_dev_cache_miss_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bcache_device *d = s->d;
if (s->iop.replace_collision)
@@ -812,13 +812,13 @@ static void cached_dev_cache_miss_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
closure_put(&d->cl);
}
-static void cached_dev_read_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
/*
@@ -858,9 +858,9 @@ static void cached_dev_read_done(struct closure *cl)
continue_at(cl, cached_dev_cache_miss_done, NULL);
}
-static void cached_dev_read_done_bh(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done_bh)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
@@ -955,13 +955,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
/* Process writes */
-static void cached_dev_write_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_write_complete)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
up_read_non_owner(&dc->writeback_lock);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
}
static void cached_dev_write(struct cached_dev *dc, struct search *s)
@@ -1048,9 +1048,9 @@ insert_data:
continue_at(cl, cached_dev_write_complete, NULL);
}
-static void cached_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_nodata)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
if (s->iop.flush_journal)
@@ -1265,9 +1265,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
return MAP_CONTINUE;
}
-static void flash_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_nodata)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
if (s->iop.flush_journal)
bch_journal_meta(s->iop.c, cl);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 38ab4856e..46bbef00a 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -34,7 +34,7 @@ struct data_insert_op {
};
unsigned int bch_get_congested(const struct cache_set *c);
-void bch_data_insert(struct closure *cl);
+CLOSURE_CALLBACK(bch_data_insert);
void bch_cached_dev_request_init(struct cached_dev *dc);
void cached_dev_submit_bio(struct bio *bio);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1e677af38..1402096b8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -327,9 +327,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
submit_bio(bio);
}
-static void bch_write_bdev_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bch_write_bdev_super_unlock)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+ closure_type(dc, struct cached_dev, sb_write);
up(&dc->sb_write_mutex);
}
@@ -363,9 +363,9 @@ static void write_super_endio(struct bio *bio)
closure_put(&ca->set->sb_write);
}
-static void bcache_write_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bcache_write_super_unlock)
{
- struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+ closure_type(c, struct cache_set, sb_write);
up(&c->sb_write_mutex);
}
@@ -407,9 +407,9 @@ static void uuid_endio(struct bio *bio)
closure_put(cl);
}
-static void uuid_io_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(uuid_io_unlock)
{
- struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+ closure_type(c, struct cache_set, uuid_write);
up(&c->uuid_write_mutex);
}
@@ -1344,9 +1344,9 @@ void bch_cached_dev_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
-static void cached_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_free)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ closure_type(dc, struct cached_dev, disk.cl);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
@@ -1370,17 +1370,17 @@ static void cached_dev_free(struct closure *cl)
if (dc->sb_disk)
put_page(virt_to_page(dc->sb_disk));
- if (!IS_ERR_OR_NULL(dc->bdev))
- blkdev_put(dc->bdev, dc);
+ if (dc->bdev_handle)
+ bdev_release(dc->bdev_handle);
wake_up(&unregister_wait);
kobject_put(&dc->disk.kobj);
}
-static void cached_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_flush)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ closure_type(dc, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
mutex_lock(&bch_register_lock);
@@ -1446,7 +1446,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
- struct block_device *bdev,
+ struct bdev_handle *bdev_handle,
struct cached_dev *dc)
{
const char *err = "cannot allocate memory";
@@ -1454,14 +1454,15 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
int ret = -ENOMEM;
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
- dc->bdev = bdev;
+ dc->bdev_handle = bdev_handle;
+ dc->bdev = bdev_handle->bdev;
dc->sb_disk = sb_disk;
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
err = "error creating kobject";
- if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
+ if (kobject_add(&dc->disk.kobj, bdev_kobj(dc->bdev), "bcache"))
goto err;
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
goto err;
@@ -1498,9 +1499,9 @@ void bch_flash_dev_release(struct kobject *kobj)
kfree(d);
}
-static void flash_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_free)
{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
atomic_long_sub(bcache_dev_sectors_dirty(d),
@@ -1511,9 +1512,9 @@ static void flash_dev_free(struct closure *cl)
kobject_put(&d->kobj);
}
-static void flash_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_flush)
{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
@@ -1669,9 +1670,9 @@ void bch_cache_set_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
-static void cache_set_free(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_free)
{
- struct cache_set *c = container_of(cl, struct cache_set, cl);
+ closure_type(c, struct cache_set, cl);
struct cache *ca;
debugfs_remove(c->debug);
@@ -1710,9 +1711,9 @@ static void cache_set_free(struct closure *cl)
kobject_put(&c->kobj);
}
-static void cache_set_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_flush)
{
- struct cache_set *c = container_of(cl, struct cache_set, caching);
+ closure_type(c, struct cache_set, caching);
struct cache *ca = c->cache;
struct btree *b;
@@ -1807,9 +1808,9 @@ static void conditional_stop_bcache_device(struct cache_set *c,
}
}
-static void __cache_set_unregister(struct closure *cl)
+static CLOSURE_CALLBACK(__cache_set_unregister)
{
- struct cache_set *c = container_of(cl, struct cache_set, caching);
+ closure_type(c, struct cache_set, caching);
struct cached_dev *dc;
struct bcache_device *d;
size_t i;
@@ -2218,8 +2219,8 @@ void bch_cache_release(struct kobject *kobj)
if (ca->sb_disk)
put_page(virt_to_page(ca->sb_disk));
- if (!IS_ERR_OR_NULL(ca->bdev))
- blkdev_put(ca->bdev, ca);
+ if (ca->bdev_handle)
+ bdev_release(ca->bdev_handle);
kfree(ca);
module_put(THIS_MODULE);
@@ -2339,38 +2340,42 @@ err_free:
}
static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
- struct block_device *bdev, struct cache *ca)
+ struct bdev_handle *bdev_handle,
+ struct cache *ca)
{
const char *err = NULL; /* must be set for any error case */
int ret = 0;
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
- ca->bdev = bdev;
+ ca->bdev_handle = bdev_handle;
+ ca->bdev = bdev_handle->bdev;
ca->sb_disk = sb_disk;
- if (bdev_max_discard_sectors((bdev)))
+ if (bdev_max_discard_sectors((bdev_handle->bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
ret = cache_alloc(ca);
if (ret != 0) {
- /*
- * If we failed here, it means ca->kobj is not initialized yet,
- * kobject_put() won't be called and there is no chance to
- * call blkdev_put() to bdev in bch_cache_release(). So we
- * explicitly call blkdev_put() here.
- */
- blkdev_put(bdev, ca);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else if (ret == -EPERM)
err = "cache_alloc(): cache device is too small";
else
err = "cache_alloc(): unknown error";
- goto err;
+ pr_notice("error %pg: %s\n", bdev_handle->bdev, err);
+ /*
+ * If we failed here, it means ca->kobj is not initialized yet,
+ * kobject_put() won't be called and there is no chance to
+ * call bdev_release() to bdev in bch_cache_release(). So
+ * we explicitly call bdev_release() here.
+ */
+ bdev_release(bdev_handle);
+ return ret;
}
- if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
- err = "error calling kobject_add";
+ if (kobject_add(&ca->kobj, bdev_kobj(bdev_handle->bdev), "bcache")) {
+ pr_notice("error %pg: error calling kobject_add\n",
+ bdev_handle->bdev);
ret = -ENOMEM;
goto out;
}
@@ -2384,15 +2389,10 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
goto out;
}
- pr_info("registered cache device %pg\n", ca->bdev);
+ pr_info("registered cache device %pg\n", ca->bdev_handle->bdev);
out:
kobject_put(&ca->kobj);
-
-err:
- if (err)
- pr_notice("error %pg: %s\n", ca->bdev, err);
-
return ret;
}
@@ -2447,7 +2447,7 @@ struct async_reg_args {
char *path;
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
void *holder;
};
@@ -2458,8 +2458,8 @@ static void register_bdev_worker(struct work_struct *work)
container_of(work, struct async_reg_args, reg_work.work);
mutex_lock(&bch_register_lock);
- if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder)
- < 0)
+ if (register_bdev(args->sb, args->sb_disk, args->bdev_handle,
+ args->holder) < 0)
fail = true;
mutex_unlock(&bch_register_lock);
@@ -2479,7 +2479,8 @@ static void register_cache_worker(struct work_struct *work)
container_of(work, struct async_reg_args, reg_work.work);
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder))
+ if (register_cache(args->sb, args->sb_disk, args->bdev_handle,
+ args->holder))
fail = true;
if (fail)
@@ -2516,7 +2517,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
char *path = NULL;
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
- struct block_device *bdev, *bdev2;
+ struct bdev_handle *bdev_handle, *bdev_handle2;
void *holder = NULL;
ssize_t ret;
bool async_registration = false;
@@ -2549,15 +2550,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
ret = -EINVAL;
err = "failed to open device";
- bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
- if (IS_ERR(bdev))
+ bdev_handle = bdev_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(bdev_handle))
goto out_free_sb;
err = "failed to set blocksize";
- if (set_blocksize(bdev, 4096))
+ if (set_blocksize(bdev_handle->bdev, 4096))
goto out_blkdev_put;
- err = read_super(sb, bdev, &sb_disk);
+ err = read_super(sb, bdev_handle->bdev, &sb_disk);
if (err)
goto out_blkdev_put;
@@ -2569,13 +2570,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
}
/* Now reopen in exclusive mode with proper holder */
- bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- holder, NULL);
- blkdev_put(bdev, NULL);
- bdev = bdev2;
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
- bdev = NULL;
+ bdev_handle2 = bdev_open_by_dev(bdev_handle->bdev->bd_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL);
+ bdev_release(bdev_handle);
+ bdev_handle = bdev_handle2;
+ if (IS_ERR(bdev_handle)) {
+ ret = PTR_ERR(bdev_handle);
+ bdev_handle = NULL;
if (ret == -EBUSY) {
dev_t dev;
@@ -2610,7 +2611,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
args->path = path;
args->sb = sb;
args->sb_disk = sb_disk;
- args->bdev = bdev;
+ args->bdev_handle = bdev_handle;
args->holder = holder;
register_device_async(args);
/* No wait and returns to user space */
@@ -2619,14 +2620,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (SB_IS_BDEV(sb)) {
mutex_lock(&bch_register_lock);
- ret = register_bdev(sb, sb_disk, bdev, holder);
+ ret = register_bdev(sb, sb_disk, bdev_handle, holder);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
if (ret < 0)
goto out_free_sb;
} else {
/* blkdev_put() will be called in bch_cache_release() */
- ret = register_cache(sb, sb_disk, bdev, holder);
+ ret = register_cache(sb, sb_disk, bdev_handle, holder);
if (ret)
goto out_free_sb;
}
@@ -2642,8 +2643,8 @@ out_free_holder:
out_put_sb_page:
put_page(virt_to_page(sb_disk));
out_blkdev_put:
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out_free_sb:
kfree(sb);
out_free_path:
@@ -2907,7 +2908,6 @@ static int __init bcache_init(void)
goto err;
bch_debug_init();
- closure_debug_init();
bcache_is_reboot = false;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 18ac98dc8..a438efb66 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -866,7 +866,8 @@ STORE(__bch_cache_set)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->shrink.scan_objects(&c->shrink, &sc);
+ if (c->shrink)
+ c->shrink->scan_objects(c->shrink, &sc);
}
sysfs_strtoul_clamp(congested_read_threshold_us,
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 6f3cb7c92..f61ab1bad 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
#define _BCACHE_UTIL_H
#include <linux/blkdev.h>
+#include <linux/closure.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
@@ -13,8 +14,6 @@
#include <linux/workqueue.h>
#include <linux/crc64.h>
-#include "closure.h"
-
struct closure;
#ifdef CONFIG_BCACHE_DEBUG
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 3accfdaee..8827a6f13 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -341,16 +341,16 @@ static void dirty_init(struct keybuf_key *w)
bch_bio_map(bio, NULL);
}
-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
kfree(io);
}
-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio)
closure_put(&io->cl);
}
-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
dirty_endio(bio);
}
-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
closure_bio_submit(io->dc->disk.c, &io->bio, cl);
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 92afdca76..9ab32abe5 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -26,7 +26,7 @@ struct prison_region {
struct dm_bio_prison {
mempool_t cell_pool;
unsigned int num_locks;
- struct prison_region regions[];
+ struct prison_region regions[] __counted_by(num_locks);
};
static struct kmem_cache *_cell_cache;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 486e1180c..f03d7dba2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1001,7 +1001,7 @@ struct dm_bufio_client {
sector_t start;
- struct shrinker shrinker;
+ struct shrinker *shrinker;
struct work_struct shrink_work;
atomic_long_t need_shrink;
@@ -2405,7 +2405,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink
{
struct dm_bufio_client *c;
- c = container_of(shrink, struct dm_bufio_client, shrinker);
+ c = shrink->private_data;
atomic_long_add(sc->nr_to_scan, &c->need_shrink);
queue_work(dm_bufio_wq, &c->shrink_work);
@@ -2414,7 +2414,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink
static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
+ struct dm_bufio_client *c = shrink->private_data;
unsigned long count = cache_total(&c->cache);
unsigned long retain_target = get_retain_buffers(c);
unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
@@ -2527,14 +2527,20 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
INIT_WORK(&c->shrink_work, shrink_work);
atomic_long_set(&c->need_shrink, 0);
- c->shrinker.count_objects = dm_bufio_shrink_count;
- c->shrinker.scan_objects = dm_bufio_shrink_scan;
- c->shrinker.seeks = 1;
- c->shrinker.batch = 0;
- r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- if (r)
+ c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+ if (!c->shrinker) {
+ r = -ENOMEM;
goto bad;
+ }
+
+ c->shrinker->count_objects = dm_bufio_shrink_count;
+ c->shrinker->scan_objects = dm_bufio_shrink_scan;
+ c->shrinker->seeks = 1;
+ c->shrinker->batch = 0;
+ c->shrinker->private_data = c;
+
+ shrinker_register(c->shrinker);
mutex_lock(&dm_bufio_clients_lock);
dm_bufio_client_count++;
@@ -2574,7 +2580,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
drop_buffers(c);
- unregister_shrinker(&c->shrinker);
+ shrinker_free(c->shrinker);
flush_work(&c->shrink_work);
mutex_lock(&dm_bufio_clients_lock);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index acffed750..96751cd3d 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -597,7 +597,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
- strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+ strscpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
@@ -707,7 +707,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
- strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+ strscpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
@@ -1726,7 +1726,7 @@ static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
(strlen(policy_name) > sizeof(cmd->policy_name) - 1))
return -EINVAL;
- strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+ strscpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
hint_size = dm_cache_policy_get_hint_size(policy);
@@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs cmd root_lock).
- * - must take shrinker_rwsem without holding cmd->root_lock
+ * - must take shrinker_mutex without holding cmd->root_lock
*/
new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 095b9b49a..e6757a30d 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -22,6 +22,8 @@
#include "dm-ima.h"
#define DM_RESERVED_MAX_IOS 1024
+#define DM_MAX_TARGETS 1048576
+#define DM_MAX_TARGET_PARAMS 1024
struct dm_io;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 17ffbf7fb..4ab4e8dcf 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -62,6 +62,8 @@ struct convert_context {
struct skcipher_request *req;
struct aead_request *req_aead;
} r;
+ bool aead_recheck;
+ bool aead_failed;
};
@@ -73,10 +75,8 @@ struct dm_crypt_io {
struct bio *base_bio;
u8 *integrity_metadata;
bool integrity_metadata_from_pool:1;
- bool in_tasklet:1;
struct work_struct work;
- struct tasklet_struct tasklet;
struct convert_context ctx;
@@ -84,6 +84,8 @@ struct dm_crypt_io {
blk_status_t error;
sector_t sector;
+ struct bvec_iter saved_bi_iter;
+
struct rb_node rb_node;
} CRYPTO_MINALIGN_ATTR;
@@ -224,7 +226,7 @@ struct crypt_config {
struct mutex bio_alloc_lock;
u8 *authenc_key; /* space for keys in authenc() format (if used) */
- u8 key[];
+ u8 key[] __counted_by(key_size);
};
#define MIN_IOS 64
@@ -652,13 +654,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
for (i = 0; i < 4; i++) {
- r = crypto_shash_init(desc);
- if (r)
- goto out;
- r = crypto_shash_update(desc, &buf[i * 4], 4);
- if (r)
- goto out;
- r = crypto_shash_final(desc, &buf[i * 4]);
+ r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]);
if (r)
goto out;
}
@@ -1378,10 +1374,13 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
if (r == -EBADMSG) {
sector_t s = le64_to_cpu(*sector);
- DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
- ctx->bio_in->bi_bdev, s);
- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
- ctx->bio_in, s, 0);
+ ctx->aead_failed = true;
+ if (ctx->aead_recheck) {
+ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
+ ctx->bio_in->bi_bdev, s);
+ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
+ ctx->bio_in, s, 0);
+ }
}
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
@@ -1765,10 +1764,11 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
io->base_bio = bio;
io->sector = sector;
io->error = 0;
+ io->ctx.aead_recheck = false;
+ io->ctx.aead_failed = false;
io->ctx.r.req = NULL;
io->integrity_metadata = NULL;
io->integrity_metadata_from_pool = false;
- io->in_tasklet = false;
atomic_set(&io->io_pending, 0);
}
@@ -1777,12 +1777,7 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
atomic_inc(&io->io_pending);
}
-static void kcryptd_io_bio_endio(struct work_struct *work)
-{
- struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
-
- bio_endio(io->base_bio);
-}
+static void kcryptd_queue_read(struct dm_crypt_io *io);
/*
* One of the bios was finished. Check for completion of
@@ -1797,6 +1792,15 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (!atomic_dec_and_test(&io->io_pending))
return;
+ if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) &&
+ cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) {
+ io->ctx.aead_recheck = true;
+ io->ctx.aead_failed = false;
+ io->error = 0;
+ kcryptd_queue_read(io);
+ return;
+ }
+
if (io->ctx.r.req)
crypt_free_req(cc, io->ctx.r.req, base_bio);
@@ -1807,20 +1811,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
base_bio->bi_status = error;
- /*
- * If we are running this function from our tasklet,
- * we can't call bio_endio() here, because it will call
- * clone_endio() from dm.c, which in turn will
- * free the current struct dm_crypt_io structure with
- * our tasklet. In this case we need to delay bio_endio()
- * execution to after the tasklet is done and dequeued.
- */
- if (io->in_tasklet) {
- INIT_WORK(&io->work, kcryptd_io_bio_endio);
- queue_work(cc->io_queue, &io->work);
- return;
- }
-
bio_endio(base_bio);
}
@@ -1846,15 +1836,19 @@ static void crypt_endio(struct bio *clone)
struct dm_crypt_io *io = clone->bi_private;
struct crypt_config *cc = io->cc;
unsigned int rw = bio_data_dir(clone);
- blk_status_t error;
+ blk_status_t error = clone->bi_status;
+
+ if (io->ctx.aead_recheck && !error) {
+ kcryptd_queue_crypt(io);
+ return;
+ }
/*
* free the processed pages
*/
- if (rw == WRITE)
+ if (rw == WRITE || io->ctx.aead_recheck)
crypt_free_buffer_pages(cc, clone);
- error = clone->bi_status;
bio_put(clone);
if (rw == READ && !error) {
@@ -1875,6 +1869,22 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
struct crypt_config *cc = io->cc;
struct bio *clone;
+ if (io->ctx.aead_recheck) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM))
+ return 1;
+ crypt_inc_pending(io);
+ clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+ if (unlikely(!clone)) {
+ crypt_dec_pending(io);
+ return 1;
+ }
+ clone->bi_iter.bi_sector = cc->start + io->sector;
+ crypt_convert_init(cc, &io->ctx, clone, clone, io->sector);
+ io->saved_bi_iter = clone->bi_iter;
+ dm_submit_bio_remap(io->base_bio, clone);
+ return 0;
+ }
+
/*
* We need the original biovec array in order to decrypt the whole bio
* data *afterwards* -- thanks to immutable biovecs we don't need to
@@ -2101,6 +2111,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
io->ctx.bio_out = clone;
io->ctx.iter_out = clone->bi_iter;
+ if (crypt_integrity_aead(cc)) {
+ bio_copy_data(clone, io->base_bio);
+ io->ctx.bio_in = clone;
+ io->ctx.iter_in = clone->bi_iter;
+ }
+
sector += bio_sectors(clone);
crypt_inc_pending(io);
@@ -2137,6 +2153,14 @@ dec:
static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
{
+ if (io->ctx.aead_recheck) {
+ if (!io->error) {
+ io->ctx.bio_in->bi_iter = io->saved_bi_iter;
+ bio_copy_data(io->base_bio, io->ctx.bio_in);
+ }
+ crypt_free_buffer_pages(io->cc, io->ctx.bio_in);
+ bio_put(io->ctx.bio_in);
+ }
crypt_dec_pending(io);
}
@@ -2166,11 +2190,17 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
- crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
- io->sector);
+ if (io->ctx.aead_recheck) {
+ io->ctx.cc_sector = io->sector + cc->iv_offset;
+ r = crypt_convert(cc, &io->ctx,
+ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
+ } else {
+ crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
+ io->sector);
- r = crypt_convert(cc, &io->ctx,
- test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
+ r = crypt_convert(cc, &io->ctx,
+ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
+ }
/*
* Crypto API backlogged the request, because its queue was full
* and we're in softirq context, so continue from a workqueue
@@ -2212,10 +2242,13 @@ static void kcryptd_async_done(void *data, int error)
if (error == -EBADMSG) {
sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq));
- DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
- ctx->bio_in->bi_bdev, s);
- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
- ctx->bio_in, s, 0);
+ ctx->aead_failed = true;
+ if (ctx->aead_recheck) {
+ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
+ ctx->bio_in->bi_bdev, s);
+ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
+ ctx->bio_in, s, 0);
+ }
io->error = BLK_STS_PROTECTION;
} else if (error < 0)
io->error = BLK_STS_IOERR;
@@ -2252,11 +2285,6 @@ static void kcryptd_crypt(struct work_struct *work)
kcryptd_crypt_write_convert(io);
}
-static void kcryptd_crypt_tasklet(unsigned long work)
-{
- kcryptd_crypt((struct work_struct *)work);
-}
-
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
@@ -2268,15 +2296,10 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
* irqs_disabled(): the kernel may run some IO completion from the idle thread, but
* it is being executed with irqs disabled.
*/
- if (in_hardirq() || irqs_disabled()) {
- io->in_tasklet = true;
- tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work);
- tasklet_schedule(&io->tasklet);
+ if (!(in_hardirq() || irqs_disabled())) {
+ kcryptd_crypt(&io->work);
return;
}
-
- kcryptd_crypt(&io->work);
- return;
}
INIT_WORK(&io->work, kcryptd_crypt);
@@ -2868,10 +2891,9 @@ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
if (!start || !end || ++start > end)
return -EINVAL;
- mac_alg = kzalloc(end - start + 1, GFP_KERNEL);
+ mac_alg = kmemdup_nul(start, end - start, GFP_KERNEL);
if (!mac_alg)
return -ENOMEM;
- strncpy(mac_alg, start, end - start);
mac = crypto_alloc_ahash(mac_alg, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
kfree(mac_alg);
@@ -3151,7 +3173,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
sval = strchr(opt_string + strlen("integrity:"), ':') + 1;
if (!strcasecmp(sval, "aead")) {
set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
- } else if (strcasecmp(sval, "none")) {
+ } else if (strcasecmp(sval, "none")) {
ti->error = "Unknown integrity profile";
return -EINVAL;
}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 3726fae30..5eabdb06c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
+#include <linux/kthread.h>
#include <linux/device-mapper.h>
@@ -31,6 +32,7 @@ struct delay_c {
struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
+ struct task_struct *worker;
bool may_delay;
struct delay_class read;
@@ -66,6 +68,11 @@ static void queue_timeout(struct delay_c *dc, unsigned long expires)
mutex_unlock(&dc->timer_lock);
}
+static inline bool delay_is_fast(struct delay_c *dc)
+{
+ return !!dc->worker;
+}
+
static void flush_bios(struct bio *bio)
{
struct bio *n;
@@ -78,36 +85,61 @@ static void flush_bios(struct bio *bio)
}
}
-static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
struct dm_delay_info *delayed, *next;
+ struct bio_list flush_bio_list;
unsigned long next_expires = 0;
- unsigned long start_timer = 0;
- struct bio_list flush_bios = { };
+ bool start_timer = false;
+ bio_list_init(&flush_bio_list);
mutex_lock(&delayed_bios_lock);
list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+ cond_resched();
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
struct bio *bio = dm_bio_from_per_bio_data(delayed,
sizeof(struct dm_delay_info));
list_del(&delayed->list);
- bio_list_add(&flush_bios, bio);
+ bio_list_add(&flush_bio_list, bio);
delayed->class->ops--;
continue;
}
- if (!start_timer) {
- start_timer = 1;
- next_expires = delayed->expires;
- } else
- next_expires = min(next_expires, delayed->expires);
+ if (!delay_is_fast(dc)) {
+ if (!start_timer) {
+ start_timer = true;
+ next_expires = delayed->expires;
+ } else {
+ next_expires = min(next_expires, delayed->expires);
+ }
+ }
}
mutex_unlock(&delayed_bios_lock);
if (start_timer)
queue_timeout(dc, next_expires);
- return bio_list_get(&flush_bios);
+ flush_bios(bio_list_get(&flush_bio_list));
+}
+
+static int flush_worker_fn(void *data)
+{
+ struct delay_c *dc = data;
+
+ while (!kthread_should_stop()) {
+ flush_delayed_bios(dc, false);
+ mutex_lock(&delayed_bios_lock);
+ if (unlikely(list_empty(&dc->delayed_bios))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ mutex_unlock(&delayed_bios_lock);
+ schedule();
+ } else {
+ mutex_unlock(&delayed_bios_lock);
+ cond_resched();
+ }
+ }
+
+ return 0;
}
static void flush_expired_bios(struct work_struct *work)
@@ -115,7 +147,7 @@ static void flush_expired_bios(struct work_struct *work)
struct delay_c *dc;
dc = container_of(work, struct delay_c, flush_expired_bios);
- flush_bios(flush_delayed_bios(dc, 0));
+ flush_delayed_bios(dc, false);
}
static void delay_dtr(struct dm_target *ti)
@@ -131,6 +163,8 @@ static void delay_dtr(struct dm_target *ti)
dm_put_device(ti, dc->write.dev);
if (dc->flush.dev)
dm_put_device(ti, dc->flush.dev);
+ if (dc->worker)
+ kthread_stop(dc->worker);
mutex_destroy(&dc->timer_lock);
@@ -175,6 +209,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
int ret;
+ unsigned int max_delay;
if (argc != 3 && argc != 6 && argc != 9) {
ti->error = "Requires exactly 3, 6 or 9 arguments";
@@ -188,8 +223,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->private = dc;
- timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
- INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
dc->may_delay = true;
@@ -198,6 +231,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->read, argv);
if (ret)
goto bad;
+ max_delay = dc->read.delay;
if (argc == 3) {
ret = delay_class_ctr(ti, &dc->write, argv);
@@ -206,6 +240,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->flush, argv);
if (ret)
goto bad;
+ max_delay = max(max_delay, dc->write.delay);
+ max_delay = max(max_delay, dc->flush.delay);
goto out;
}
@@ -216,19 +252,37 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
if (ret)
goto bad;
+ max_delay = max(max_delay, dc->flush.delay);
goto out;
}
ret = delay_class_ctr(ti, &dc->flush, argv + 6);
if (ret)
goto bad;
+ max_delay = max(max_delay, dc->flush.delay);
out:
- dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
- if (!dc->kdelayd_wq) {
- ret = -EINVAL;
- DMERR("Couldn't start kdelayd");
- goto bad;
+ if (max_delay < 50) {
+ /*
+ * In case of small requested delays, use kthread instead of
+ * timers and workqueue to achieve better latency.
+ */
+ dc->worker = kthread_create(&flush_worker_fn, dc,
+ "dm-delay-flush-worker");
+ if (IS_ERR(dc->worker)) {
+ ret = PTR_ERR(dc->worker);
+ dc->worker = NULL;
+ goto bad;
+ }
+ } else {
+ timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
+ INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+ dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+ if (!dc->kdelayd_wq) {
+ ret = -EINVAL;
+ DMERR("Couldn't start kdelayd");
+ goto bad;
+ }
}
ti->num_flush_bios = 1;
@@ -264,7 +318,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
list_add_tail(&delayed->list, &dc->delayed_bios);
mutex_unlock(&delayed_bios_lock);
- queue_timeout(dc, expires);
+ if (delay_is_fast(dc))
+ wake_up_process(dc->worker);
+ else
+ queue_timeout(dc, expires);
return DM_MAPIO_SUBMITTED;
}
@@ -277,8 +334,9 @@ static void delay_presuspend(struct dm_target *ti)
dc->may_delay = false;
mutex_unlock(&delayed_bios_lock);
- del_timer_sync(&dc->delay_timer);
- flush_bios(flush_delayed_bios(dc, 1));
+ if (!delay_is_fast(dc))
+ del_timer_sync(&dc->delay_timer);
+ flush_delayed_bios(dc, true);
}
static void delay_resume(struct dm_target *ti)
@@ -363,7 +421,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = delay_ctr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 120153e44..f57fb8215 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
remaining_size = size;
- order = MAX_ORDER - 1;
+ order = MAX_ORDER;
while (remaining_size) {
struct page *pages;
unsigned size_to_add, to_copy;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 9261bbebd..e8e8fc33d 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -278,6 +278,8 @@ struct dm_integrity_c {
atomic64_t number_of_mismatches;
+ mempool_t recheck_pool;
+
struct notifier_block reboot_notifier;
};
@@ -493,42 +495,32 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
{
SHASH_DESC_ON_STACK(desc, ic->journal_mac);
int r;
- unsigned int size = crypto_shash_digestsize(ic->journal_mac);
+ unsigned int mac_size = crypto_shash_digestsize(ic->journal_mac);
+ __u8 *sb = (__u8 *)ic->sb;
+ __u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size;
- if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) {
+ if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT) {
dm_integrity_io_error(ic, "digest is too long", -EINVAL);
return -EINVAL;
}
desc->tfm = ic->journal_mac;
- r = crypto_shash_init(desc);
- if (unlikely(r < 0)) {
- dm_integrity_io_error(ic, "crypto_shash_init", r);
- return r;
- }
-
- r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size);
- if (unlikely(r < 0)) {
- dm_integrity_io_error(ic, "crypto_shash_update", r);
- return r;
- }
-
if (likely(wr)) {
- r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size);
+ r = crypto_shash_digest(desc, sb, mac - sb, mac);
if (unlikely(r < 0)) {
- dm_integrity_io_error(ic, "crypto_shash_final", r);
+ dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
} else {
- __u8 result[HASH_MAX_DIGESTSIZE];
+ __u8 actual_mac[HASH_MAX_DIGESTSIZE];
- r = crypto_shash_final(desc, result);
+ r = crypto_shash_digest(desc, sb, mac - sb, actual_mac);
if (unlikely(r < 0)) {
- dm_integrity_io_error(ic, "crypto_shash_final", r);
+ dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
- if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) {
+ if (memcmp(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@@ -1699,6 +1691,77 @@ failed:
get_random_bytes(result, ic->tag_size);
}
+static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum)
+{
+ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+ struct dm_integrity_c *ic = dio->ic;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ sector_t sector, logical_sector, area, offset;
+ struct page *page;
+ void *buffer;
+
+ get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
+ dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset,
+ &dio->metadata_offset);
+ sector = get_data_sector(ic, area, offset);
+ logical_sector = dio->range.logical_sector;
+
+ page = mempool_alloc(&ic->recheck_pool, GFP_NOIO);
+ buffer = page_to_virt(page);
+
+ __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
+ unsigned pos = 0;
+
+ do {
+ char *mem;
+ int r;
+ struct dm_io_request io_req;
+ struct dm_io_region io_loc;
+ io_req.bi_opf = REQ_OP_READ;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = buffer;
+ io_req.notify.fn = NULL;
+ io_req.client = ic->io;
+ io_loc.bdev = ic->dev->bdev;
+ io_loc.sector = sector;
+ io_loc.count = ic->sectors_per_block;
+
+ r = dm_io(&io_req, 1, &io_loc, NULL);
+ if (unlikely(r)) {
+ dio->bi_status = errno_to_blk_status(r);
+ goto free_ret;
+ }
+
+ integrity_sector_checksum(ic, logical_sector, buffer, checksum);
+ r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block,
+ &dio->metadata_offset, ic->tag_size, TAG_CMP);
+ if (r) {
+ if (r > 0) {
+ DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
+ bio->bi_bdev, logical_sector);
+ atomic64_inc(&ic->number_of_mismatches);
+ dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
+ bio, logical_sector, 0);
+ r = -EILSEQ;
+ }
+ dio->bi_status = errno_to_blk_status(r);
+ goto free_ret;
+ }
+
+ mem = bvec_kmap_local(&bv);
+ memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT);
+ kunmap_local(mem);
+
+ pos += ic->sectors_per_block << SECTOR_SHIFT;
+ sector += ic->sectors_per_block;
+ logical_sector += ic->sectors_per_block;
+ } while (pos < bv.bv_len);
+ }
+free_ret:
+ mempool_free(page, &ic->recheck_pool);
+}
+
static void integrity_metadata(struct work_struct *w)
{
struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
@@ -1786,15 +1849,8 @@ again:
checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
if (r > 0) {
- sector_t s;
-
- s = sector - ((r + ic->tag_size - 1) / ic->tag_size);
- DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
- bio->bi_bdev, s);
- r = -EILSEQ;
- atomic64_inc(&ic->number_of_mismatches);
- dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
- bio, s, 0);
+ integrity_recheck(dio, checksums);
+ goto skip_io;
}
if (likely(checksums != checksums_onstack))
kfree(checksums);
@@ -4271,6 +4327,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
goto bad;
}
+ r = mempool_init_page_pool(&ic->recheck_pool, 1, 0);
+ if (r) {
+ ti->error = "Cannot allocate mempool";
+ goto bad;
+ }
+
ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
if (!ic->metadata_wq) {
@@ -4619,6 +4681,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
+ mempool_exit(&ic->recheck_pool);
mempool_exit(&ic->journal_io_mempool);
if (ic->io)
dm_io_client_destroy(ic->io);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 21ebb6c39..3b1ad7127 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1295,8 +1295,8 @@ static void retrieve_status(struct dm_table *table,
spec->status = 0;
spec->sector_start = ti->begin;
spec->length = ti->len;
- strncpy(spec->target_type, ti->type->name,
- sizeof(spec->target_type) - 1);
+ strscpy_pad(spec->target_type, ti->type->name,
+ sizeof(spec->target_type));
outptr += sizeof(struct dm_target_spec);
remaining = len - (outptr - outbuf);
@@ -1941,7 +1941,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
minimum_data_size - sizeof(param_kernel->version)))
return -EFAULT;
- if (param_kernel->data_size < minimum_data_size) {
+ if (unlikely(param_kernel->data_size < minimum_data_size) ||
+ unlikely(param_kernel->data_size > DM_MAX_TARGETS * DM_MAX_TARGET_PARAMS)) {
DMERR("Invalid data size in the ioctl structure: %u",
param_kernel->data_size);
return -EINVAL;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index f4448d520..2d3e186ca 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -85,7 +85,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
return lc->start + dm_target_offset(ti, bi_sector);
}
-static int linear_map(struct dm_target *ti, struct bio *bio)
+int linear_map(struct dm_target *ti, struct bio *bio)
{
struct linear_c *lc = ti->private;
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 5aace6ee6..7e4f27e86 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -224,7 +224,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
lc->usr_argc = argc;
- strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+ strscpy(lc->uuid, argv[0], sizeof(lc->uuid));
argc--;
argv++;
spin_lock_init(&lc->flush_lock);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5f9991765..eb009d6bb 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -254,7 +254,7 @@ struct raid_set {
int mode;
} journal_dev;
- struct raid_dev dev[];
+ struct raid_dev dev[] __counted_by(raid_disks);
};
static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
@@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return ERR_PTR(-ENOMEM);
}
- mddev_init(&rs->md);
+ if (mddev_init(&rs->md)) {
+ kfree(rs);
+ ti->error = "Cannot initialize raid context";
+ return ERR_PTR(-ENOMEM);
+ }
rs->raid_disks = raid_devs;
rs->delta_disks = 0;
@@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev);
}
+ mddev_destroy(&rs->md);
kfree(rs);
}
@@ -3239,7 +3244,7 @@ size_check:
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
- mddev_lock_nointr(&rs->md);
+ mddev_suspend_and_lock_nointr(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3263,7 +3268,6 @@ size_check:
}
}
- mddev_suspend(&rs->md);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3313,6 +3317,9 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
mddev_unlock(&rs->md);
+
+ if (work_pending(&rs->md.event_work))
+ flush_work(&rs->md.event_work);
raid_set_free(rs);
}
@@ -3793,9 +3800,7 @@ static void raid_postsuspend(struct dm_target *ti)
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
- mddev_lock_nointr(&rs->md);
- mddev_suspend(&rs->md);
- mddev_unlock(&rs->md);
+ mddev_suspend(&rs->md, false);
}
}
@@ -4054,8 +4059,7 @@ static void raid_resume(struct dm_target *ti)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
- mddev_resume(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
}
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index db2d997a6..bdc14ec99 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -56,7 +56,7 @@ struct dm_stat {
size_t percpu_alloc_size;
size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
- struct dm_stat_shared stat_shared[];
+ struct dm_stat_shared stat_shared[] __counted_by(n_entries);
};
#define STAT_PRECISE_TIMESTAMPS 1
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e2854a3cb..16b93ae51 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -44,7 +44,7 @@ struct stripe_c {
/* Work struct used for triggering events*/
struct work_struct trigger_event;
- struct stripe stripe[];
+ struct stripe stripe[] __counted_by(stripes);
};
/*
@@ -268,7 +268,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
return DM_MAPIO_SUBMITTED;
}
-static int stripe_map(struct dm_target *ti, struct bio *bio)
+int stripe_map(struct dm_target *ti, struct bio *bio)
{
struct stripe_c *sc = ti->private;
uint32_t stripe;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 37b48f63a..08c9c20f9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -129,7 +129,12 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
int dm_table_create(struct dm_table **result, blk_mode_t mode,
unsigned int num_targets, struct mapped_device *md)
{
- struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
+ struct dm_table *t;
+
+ if (num_targets > DM_MAX_TARGETS)
+ return -EOVERFLOW;
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
if (!t)
return -ENOMEM;
@@ -144,7 +149,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode,
if (!num_targets) {
kfree(t);
- return -ENOMEM;
+ return -EOVERFLOW;
}
if (alloc_targets(t, num_targets)) {
@@ -844,7 +849,8 @@ static bool dm_table_supports_dax(struct dm_table *t,
if (!ti->type->direct_access)
return false;
- if (!ti->type->iterate_devices ||
+ if (dm_target_is_wildcard(ti->type) ||
+ !ti->type->iterate_devices ||
ti->type->iterate_devices(ti, iterate_fn, NULL))
return false;
}
@@ -1587,6 +1593,14 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
return blk_queue_zoned_model(q) != *zoned_model;
}
+static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return blk_queue_zoned_model(q) != BLK_ZONED_NONE;
+}
+
/*
* Check the device zoned model based on the target feature flag. If the target
* has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
@@ -1600,6 +1614,18 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
+ /*
+ * For the wildcard target (dm-error), if we do not have a
+ * backing device, we must always return false. If we have a
+ * backing device, the result must depend on checking zoned
+ * model, like for any other target. So for this, check directly
+ * if the target backing device is zoned as we get "false" when
+ * dm-error was set without a backing device.
+ */
+ if (dm_target_is_wildcard(ti->type) &&
+ !ti->type->iterate_devices(ti, device_is_zoned_model, NULL))
+ return false;
+
if (dm_target_supports_zoned_hm(ti->type)) {
if (!ti->type->iterate_devices ||
ti->type->iterate_devices(ti, device_not_zoned_model,
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 27e2992ff..0c4efb0be 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -116,9 +116,63 @@ EXPORT_SYMBOL(dm_unregister_target);
* io-err: always fails an io, useful for bringing
* up LVs that have holes in them.
*/
+struct io_err_c {
+ struct dm_dev *dev;
+ sector_t start;
+};
+
+static int io_err_get_args(struct dm_target *tt, unsigned int argc, char **args)
+{
+ unsigned long long start;
+ struct io_err_c *ioec;
+ char dummy;
+ int ret;
+
+ ioec = kmalloc(sizeof(*ioec), GFP_KERNEL);
+ if (!ioec) {
+ tt->error = "Cannot allocate io_err context";
+ return -ENOMEM;
+ }
+
+ ret = -EINVAL;
+ if (sscanf(args[1], "%llu%c", &start, &dummy) != 1 ||
+ start != (sector_t)start) {
+ tt->error = "Invalid device sector";
+ goto bad;
+ }
+ ioec->start = start;
+
+ ret = dm_get_device(tt, args[0], dm_table_get_mode(tt->table), &ioec->dev);
+ if (ret) {
+ tt->error = "Device lookup failed";
+ goto bad;
+ }
+
+ tt->private = ioec;
+
+ return 0;
+
+bad:
+ kfree(ioec);
+
+ return ret;
+}
+
static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
{
/*
+ * If we have arguments, assume it is the path to the backing
+ * block device and its mapping start sector (same as dm-linear).
+ * In this case, get the device so that we can get its limits.
+ */
+ if (argc == 2) {
+ int ret = io_err_get_args(tt, argc, args);
+
+ if (ret)
+ return ret;
+ }
+
+ /*
* Return error for discards instead of -EOPNOTSUPP
*/
tt->num_discard_bios = 1;
@@ -129,7 +183,12 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
static void io_err_dtr(struct dm_target *tt)
{
- /* empty */
+ struct io_err_c *ioec = tt->private;
+
+ if (ioec) {
+ dm_put_device(tt, ioec->dev);
+ kfree(ioec);
+ }
}
static int io_err_map(struct dm_target *tt, struct bio *bio)
@@ -149,6 +208,45 @@ static void io_err_release_clone_rq(struct request *clone,
{
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static sector_t io_err_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+ struct io_err_c *ioec = ti->private;
+
+ return ioec->start + dm_target_offset(ti, bi_sector);
+}
+
+static int io_err_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct io_err_c *ioec = ti->private;
+
+ /*
+ * This should never be called when we do not have a backing device
+ * as that mean the target is not a zoned one.
+ */
+ if (WARN_ON_ONCE(!ioec))
+ return -EIO;
+
+ return dm_report_zones(ioec->dev->bdev, ioec->start,
+ io_err_map_sector(ti, args->next_sector),
+ args, nr_zones);
+}
+#else
+#define io_err_report_zones NULL
+#endif
+
+static int io_err_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct io_err_c *ioec = ti->private;
+
+ if (!ioec)
+ return 0;
+
+ return fn(ti, ioec->dev, ioec->start, ti->len, data);
+}
+
static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
limits->max_discard_sectors = UINT_MAX;
@@ -165,15 +263,17 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
static struct target_type error_target = {
.name = "error",
- .version = {1, 6, 0},
- .features = DM_TARGET_WILDCARD,
+ .version = {1, 7, 0},
+ .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM,
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
.clone_and_map_rq = io_err_clone_and_map_rq,
.release_clone_rq = io_err_release_clone_rq,
+ .iterate_devices = io_err_iterate_devices,
.io_hints = io_err_io_hints,
.direct_access = io_err_dax_direct_access,
+ .report_zones = io_err_report_zones,
};
int __init dm_target_init(void)
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 14e58ae70..7b620b187 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -482,6 +482,63 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
return 0;
}
+static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *data, size_t len)
+{
+ memcpy(data, io->recheck_buffer, len);
+ io->recheck_buffer += len;
+
+ return 0;
+}
+
+static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter start, sector_t cur_block)
+{
+ struct page *page;
+ void *buffer;
+ int r;
+ struct dm_io_request io_req;
+ struct dm_io_region io_loc;
+
+ page = mempool_alloc(&v->recheck_pool, GFP_NOIO);
+ buffer = page_to_virt(page);
+
+ io_req.bi_opf = REQ_OP_READ;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = buffer;
+ io_req.notify.fn = NULL;
+ io_req.client = v->io;
+ io_loc.bdev = v->data_dev->bdev;
+ io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT);
+ io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT);
+ r = dm_io(&io_req, 1, &io_loc, NULL);
+ if (unlikely(r))
+ goto free_ret;
+
+ r = verity_hash(v, verity_io_hash_req(v, io), buffer,
+ 1 << v->data_dev_block_bits,
+ verity_io_real_digest(v, io), true);
+ if (unlikely(r))
+ goto free_ret;
+
+ if (memcmp(verity_io_real_digest(v, io),
+ verity_io_want_digest(v, io), v->digest_size)) {
+ r = -EIO;
+ goto free_ret;
+ }
+
+ io->recheck_buffer = buffer;
+ r = verity_for_bv_block(v, io, &start, verity_recheck_copy);
+ if (unlikely(r))
+ goto free_ret;
+
+ r = 0;
+free_ret:
+ mempool_free(page, &v->recheck_pool);
+
+ return r;
+}
+
static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
u8 *data, size_t len)
{
@@ -508,9 +565,7 @@ static int verity_verify_io(struct dm_verity_io *io)
{
bool is_zero;
struct dm_verity *v = io->v;
-#if defined(CONFIG_DM_VERITY_FEC)
struct bvec_iter start;
-#endif
struct bvec_iter iter_copy;
struct bvec_iter *iter;
struct crypto_wait wait;
@@ -561,10 +616,7 @@ static int verity_verify_io(struct dm_verity_io *io)
if (unlikely(r < 0))
return r;
-#if defined(CONFIG_DM_VERITY_FEC)
- if (verity_fec_is_enabled(v))
- start = *iter;
-#endif
+ start = *iter;
r = verity_for_io_block(v, io, iter, &wait);
if (unlikely(r < 0))
return r;
@@ -586,6 +638,10 @@ static int verity_verify_io(struct dm_verity_io *io)
* tasklet since it may sleep, so fallback to work-queue.
*/
return -EAGAIN;
+ } else if (verity_recheck(v, io, start, cur_block) == 0) {
+ if (v->validated_blocks)
+ set_bit(cur_block, v->validated_blocks);
+ continue;
#if defined(CONFIG_DM_VERITY_FEC)
} else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
cur_block, NULL, &start) == 0) {
@@ -645,23 +701,6 @@ static void verity_work(struct work_struct *w)
verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
}
-static void verity_tasklet(unsigned long data)
-{
- struct dm_verity_io *io = (struct dm_verity_io *)data;
- int err;
-
- io->in_tasklet = true;
- err = verity_verify_io(io);
- if (err == -EAGAIN || err == -ENOMEM) {
- /* fallback to retrying with work-queue */
- INIT_WORK(&io->work, verity_work);
- queue_work(io->v->verify_wq, &io->work);
- return;
- }
-
- verity_finish_io(io, errno_to_blk_status(err));
-}
-
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
@@ -674,13 +713,8 @@ static void verity_end_io(struct bio *bio)
return;
}
- if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) {
- tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io);
- tasklet_schedule(&io->tasklet);
- } else {
- INIT_WORK(&io->work, verity_work);
- queue_work(io->v->verify_wq, &io->work);
- }
+ INIT_WORK(&io->work, verity_work);
+ queue_work(io->v->verify_wq, &io->work);
}
/*
@@ -963,6 +997,10 @@ static void verity_dtr(struct dm_target *ti)
if (v->verify_wq)
destroy_workqueue(v->verify_wq);
+ mempool_exit(&v->recheck_pool);
+ if (v->io)
+ dm_io_client_destroy(v->io);
+
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
@@ -1401,6 +1439,20 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
v->hash_blocks = hash_position;
+ r = mempool_init_page_pool(&v->recheck_pool, 1, 0);
+ if (unlikely(r)) {
+ ti->error = "Cannot allocate mempool";
+ goto bad;
+ }
+
+ v->io = dm_io_client_create();
+ if (IS_ERR(v->io)) {
+ r = PTR_ERR(v->io);
+ v->io = NULL;
+ ti->error = "Cannot allocate dm io";
+ goto bad;
+ }
+
v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
dm_bufio_alloc_callback, NULL,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index f9d522c87..4620a98c9 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -11,6 +11,7 @@
#ifndef DM_VERITY_H
#define DM_VERITY_H
+#include <linux/dm-io.h>
#include <linux/dm-bufio.h>
#include <linux/device-mapper.h>
#include <linux/interrupt.h>
@@ -68,6 +69,9 @@ struct dm_verity {
unsigned long *validated_blocks; /* bitset blocks validated */
char *signature_key_desc; /* signature keyring reference */
+
+ struct dm_io_client *io;
+ mempool_t recheck_pool;
};
struct dm_verity_io {
@@ -83,7 +87,8 @@ struct dm_verity_io {
struct bvec_iter iter;
struct work_struct work;
- struct tasklet_struct tasklet;
+
+ char *recheck_buffer;
/*
* Three variably-size fields follow this struct:
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 9d3cca8e3..60a4dc01e 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -187,7 +187,7 @@ struct dmz_metadata {
struct rb_root mblk_rbtree;
struct list_head mblk_lru_list;
struct list_head mblk_dirty_list;
- struct shrinker mblk_shrinker;
+ struct shrinker *mblk_shrinker;
/* Zone allocation management */
struct mutex map_lock;
@@ -615,7 +615,7 @@ static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+ struct dmz_metadata *zmd = shrink->private_data;
return atomic_read(&zmd->nr_mblks);
}
@@ -626,7 +626,7 @@ static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+ struct dmz_metadata *zmd = shrink->private_data;
unsigned long count;
spin_lock(&zmd->mblk_lock);
@@ -2936,19 +2936,23 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
*/
zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
- zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
- zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
- zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
/* Metadata cache shrinker */
- ret = register_shrinker(&zmd->mblk_shrinker, "dm-zoned-meta:(%u:%u)",
- MAJOR(dev->bdev->bd_dev),
- MINOR(dev->bdev->bd_dev));
- if (ret) {
- dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
+ zmd->mblk_shrinker = shrinker_alloc(0, "dm-zoned-meta:(%u:%u)",
+ MAJOR(dev->bdev->bd_dev),
+ MINOR(dev->bdev->bd_dev));
+ if (!zmd->mblk_shrinker) {
+ ret = -ENOMEM;
+ dmz_zmd_err(zmd, "Allocate metadata cache shrinker failed");
goto err;
}
+ zmd->mblk_shrinker->count_objects = dmz_mblock_shrinker_count;
+ zmd->mblk_shrinker->scan_objects = dmz_mblock_shrinker_scan;
+ zmd->mblk_shrinker->private_data = zmd;
+
+ shrinker_register(zmd->mblk_shrinker);
+
dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
for (i = 0; i < zmd->nr_devs; i++)
dmz_print_dev(zmd, i);
@@ -2995,7 +2999,7 @@ err:
*/
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
- unregister_shrinker(&zmd->mblk_shrinker);
+ shrinker_free(zmd->mblk_shrinker);
dmz_cleanup_metadata(zmd);
kfree(zmd);
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 64a1f306c..23c32cd1f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -570,13 +570,15 @@ static void dm_end_io_acct(struct dm_io *io)
dm_io_acct(io, true);
}
-static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t gfp_mask)
{
struct dm_io *io;
struct dm_target_io *tio;
struct bio *clone;
- clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
+ clone = bio_alloc_clone(NULL, bio, gfp_mask, &md->mempools->io_bs);
+ if (unlikely(!clone))
+ return NULL;
tio = clone_to_tio(clone);
tio->flags = 0;
dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
@@ -724,7 +726,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
dev_t dev, blk_mode_t mode)
{
struct table_device *td;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 part_off;
int r;
@@ -733,9 +735,9 @@ static struct table_device *open_table_device(struct mapped_device *md,
return ERR_PTR(-ENOMEM);
refcount_set(&td->count, 1);
- bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL);
- if (IS_ERR(bdev)) {
- r = PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
+ if (IS_ERR(bdev_handle)) {
+ r = PTR_ERR(bdev_handle);
goto out_free_td;
}
@@ -745,20 +747,22 @@ static struct table_device *open_table_device(struct mapped_device *md,
* called.
*/
if (md->disk->slave_dir) {
- r = bd_link_disk_holder(bdev, md->disk);
+ r = bd_link_disk_holder(bdev_handle->bdev, md->disk);
if (r)
goto out_blkdev_put;
}
td->dm_dev.mode = mode;
- td->dm_dev.bdev = bdev;
- td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
+ td->dm_dev.bdev = bdev_handle->bdev;
+ td->dm_dev.bdev_handle = bdev_handle;
+ td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off,
+ NULL, NULL);
format_dev_t(td->dm_dev.name, dev);
list_add(&td->list, &md->table_devices);
return td;
out_blkdev_put:
- blkdev_put(bdev, _dm_claim_ptr);
+ bdev_release(bdev_handle);
out_free_td:
kfree(td);
return ERR_PTR(r);
@@ -771,7 +775,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
{
if (md->disk->slave_dir)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
- blkdev_put(td->dm_dev.bdev, _dm_claim_ptr);
+ bdev_release(td->dm_dev.bdev_handle);
put_dax(td->dm_dev.dax_dev);
list_del(&td->list);
kfree(td);
@@ -1424,9 +1428,16 @@ static void __map_bio(struct bio *clone)
if (unlikely(dm_emulate_zone_append(md)))
r = dm_zone_map_bio(tio);
else
+ goto do_map;
+ } else {
+do_map:
+ if (likely(ti->type->map == linear_map))
+ r = linear_map(ti, clone);
+ else if (ti->type->map == stripe_map)
+ r = stripe_map(ti, clone);
+ else
r = ti->type->map(ti, clone);
- } else
- r = ti->type->map(ti, clone);
+ }
switch (r) {
case DM_MAPIO_SUBMITTED:
@@ -1471,15 +1482,15 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len)
static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
struct dm_target *ti, unsigned int num_bios,
- unsigned *len)
+ unsigned *len, gfp_t gfp_flag)
{
struct bio *bio;
- int try;
+ int try = (gfp_flag & GFP_NOWAIT) ? 0 : 1;
- for (try = 0; try < 2; try++) {
+ for (; try < 2; try++) {
int bio_nr;
- if (try)
+ if (try && num_bios > 1)
mutex_lock(&ci->io->md->table_devices_lock);
for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
bio = alloc_tio(ci, ti, bio_nr, len,
@@ -1489,7 +1500,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
bio_list_add(blist, bio);
}
- if (try)
+ if (try && num_bios > 1)
mutex_unlock(&ci->io->md->table_devices_lock);
if (bio_nr == num_bios)
return;
@@ -1499,34 +1510,31 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
}
}
-static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
- unsigned int num_bios, unsigned int *len)
+static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+ unsigned int num_bios, unsigned int *len,
+ gfp_t gfp_flag)
{
struct bio_list blist = BIO_EMPTY_LIST;
struct bio *clone;
unsigned int ret = 0;
- switch (num_bios) {
- case 0:
- break;
- case 1:
- if (len)
- setup_split_accounting(ci, *len);
- clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
- __map_bio(clone);
- ret = 1;
- break;
- default:
- if (len)
- setup_split_accounting(ci, *len);
- /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
- alloc_multiple_bios(&blist, ci, ti, num_bios, len);
- while ((clone = bio_list_pop(&blist))) {
+ if (WARN_ON_ONCE(num_bios == 0)) /* num_bios = 0 is a bug in caller */
+ return 0;
+
+ /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
+ if (len)
+ setup_split_accounting(ci, *len);
+
+ /*
+ * Using alloc_multiple_bios(), even if num_bios is 1, to consistently
+ * support allocating using GFP_NOWAIT with GFP_NOIO fallback.
+ */
+ alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag);
+ while ((clone = bio_list_pop(&blist))) {
+ if (num_bios > 1)
dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
- __map_bio(clone);
- ret += 1;
- }
- break;
+ __map_bio(clone);
+ ret += 1;
}
return ret;
@@ -1553,8 +1561,12 @@ static void __send_empty_flush(struct clone_info *ci)
unsigned int bios;
struct dm_target *ti = dm_table_get_target(t, i);
+ if (unlikely(ti->num_flush_bios == 0))
+ continue;
+
atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL, GFP_NOWAIT);
atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
}
@@ -1567,10 +1579,9 @@ static void __send_empty_flush(struct clone_info *ci)
bio_uninit(ci->bio);
}
-static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
- unsigned int num_bios,
- unsigned int max_granularity,
- unsigned int max_sectors)
+static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti,
+ unsigned int num_bios, unsigned int max_granularity,
+ unsigned int max_sectors)
{
unsigned int len, bios;
@@ -1578,7 +1589,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target
__max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, num_bios, &len);
+ bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO);
/*
* alloc_io() takes one extra reference for submission, so the
* reference won't reach 0 without the following (+1) subtraction
@@ -1647,8 +1658,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
if (unlikely(!num_bios))
return BLK_STS_NOTSUPP;
- __send_changing_extent_only(ci, ti, num_bios,
- max_granularity, max_sectors);
+ __send_abnormal_io(ci, ti, num_bios, max_granularity, max_sectors);
+
return BLK_STS_OK;
}
@@ -1707,10 +1718,6 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
if (unlikely(!ti))
return BLK_STS_IOERR;
- if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
- unlikely(!dm_target_supports_nowait(ti->type)))
- return BLK_STS_NOTSUPP;
-
if (unlikely(ci->is_abnormal_io))
return __process_abnormal_io(ci, ti);
@@ -1722,7 +1729,17 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
setup_split_accounting(ci, len);
- clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
+
+ if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) {
+ if (unlikely(!dm_target_supports_nowait(ti->type)))
+ return BLK_STS_NOTSUPP;
+
+ clone = alloc_tio(ci, ti, 0, &len, GFP_NOWAIT);
+ if (unlikely(!clone))
+ return BLK_STS_AGAIN;
+ } else {
+ clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
+ }
__map_bio(clone);
ci->sector += len;
@@ -1731,11 +1748,11 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
return BLK_STS_OK;
}
-static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
+static void init_clone_info(struct clone_info *ci, struct dm_io *io,
struct dm_table *map, struct bio *bio, bool is_abnormal)
{
ci->map = map;
- ci->io = alloc_io(md, bio);
+ ci->io = io;
ci->bio = bio;
ci->is_abnormal_io = is_abnormal;
ci->submit_as_polled = false;
@@ -1770,8 +1787,18 @@ static void dm_split_and_process_bio(struct mapped_device *md,
return;
}
- init_clone_info(&ci, md, map, bio, is_abnormal);
- io = ci.io;
+ /* Only support nowait for normal IO */
+ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
+ io = alloc_io(md, bio, GFP_NOWAIT);
+ if (unlikely(!io)) {
+ /* Unable to do anything without dm_io. */
+ bio_wouldblock_error(bio);
+ return;
+ }
+ } else {
+ io = alloc_io(md, bio, GFP_NOIO);
+ }
+ init_clone_info(&ci, io, map, bio, is_abnormal);
if (bio->bi_opf & REQ_PREFLUSH) {
__send_empty_flush(&ci);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f682295af..7f1acbf6b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -188,9 +188,11 @@ void dm_kobject_release(struct kobject *kobj);
/*
* Targets for linear and striped mappings
*/
+int linear_map(struct dm_target *ti, struct bio *bio);
int dm_linear_init(void);
void dm_linear_exit(void);
+int stripe_map(struct dm_target *ti, struct bio *bio);
int dm_stripe_init(void);
void dm_stripe_exit(void);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 6eaa0eab4..4b80165af 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
return;
}
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
goto out_mddev_put;
@@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
if (err)
pr_warn("md: starting %s failed\n", name);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
out_mddev_put:
mddev_put(mddev);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 6f9ff1497..9672f75c3 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev)
md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2348,14 +2348,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
{
int rv;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
+
if (mddev->pers) {
- if (!mddev->pers->quiesce) {
- rv = -EBUSY;
- goto out;
- }
if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY;
goto out;
@@ -2369,11 +2366,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EBUSY;
goto out;
}
- if (mddev->pers) {
- mddev_suspend(mddev);
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- }
+
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
@@ -2383,6 +2377,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
+ struct bitmap *bitmap;
+
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
else if (strncmp(buf, "file:", 5) == 0) {
@@ -2406,25 +2402,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
goto out;
}
+
mddev->bitmap_info.offset = offset;
- if (mddev->pers) {
- struct bitmap *bitmap;
- bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
- if (IS_ERR(bitmap))
- rv = PTR_ERR(bitmap);
- else {
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
- if (rv)
- mddev->bitmap_info.offset = 0;
- }
- if (rv) {
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- goto out;
- }
- mddev_resume(mddev);
+ bitmap = md_bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ goto out;
+ }
+
+ mddev->bitmap = bitmap;
+ rv = md_bitmap_load(mddev);
+ if (rv) {
+ mddev->bitmap_info.offset = 0;
+ md_bitmap_destroy(mddev);
+ goto out;
}
}
}
@@ -2437,7 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
rv = 0;
out:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (rv)
return rv;
return len;
@@ -2546,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
@@ -2571,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, false);
+ mddev_destroy_serial_pool(mddev, NULL);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return len;
}
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1e26eb223..8e36a0fee 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -501,7 +501,7 @@ static void process_suspend_info(struct mddev *mddev,
mddev->pers->quiesce(mddev, 0);
}
-static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
+static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
char disk_uuid[64];
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -509,6 +509,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
char raid_slot[16];
char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
int len;
+ int res = 0;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
@@ -517,9 +518,14 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
- wait_for_completion_timeout(&cinfo->newdisk_completion,
- NEW_DEV_TIMEOUT);
+ if (!wait_for_completion_timeout(&cinfo->newdisk_completion,
+ NEW_DEV_TIMEOUT)) {
+ pr_err("md-cluster(%s:%d): timeout on a new disk adding\n",
+ __func__, __LINE__);
+ res = -1;
+ }
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+ return res;
}
@@ -594,7 +600,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
le64_to_cpu(msg->high));
break;
case NEWDISK:
- process_add_new_disk(mddev, msg);
+ if (process_add_new_disk(mddev, msg))
+ ret = -1;
break;
case REMOVE:
process_remove_disk(mddev, msg);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 71ac99646..8eca7693b 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
if (!conf)
return NULL;
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
cnt = 0;
conf->array_sectors = 0;
@@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
- /*
- * conf->raid_disks is copy of mddev->raid_disks. The reason to
- * keep a copy of mddev->raid_disks in struct linear_conf is,
- * mddev->raid_disks may not be consistent with pointers number of
- * conf->disks[] when it is updated in linear_add() and used to
- * iterate old conf->disks[] earray in linear_congested().
- * Here conf->raid_disks is always consitent with number of
- * pointers in conf->disks[] array, and mddev->private is updated
- * with rcu_assign_pointer() in linear_addr(), such race can be
- * avoided.
- */
- conf->raid_disks = raid_disks;
-
return conf;
out:
@@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
* in linear_congested(), therefore kfree_rcu() is used to free
* oldconf until no one uses it anymore.
*/
- mddev_suspend(mddev);
oldconf = rcu_dereference_protected(mddev->private,
lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
@@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
- mddev_resume(mddev);
kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
index 24e97db50..5587eeedb 100644
--- a/drivers/md/md-linear.h
+++ b/drivers/md/md-linear.h
@@ -12,6 +12,6 @@ struct linear_conf
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
- struct dev_info disks[];
+ struct dev_info disks[] __counted_by(raid_disks);
};
#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8c40c1c39..58889bc72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -82,6 +82,14 @@ static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
+
+/*
+ * This workqueue is used for sync_work to register new sync_thread, and for
+ * del_work to remove rdev, and for event_work that is only set by dm-raid.
+ *
+ * Noted that sync_work will grab reconfig_mutex, hence never flush this
+ * workqueue whith reconfig_mutex grabbed.
+ */
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;
@@ -91,6 +99,18 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -206,8 +226,7 @@ static int rdev_need_serial(struct md_rdev *rdev)
* 1. rdev is the first device which return true from rdev_enable_serial.
* 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/
-void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
int ret = 0;
@@ -215,15 +234,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
!test_bit(CollisionCheck, &rdev->flags))
return;
- if (!is_suspend)
- mddev_suspend(mddev);
-
if (!rdev)
ret = rdevs_init_serial(mddev);
else
ret = rdev_init_serial(rdev);
if (ret)
- goto abort;
+ return;
if (mddev->serial_info_pool == NULL) {
/*
@@ -238,10 +254,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
pr_err("can't alloc memory pool for serialization\n");
}
}
-
-abort:
- if (!is_suspend)
- mddev_resume(mddev);
}
/*
@@ -250,8 +262,7 @@ abort:
* 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
-void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
@@ -260,8 +271,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *temp;
int num = 0; /* used to track if other rdevs need the pool */
- if (!is_suspend)
- mddev_suspend(mddev);
rdev_for_each(temp, mddev) {
if (!rdev) {
if (!mddev->serialize_policy ||
@@ -283,8 +292,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
- if (!is_suspend)
- mddev_resume(mddev);
}
}
@@ -305,7 +312,6 @@ static struct ctl_table raid_table[] = {
.mode = S_IRUGO|S_IWUSR,
.proc_handler = proc_dointvec,
},
- { }
};
static int start_readonly;
@@ -346,6 +352,10 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
+static bool is_md_suspended(struct mddev *mddev)
+{
+ return percpu_ref_is_dying(&mddev->active_io);
+}
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -359,11 +369,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
if (bio_data_dir(bio) != WRITE)
return false;
- if (mddev->suspend_lo >= mddev->suspend_hi)
+ if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+ if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio_end_sector(bio) < mddev->suspend_lo)
+ if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
return false;
return true;
}
@@ -431,42 +441,73 @@ static void md_submit_bio(struct bio *bio)
md_handle_request(mddev, bio);
}
-/* mddev_suspend makes sure no new requests are submitted
- * to the device, and that any requests that have been submitted
- * are completely handled.
- * Once mddev_detach() is called and completes, the module will be
- * completely unused.
+/*
+ * Make sure no new requests are submitted to the device, and any requests that
+ * have been submitted are completely handled.
*/
-void mddev_suspend(struct mddev *mddev)
+int mddev_suspend(struct mddev *mddev, bool interruptible)
{
- struct md_thread *thread = rcu_dereference_protected(mddev->thread,
- lockdep_is_held(&mddev->reconfig_mutex));
+ int err = 0;
- WARN_ON_ONCE(thread && current == thread->tsk);
- if (mddev->suspended++)
- return;
- wake_up(&mddev->sb_wait);
- set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
- percpu_ref_kill(&mddev->active_io);
+ /*
+ * hold reconfig_mutex to wait for normal io will deadlock, because
+ * other context can't update super_block, and normal io can rely on
+ * updating super_block.
+ */
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
- if (mddev->pers && mddev->pers->prepare_suspend)
- mddev->pers->prepare_suspend(mddev);
+ if (interruptible)
+ err = mutex_lock_interruptible(&mddev->suspend_mutex);
+ else
+ mutex_lock(&mddev->suspend_mutex);
+ if (err)
+ return err;
- wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
- clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
- wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
+ if (mddev->suspended) {
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
+ }
+
+ percpu_ref_kill(&mddev->active_io);
+ if (interruptible)
+ err = wait_event_interruptible(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ else
+ wait_event(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ if (err) {
+ percpu_ref_resurrect(&mddev->active_io);
+ mutex_unlock(&mddev->suspend_mutex);
+ return err;
+ }
+
+ /*
+ * For raid456, io might be waiting for reshape to make progress,
+ * allow new reshape to start while waiting for io to be done to
+ * prevent deadlock.
+ */
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
+
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);
-void mddev_resume(struct mddev *mddev)
+static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
{
- lockdep_assert_held(&mddev->reconfig_mutex);
- if (--mddev->suspended)
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
+
+ mutex_lock(&mddev->suspend_mutex);
+ WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
+ if (mddev->suspended) {
+ mutex_unlock(&mddev->suspend_mutex);
return;
+ }
/* entred the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
@@ -474,9 +515,17 @@ void mddev_resume(struct mddev *mddev)
percpu_ref_resurrect(&mddev->active_io);
wake_up(&mddev->sb_wait);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ if (recovery_needed)
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
+ mutex_unlock(&mddev->suspend_mutex);
+}
+
+void mddev_resume(struct mddev *mddev)
+{
+ return __mddev_resume(mddev, true);
}
EXPORT_SYMBOL_GPL(mddev_resume);
@@ -530,8 +579,12 @@ static void submit_flushes(struct work_struct *ws)
rcu_read_lock();
}
rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending))
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pair is percpu_ref_get() from md_flush_request() */
+ percpu_ref_put(&mddev->active_io);
+
queue_work(md_wq, &mddev->flush_work);
+ }
}
static void md_submit_flush_data(struct work_struct *ws)
@@ -626,34 +679,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
static void mddev_delayed_delete(struct work_struct *ws);
+static void __mddev_put(struct mddev *mddev)
+{
+ if (mddev->raid_disks || !list_empty(&mddev->disks) ||
+ mddev->ctime || mddev->hold_active)
+ return;
+
+ /* Array is not configured at all, and not held active, so destroy it */
+ set_bit(MD_DELETED, &mddev->flags);
+
+ /*
+ * Call queue_work inside the spinlock so that flush_workqueue() after
+ * mddev_find will succeed in waiting for the work to be done.
+ */
+ queue_work(md_misc_wq, &mddev->del_work);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
- if (!mddev->raid_disks && list_empty(&mddev->disks) &&
- mddev->ctime == 0 && !mddev->hold_active) {
- /* Array is not configured at all, and not held active,
- * so destroy it */
- set_bit(MD_DELETED, &mddev->flags);
- /*
- * Call queue_work inside the spinlock so that
- * flush_workqueue() after mddev_find will succeed in waiting
- * for the work to be done.
- */
- INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- queue_work(md_misc_wq, &mddev->del_work);
- }
+ __mddev_put(mddev);
spin_unlock(&all_mddevs_lock);
}
static void md_safemode_timeout(struct timer_list *t);
+static void md_start_sync(struct work_struct *ws);
+
+static void active_io_release(struct percpu_ref *ref)
+{
+ struct mddev *mddev = container_of(ref, struct mddev, active_io);
+
+ wake_up(&mddev->sb_wait);
+}
+
+static void no_op(struct percpu_ref *r) {}
-void mddev_init(struct mddev *mddev)
+int mddev_init(struct mddev *mddev)
{
+
+ if (percpu_ref_init(&mddev->active_io, active_io_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ percpu_ref_exit(&mddev->active_io);
+ return -ENOMEM;
+ }
+
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
+
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
+ mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -672,9 +754,21 @@ void mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
+
+ INIT_WORK(&mddev->sync_work, md_start_sync);
+ INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);
+void mddev_destroy(struct mddev *mddev)
+{
+ percpu_ref_exit(&mddev->active_io);
+ percpu_ref_exit(&mddev->writes_pending);
+}
+EXPORT_SYMBOL_GPL(mddev_destroy);
+
static struct mddev *mddev_find_locked(dev_t unit)
{
struct mddev *mddev;
@@ -718,13 +812,16 @@ static struct mddev *mddev_alloc(dev_t unit)
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
- mddev_init(new);
+
+ error = mddev_init(new);
+ if (error)
+ goto out_free_new;
spin_lock(&all_mddevs_lock);
if (unit) {
error = -EEXIST;
if (mddev_find_locked(unit))
- goto out_free_new;
+ goto out_destroy_new;
new->unit = unit;
if (MAJOR(unit) == MD_MAJOR)
new->md_minor = MINOR(unit);
@@ -735,7 +832,7 @@ static struct mddev *mddev_alloc(dev_t unit)
error = -ENODEV;
new->unit = mddev_alloc_unit();
if (!new->unit)
- goto out_free_new;
+ goto out_destroy_new;
new->md_minor = MINOR(new->unit);
new->hold_active = UNTIL_STOP;
}
@@ -743,8 +840,11 @@ static struct mddev *mddev_alloc(dev_t unit)
list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock);
return new;
-out_free_new:
+
+out_destroy_new:
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(new);
+out_free_new:
kfree(new);
return ERR_PTR(error);
}
@@ -755,6 +855,7 @@ static void mddev_free(struct mddev *mddev)
list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(mddev);
kfree(mddev);
}
@@ -940,9 +1041,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
return;
bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
- 1,
- REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
- GFP_NOIO, &mddev->sync_set);
+ 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
+ | REQ_PREFLUSH | REQ_FUA,
+ GFP_NOIO, &mddev->sync_set);
atomic_inc(&rdev->nr_pending);
@@ -1122,6 +1224,7 @@ struct super_type {
struct md_rdev *refdev,
int minor_version);
int (*validate_super)(struct mddev *mddev,
+ struct md_rdev *freshest,
struct md_rdev *rdev);
void (*sync_super)(struct mddev *mddev,
struct md_rdev *rdev);
@@ -1259,8 +1362,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
/*
* validate_super for 0.90.0
+ * note: we are not using "freshest" for 0.9 superblock
*/
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
mdp_disk_t *desc;
mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1772,7 +1876,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
return ret;
}
-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
@@ -1868,13 +1972,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
- * spares (which don't need an event count) */
- ++ev1;
+ * spares (which don't need an event count).
+ * Similar to mdadm, we allow event counter difference of 1
+ * from the freshest device.
+ */
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
- if (ev1 < mddev->events)
+ if (ev1 + 1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
/* If adding to array with a bitmap, then we can accept an
@@ -1895,8 +2001,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
- } else
+ } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+ /*
+ * If we are assembling, and our event counter is smaller than the
+ * highest event counter, we cannot trust our superblock about the role.
+ * It could happen that our rdev was marked as Faulty, and all other
+ * superblocks were updated with +1 event counter.
+ * Then, before the next superblock update, which typically happens when
+ * remove_and_add_spares() removes the device from the array, there was
+ * a crash or reboot.
+ * If we allow current rdev without consulting the freshest superblock,
+ * we could cause data corruption.
+ * Note that in this case our event counter is smaller by 1 than the
+ * highest, otherwise, this rdev would not be allowed into array;
+ * both kernel and mdadm allow event counter difference of 1.
+ */
+ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+ u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+ if (rdev->desc_nr >= freshest_max_dev) {
+ /* this is unexpected, better not proceed */
+ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+ mdname(mddev), rdev->bdev, rdev->desc_nr,
+ freshest->bdev, freshest_max_dev);
+ return -EUCLEAN;
+ }
+
+ role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+ pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+ mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+ } else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ }
switch(role) {
case MD_DISK_ROLE_SPARE: /* spare */
break;
@@ -2422,7 +2558,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2462,8 +2598,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
- blkdev_put(rdev->bdev,
- test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev);
+ bdev_release(rdev->bdev_handle);
rdev->bdev = NULL;
kobject_put(&rdev->kobj);
}
@@ -2475,7 +2610,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2804,12 +2939,8 @@ static int add_bound_rdev(struct md_rdev *rdev)
* and should be added immediately.
*/
super_types[mddev->major_version].
- validate_super(mddev, rdev);
- if (add_journal)
- mddev_suspend(mddev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
err = mddev->pers->hot_add_disk(mddev, rdev);
- if (add_journal)
- mddev_resume(mddev);
if (err) {
md_kick_rdev_from_array(rdev);
return err;
@@ -2946,11 +3077,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_serial_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev);
need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
need_update_sb = true;
err = 0;
@@ -3562,6 +3693,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
+ bool suspend = false;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3569,17 +3701,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+ if (!mddev)
+ return -ENODEV;
- if (entry->store == state_store && cmd_match(page, "remove"))
- kn = sysfs_break_active_protection(kobj, attr);
+ if (entry->store == state_store) {
+ if (cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+ if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
+ cmd_match(page, "writemostly") ||
+ cmd_match(page, "-writemostly"))
+ suspend = true;
+ }
- rv = mddev ? mddev_lock(mddev) : -ENODEV;
+ rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
if (!rv) {
if (rdev->mddev == NULL)
rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
}
if (kn)
@@ -3643,7 +3783,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
struct md_rdev *rdev;
- struct md_rdev *holder;
sector_t size;
int err;
@@ -3658,21 +3797,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
if (err)
goto out_clear_rdev;
- if (super_format == -2) {
- holder = &claim_rdev;
- } else {
- holder = rdev;
- set_bit(Holder, &rdev->flags);
- }
-
- rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- holder, NULL);
- if (IS_ERR(rdev->bdev)) {
+ rdev->bdev_handle = bdev_open_by_dev(newdev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE,
+ super_format == -2 ? &claim_rdev : rdev, NULL);
+ if (IS_ERR(rdev->bdev_handle)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(newdev), MINOR(newdev));
- err = PTR_ERR(rdev->bdev);
+ err = PTR_ERR(rdev->bdev_handle);
goto out_clear_rdev;
}
+ rdev->bdev = rdev->bdev_handle->bdev;
kobject_init(&rdev->kobj, &rdev_ktype);
@@ -3703,7 +3837,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
return rdev;
out_blkdev_put:
- blkdev_put(rdev->bdev, holder);
+ bdev_release(rdev->bdev_handle);
out_clear_rdev:
md_rdev_clear(rdev);
out_free_rdev:
@@ -3742,7 +3876,7 @@ static int analyze_sbs(struct mddev *mddev)
}
super_types[mddev->major_version].
- validate_super(mddev, freshest);
+ validate_super(mddev, NULL/*freshest*/, freshest);
i = 0;
rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3757,7 +3891,7 @@ static int analyze_sbs(struct mddev *mddev)
}
if (rdev != freshest) {
if (super_types[mddev->major_version].
- validate_super(mddev, rdev)) {
+ validate_super(mddev, freshest, rdev)) {
pr_warn("md: kicking non-fresh %pg from array!\n",
rdev->bdev);
md_kick_rdev_from_array(rdev);
@@ -3884,12 +4018,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (slen == 0 || slen >= sizeof(clevel))
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
if (mddev->pers == NULL) {
- strncpy(mddev->clevel, buf, slen);
+ memcpy(mddev->clevel, buf, slen);
if (mddev->clevel[slen-1] == '\n')
slen--;
mddev->clevel[slen] = 0;
@@ -3922,7 +4056,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Now find the new personality */
- strncpy(clevel, buf, slen);
+ memcpy(clevel, buf, slen);
if (clevel[slen-1] == '\n')
slen--;
clevel[slen] = 0;
@@ -3977,7 +4111,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Looks like we have a winner */
- mddev_suspend(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
@@ -4063,14 +4196,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event();
rv = len;
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return rv;
}
@@ -4378,6 +4510,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
+ /* No lock dependent actions */
+ switch (st) {
+ case suspended: /* not supported yet */
+ case write_pending: /* cannot be set */
+ case active_idle: /* cannot be set */
+ case broken: /* cannot be set */
+ case bad_word:
+ return -EINVAL;
+ default:
+ break;
+ }
+
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
@@ -4402,23 +4546,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev);
if (err)
return err;
- err = -EINVAL;
- switch(st) {
- case bad_word:
- break;
- case clear:
- /* stopping an active array */
- err = do_md_stop(mddev, 0, NULL);
- break;
+
+ switch (st) {
case inactive:
- /* stopping an active array */
+ /* stop an active array, return 0 otherwise */
if (mddev->pers)
err = do_md_stop(mddev, 2, NULL);
- else
- err = 0; /* already inactive */
break;
- case suspended:
- break; /* not supported yet */
+ case clear:
+ err = do_md_stop(mddev, 0, NULL);
+ break;
case readonly:
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
@@ -4469,10 +4606,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = do_md_run(mddev);
}
break;
- case write_pending:
- case active_idle:
- case broken:
- /* these cannot be set */
+ default:
+ err = -EINVAL;
break;
}
@@ -4545,7 +4680,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->persistent) {
@@ -4566,14 +4701,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev)) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return PTR_ERR(rdev);
}
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
return err ? err : len;
@@ -4708,7 +4843,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
size_t namelen = len-9;
if (namelen >= sizeof(mddev->metadata_type))
namelen = sizeof(mddev->metadata_type)-1;
- strncpy(mddev->metadata_type, buf+9, namelen);
+ memcpy(mddev->metadata_type, buf+9, namelen);
mddev->metadata_type[namelen] = 0;
if (namelen && mddev->metadata_type[namelen-1] == '\n')
mddev->metadata_type[--namelen] = 0;
@@ -4768,25 +4903,29 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type);
}
-static void stop_sync_thread(struct mddev *mddev)
+/**
+ * stop_sync_thread() - wait for sync_thread to stop if it's running.
+ * @mddev: the array.
+ * @locked: if set, reconfig_mutex will still be held after this function
+ * return; if not set, reconfig_mutex will be released after this
+ * function return.
+ * @check_seq: if set, only wait for curent running sync_thread to stop, noted
+ * that new sync_thread can still start.
+ */
+static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
{
- if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return;
+ int sync_seq;
- if (mddev_lock(mddev))
- return;
+ if (check_seq)
+ sync_seq = atomic_read(&mddev->sync_seq);
- /*
- * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
- * held.
- */
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
- mddev_unlock(mddev);
+ if (!locked)
+ mddev_unlock(mddev);
return;
}
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
+ mddev_unlock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
/*
@@ -4794,21 +4933,28 @@ static void stop_sync_thread(struct mddev *mddev)
* never happen
*/
md_wakeup_thread_directly(mddev->sync_thread);
+ if (work_pending(&mddev->sync_work))
+ flush_work(&mddev->sync_work);
- mddev_unlock(mddev);
+ wait_event(resync_wait,
+ !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
+
+ if (locked)
+ mddev_lock_nointr(mddev);
}
static void idle_sync_thread(struct mddev *mddev)
{
- int sync_seq = atomic_read(&mddev->sync_seq);
-
mutex_lock(&mddev->sync_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev);
- wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
- !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+ if (mddev_lock(mddev)) {
+ mutex_unlock(&mddev->sync_mutex);
+ return;
+ }
+ stop_sync_thread(mddev, false, true);
mutex_unlock(&mddev->sync_mutex);
}
@@ -4816,11 +4962,13 @@ static void frozen_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev);
- wait_event(resync_wait, mddev->sync_thread == NULL &&
- !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+ if (mddev_lock(mddev)) {
+ mutex_unlock(&mddev->sync_mutex);
+ return;
+ }
+ stop_sync_thread(mddev, false, false);
mutex_unlock(&mddev->sync_mutex);
}
@@ -4882,6 +5030,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
@@ -5138,7 +5287,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_lo));
}
static ssize_t
@@ -5153,21 +5303,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_lo = new;
+
+ WRITE_ONCE(mddev->suspend_lo, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -5175,7 +5318,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_hi));
}
static ssize_t
@@ -5190,21 +5334,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_hi = new;
+ WRITE_ONCE(mddev->suspend_hi, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -5451,7 +5588,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
if (value == mddev->serialize_policy)
return len;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->pers == NULL || (mddev->pers->level != 1)) {
@@ -5460,15 +5597,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
goto unlock;
}
- mddev_suspend(mddev);
if (value)
- mddev_create_serial_pool(mddev, NULL, true);
+ mddev_create_serial_pool(mddev, NULL);
else
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mddev->serialize_policy = value;
- mddev_resume(mddev);
unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -5607,21 +5742,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
-static void no_op(struct percpu_ref *r) {}
-
-int mddev_init_writes_pending(struct mddev *mddev)
-{
- if (mddev->writes_pending.percpu_count_ptr)
- return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
- return -ENOMEM;
- /* We want to start with the refcount at zero */
- percpu_ref_put(&mddev->writes_pending);
- return 0;
-}
-EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-
struct mddev *md_alloc(dev_t dev, char *name)
{
/*
@@ -5793,12 +5913,6 @@ static void md_safemode_timeout(struct timer_list *t)
}
static int start_dirty_degraded;
-static void active_io_release(struct percpu_ref *ref)
-{
- struct mddev *mddev = container_of(ref, struct mddev, active_io);
-
- wake_up(&mddev->sb_wait);
-}
int md_run(struct mddev *mddev)
{
@@ -5879,15 +5993,10 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev);
}
- err = percpu_ref_init(&mddev->active_io, active_io_release,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
- if (err)
- return err;
-
if (!bioset_initialized(&mddev->bio_set)) {
err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (err)
- goto exit_active_io;
+ return err;
}
if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
@@ -6084,8 +6193,6 @@ exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
-exit_active_io:
- percpu_ref_exit(&mddev->active_io);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6233,14 +6340,7 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
- }
-
+ stop_sync_thread(mddev, true, false);
del_timer_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
@@ -6259,7 +6359,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
/* disable policy to guarantee rdevs free resources for serialization */
mddev->serialize_policy = 0;
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
}
void md_stop_writes(struct mddev *mddev)
@@ -6287,9 +6387,6 @@ static void __md_stop(struct mddev *mddev)
struct md_personality *pers = mddev->pers;
md_bitmap_destroy(mddev);
mddev_detach(mddev);
- /* Ensure ->event_work is done */
- if (mddev->event_work.func)
- flush_workqueue(md_misc_wq);
spin_lock(&mddev->lock);
mddev->pers = NULL;
spin_unlock(&mddev->lock);
@@ -6301,7 +6398,6 @@ static void __md_stop(struct mddev *mddev)
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set);
@@ -6316,7 +6412,6 @@ void md_stop(struct mddev *mddev)
*/
__md_stop_writes(mddev);
__md_stop(mddev);
- percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -6334,18 +6429,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- /*
- * Thread might be blocked waiting for metadata update which will now
- * never happen
- */
- md_wakeup_thread_directly(mddev->sync_thread);
-
- mddev_unlock(mddev);
- wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
- &mddev->recovery));
+ stop_sync_thread(mddev, false, false);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
@@ -6399,20 +6484,8 @@ static int do_md_stop(struct mddev *mddev, int mode,
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
- /*
- * Thread might be blocked waiting for metadata update which will now
- * never happen
- */
- md_wakeup_thread_directly(mddev->sync_thread);
- mddev_unlock(mddev);
- wait_event(resync_wait, (mddev->sync_thread == NULL &&
- !test_bit(MD_RECOVERY_RUNNING,
- &mddev->recovery)));
- mddev_lock_nointr(mddev);
+ stop_sync_thread(mddev, true, false);
mutex_lock(&mddev->open_mutex);
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
@@ -6555,13 +6628,13 @@ static void autorun_devices(int part)
if (IS_ERR(mddev))
break;
- if (mddev_lock(mddev))
+ if (mddev_suspend_and_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
pr_warn("md: %s already running, cannot run %pg\n",
mdname(mddev), rdev0->bdev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
} else {
pr_debug("md: created %s\n", mdname(mddev));
mddev->persistent = 1;
@@ -6571,7 +6644,7 @@ static void autorun_devices(int part)
export_rdev(rdev, mddev);
}
autorun_array(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
/* on success, candidates will be empty, on error
* it won't...
@@ -6809,7 +6882,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
- validate_super(mddev, rdev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
if ((info->state & (1<<MD_DISK_SYNC)) &&
rdev->raid_disk != info->raid_disk) {
/* This was a hot-add request, but events doesn't
@@ -7121,7 +7194,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
@@ -7131,11 +7203,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
md_bitmap_destroy(mddev);
fd = -1;
}
- mddev_resume(mddev);
} else if (fd < 0) {
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
}
}
if (fd < 0) {
@@ -7424,7 +7493,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
@@ -7432,7 +7500,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = PTR_ERR(bitmap);
if (rv)
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
@@ -7457,9 +7524,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7530,6 +7595,20 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
+static bool md_ioctl_need_suspend(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case SET_BITMAP_FILE:
+ case SET_ARRAY_INFO:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
mdu_array_info_t info;
@@ -7658,7 +7737,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
- err = mddev_lock(mddev);
+
+ if (!md_is_rdwr(mddev))
+ flush_work(&mddev->sync_work);
+
+ err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
+ mddev_lock(mddev);
if (err) {
pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
err, cmd);
@@ -7786,7 +7870,10 @@ unlock:
if (mddev->hold_active == UNTIL_IOCTL &&
err != -EINVAL)
mddev->hold_active = 0;
- mddev_unlock(mddev);
+
+ md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
+ mddev_unlock(mddev);
+
out:
if(did_set_md_closing)
clear_bit(MD_CLOSING, &mddev->flags);
@@ -7898,7 +7985,6 @@ static void md_free_disk(struct gendisk *disk)
{
struct mddev *mddev = disk->private_data;
- percpu_ref_exit(&mddev->writes_pending);
mddev_free(mddev);
}
@@ -8075,6 +8161,19 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
+static void status_personalities(struct seq_file *seq)
+{
+ struct md_personality *pers;
+
+ seq_puts(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_puts(seq, "\n");
+}
+
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
@@ -8214,105 +8313,43 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&all_mddevs_lock)
{
- struct list_head *tmp;
- loff_t l = *pos;
- struct mddev *mddev;
-
- if (l == 0x10000) {
- ++*pos;
- return (void *)2;
- }
- if (l > 0x10000)
- return NULL;
- if (!l--)
- /* header */
- return (void*)1;
-
+ seq->poll_event = atomic_read(&md_event_count);
spin_lock(&all_mddevs_lock);
- list_for_each(tmp,&all_mddevs)
- if (!l--) {
- mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (!mddev_get(mddev))
- continue;
- spin_unlock(&all_mddevs_lock);
- return mddev;
- }
- spin_unlock(&all_mddevs_lock);
- if (!l--)
- return (void*)2;/* tail */
- return NULL;
+
+ return seq_list_start_head(&all_mddevs, *pos);
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *tmp;
- struct mddev *next_mddev, *mddev = v;
- struct mddev *to_put = NULL;
-
- ++*pos;
- if (v == (void*)2)
- return NULL;
-
- spin_lock(&all_mddevs_lock);
- if (v == (void*)1) {
- tmp = all_mddevs.next;
- } else {
- to_put = mddev;
- tmp = mddev->all_mddevs.next;
- }
-
- for (;;) {
- if (tmp == &all_mddevs) {
- next_mddev = (void*)2;
- *pos = 0x10000;
- break;
- }
- next_mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (mddev_get(next_mddev))
- break;
- mddev = next_mddev;
- tmp = mddev->all_mddevs.next;
- }
- spin_unlock(&all_mddevs_lock);
-
- if (to_put)
- mddev_put(to_put);
- return next_mddev;
-
+ return seq_list_next(v, &all_mddevs, pos);
}
static void md_seq_stop(struct seq_file *seq, void *v)
+ __releases(&all_mddevs_lock)
{
- struct mddev *mddev = v;
-
- if (mddev && v != (void*)1 && v != (void*)2)
- mddev_put(mddev);
+ spin_unlock(&all_mddevs_lock);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
- struct mddev *mddev = v;
+ struct mddev *mddev;
sector_t sectors;
struct md_rdev *rdev;
- if (v == (void*)1) {
- struct md_personality *pers;
- seq_printf(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
-
- spin_unlock(&pers_lock);
- seq_printf(seq, "\n");
- seq->poll_event = atomic_read(&md_event_count);
+ if (v == &all_mddevs) {
+ status_personalities(seq);
+ if (list_empty(&all_mddevs))
+ status_unused(seq);
return 0;
}
- if (v == (void*)2) {
- status_unused(seq);
+
+ mddev = list_entry(v, struct mddev, all_mddevs);
+ if (!mddev_get(mddev))
return 0;
- }
+ spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
@@ -8383,6 +8420,13 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ spin_lock(&all_mddevs_lock);
+
+ if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
+ status_unused(seq);
+
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
return 0;
}
@@ -8582,6 +8626,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -8772,12 +8817,16 @@ void md_do_sync(struct md_thread *thread)
int ret;
/* just incase thread restarts... */
- if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
- test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
- if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ goto skip;
+
+ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
+ !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- return;
+ goto skip;
}
if (mddev_is_clustered(mddev)) {
@@ -9168,12 +9217,90 @@ void md_do_sync(struct md_thread *thread)
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- wake_up(&mddev->sb_wait);
md_wakeup_thread(mddev->thread);
return;
}
EXPORT_SYMBOL_GPL(md_do_sync);
+static bool rdev_removeable(struct md_rdev *rdev)
+{
+ /* rdev is not used. */
+ if (rdev->raid_disk < 0)
+ return false;
+
+ /* There are still inflight io, don't remove this rdev. */
+ if (atomic_read(&rdev->nr_pending))
+ return false;
+
+ /*
+ * An error occurred but has not yet been acknowledged by the metadata
+ * handler, don't remove this rdev.
+ */
+ if (test_bit(Blocked, &rdev->flags))
+ return false;
+
+ /* Fautly rdev is not used, it's safe to remove it. */
+ if (test_bit(Faulty, &rdev->flags))
+ return true;
+
+ /* Journal disk can only be removed if it's faulty. */
+ if (test_bit(Journal, &rdev->flags))
+ return false;
+
+ /*
+ * 'In_sync' is cleared while 'raid_disk' is valid, which means
+ * replacement has just become active from pers->spare_active(), and
+ * then pers->hot_remove_disk() will replace this rdev with replacement.
+ */
+ if (!test_bit(In_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool rdev_is_spare(struct md_rdev *rdev)
+{
+ return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags);
+}
+
+static bool rdev_addable(struct md_rdev *rdev)
+{
+ /* rdev is already used, don't add it again. */
+ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
+ test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* Allow to add journal disk. */
+ if (test_bit(Journal, &rdev->flags))
+ return true;
+
+ /* Allow to add if array is read-write. */
+ if (md_is_rdwr(rdev->mddev))
+ return true;
+
+ /*
+ * For read-only array, only allow to readd a rdev. And if bitmap is
+ * used, don't allow to readd a rdev that is too old.
+ */
+ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool md_spares_need_change(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (rdev_removeable(rdev) || rdev_addable(rdev))
+ return true;
+ return false;
+}
+
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this)
{
@@ -9206,12 +9333,8 @@ static int remove_and_add_spares(struct mddev *mddev,
synchronize_rcu();
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) &&
- rdev->raid_disk >= 0 &&
- !test_bit(Blocked, &rdev->flags) &&
- ((test_bit(RemoveSynchronized, &rdev->flags) ||
- (!test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags))) &&
- atomic_read(&rdev->nr_pending)==0)) {
+ (test_bit(RemoveSynchronized, &rdev->flags) ||
+ rdev_removeable(rdev))) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
@@ -9233,25 +9356,12 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
- if (test_bit(Candidate, &rdev->flags))
- continue;
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
+ if (rdev_is_spare(rdev))
spares++;
- if (rdev->raid_disk >= 0)
+ if (!rdev_addable(rdev))
continue;
- if (test_bit(Faulty, &rdev->flags))
- continue;
- if (!test_bit(Journal, &rdev->flags)) {
- if (!md_is_rdwr(mddev) &&
- !(rdev->saved_raid_disk >= 0 &&
- !test_bit(Bitmap_sync, &rdev->flags)))
- continue;
-
+ if (!test_bit(Journal, &rdev->flags))
rdev->recovery_offset = 0;
- }
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
/* failure here is OK */
sysfs_link_rdev(mddev, rdev);
@@ -9267,30 +9377,152 @@ no_add:
return spares;
}
+static bool md_choose_sync_action(struct mddev *mddev, int *spares)
+{
+ /* Check if reshape is in progress first. */
+ if (mddev->reshape_position != MaxSector) {
+ if (mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev) != 0)
+ return false;
+
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /*
+ * Remove any failed drives, then add spares if possible. Spares are
+ * also removed and re-added, to allow the personality to fail the
+ * re-add.
+ */
+ *spares = remove_and_add_spares(mddev, NULL);
+ if (*spares) {
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+
+ /* Start new recovery. */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Check if recovery is in progress. */
+ if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Delay to choose resync/check/repair in md_do_sync(). */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ return true;
+
+ /* Nothing to be done */
+ return false;
+}
+
static void md_start_sync(struct work_struct *ws)
{
- struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ struct mddev *mddev = container_of(ws, struct mddev, sync_work);
+ int spares = 0;
+ bool suspend = false;
+ char *name;
+
+ /*
+ * If reshape is still in progress, spares won't be added or removed
+ * from conf until reshape is done.
+ */
+ if (mddev->reshape_position == MaxSector &&
+ md_spares_need_change(mddev)) {
+ suspend = true;
+ mddev_suspend(mddev, false);
+ }
+
+ mddev_lock_nointr(mddev);
+ if (!md_is_rdwr(mddev)) {
+ /*
+ * On a read-only array we can:
+ * - remove failed devices
+ * - add already-in_sync devices if the array itself is in-sync.
+ * As we only add devices that are already in-sync, we can
+ * activate the spares immediately.
+ */
+ remove_and_add_spares(mddev, NULL);
+ goto not_running;
+ }
+
+ if (!md_choose_sync_action(mddev, &spares))
+ goto not_running;
+
+ if (!mddev->pers->sync_request)
+ goto not_running;
+ /*
+ * We are adding a device or devices to an array which has the bitmap
+ * stored on all devices. So make sure all bitmap pages get written.
+ */
+ if (spares)
+ md_bitmap_write_all(mddev->bitmap);
+
+ name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
+ "reshape" : "resync";
rcu_assign_pointer(mddev->sync_thread,
- md_register_thread(md_do_sync, mddev, "resync"));
+ md_register_thread(md_do_sync, mddev, name));
if (!mddev->sync_thread) {
pr_warn("%s: could not start resync thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- } else
- md_wakeup_thread(mddev->sync_thread);
+ goto not_running;
+ }
+
+ mddev_unlock(mddev);
+ /*
+ * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
+ * not set it again. Otherwise, we may cause issue like this one:
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218200
+ * Therefore, use __mddev_resume(mddev, false).
+ */
+ if (suspend)
+ __mddev_resume(mddev, false);
+ md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
+ return;
+
+not_running:
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev_unlock(mddev);
+ /*
+ * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
+ * not set it again. Otherwise, we may cause issue like this one:
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218200
+ * Therefore, use __mddev_resume(mddev, false).
+ */
+ if (suspend)
+ __mddev_resume(mddev, false);
+
+ wake_up(&resync_wait);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+ mddev->sysfs_action)
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+}
+
+static void unregister_sync_thread(struct mddev *mddev)
+{
+ if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+ /* resync/recovery still happening */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ return;
+ }
+
+ if (WARN_ON_ONCE(!mddev->sync_thread))
+ return;
+
+ md_reap_sync_thread(mddev);
}
/*
@@ -9317,21 +9549,6 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
- }
-
- if (is_md_suspended(mddev))
- return;
-
if (mddev->bitmap)
md_bitmap_daemon_work(mddev);
@@ -9345,7 +9562,8 @@ void md_check_recovery(struct mddev *mddev)
}
if (!md_is_rdwr(mddev) &&
- !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
if ( ! (
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
@@ -9358,7 +9576,6 @@ void md_check_recovery(struct mddev *mddev)
return;
if (mddev_trylock(mddev)) {
- int spares = 0;
bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
@@ -9366,30 +9583,42 @@ void md_check_recovery(struct mddev *mddev)
if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ unregister_sync_thread(mddev);
+ goto unlock;
+ }
+
if (!mddev->external && mddev->in_sync)
- /* 'Blocked' flag not needed as failed devices
+ /*
+ * 'Blocked' flag not needed as failed devices
* will be recorded if array switched to read/write.
* Leaving it set will prevent the device
* from being removed.
*/
rdev_for_each(rdev, mddev)
clear_bit(Blocked, &rdev->flags);
- /* On a read-only array we can:
- * - remove failed devices
- * - add already-in_sync devices if the array itself
- * is in-sync.
- * As we only add devices that are already in-sync,
- * we can activate the spares immediately.
- */
- remove_and_add_spares(mddev, NULL);
- /* There is no thread, but we need to call
+
+ /*
+ * There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+
+ /*
+ * Let md_start_sync() to remove and add rdevs to the
+ * array.
+ */
+ if (md_spares_need_change(mddev)) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ queue_work(md_misc_wq, &mddev->sync_work);
+ }
+
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+
goto unlock;
}
@@ -9419,16 +9648,7 @@ void md_check_recovery(struct mddev *mddev)
* still set.
*/
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
- if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
- /* resync/recovery still happening */
- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- goto unlock;
- }
-
- if (WARN_ON_ONCE(!mddev->sync_thread))
- goto unlock;
-
- md_reap_sync_thread(mddev);
+ unregister_sync_thread(mddev);
goto unlock;
}
@@ -9445,56 +9665,14 @@ void md_check_recovery(struct mddev *mddev)
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
- if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- goto not_running;
- /* no recovery is running.
- * remove any failed drives, then
- * add spares if possible.
- * Spares are also removed and re-added, to allow
- * the personality to fail the re-add.
- */
-
- if (mddev->reshape_position != MaxSector) {
- if (mddev->pers->check_reshape == NULL ||
- mddev->pers->check_reshape(mddev) != 0)
- /* Cannot proceed */
- goto not_running;
- set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if ((spares = remove_and_add_spares(mddev, NULL))) {
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- /* nothing to be done ... */
- goto not_running;
-
- if (mddev->pers->sync_request) {
- if (spares) {
- /* We are adding a device or devices to an array
- * which has the bitmap stored on all devices.
- * So make sure all bitmap pages get written
- */
- md_bitmap_write_all(mddev->bitmap);
- }
- INIT_WORK(&mddev->del_work, md_start_sync);
- queue_work(md_misc_wq, &mddev->del_work);
- goto unlock;
- }
- not_running:
- if (!mddev->sync_thread) {
+ if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ queue_work(md_misc_wq, &mddev->sync_work);
+ } else {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
}
+
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7c9c13abd..ade83af12 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -59,6 +59,7 @@ struct md_rdev {
*/
struct block_device *meta_bdev;
struct block_device *bdev; /* block device handle */
+ struct bdev_handle *bdev_handle; /* Handle from open for bdev */
struct page *sb_page, *bb_page;
int sb_loaded;
@@ -211,9 +212,6 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
- Holder, /* rdev is used as holder while opening
- * underlying disk exclusively.
- */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -248,10 +246,6 @@ struct md_cluster_info;
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
- * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
- * without taking reconfig_mutex.
- * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
- * explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
@@ -268,8 +262,6 @@ enum mddev_flags {
MD_FAILFAST_SUPPORTED,
MD_HAS_PPL,
MD_HAS_MULTIPLE_PPLS,
- MD_ALLOW_SB_UPDATE,
- MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
MD_DELETED,
@@ -316,6 +308,7 @@ struct mddev {
unsigned long sb_flags;
int suspended;
+ struct mutex suspend_mutex;
struct percpu_ref active_io;
int ro;
int sysfs_active; /* set when sysfs deletes
@@ -453,7 +446,10 @@ struct mddev {
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
struct kernfs_node *sysfs_level; /*handle for 'level' */
- struct work_struct del_work; /* used for delayed sysfs removal */
+ /* used for delayed sysfs removal */
+ struct work_struct del_work;
+ /* used for register new sync thread */
+ struct work_struct sync_work;
/* "lock" protects:
* flush_bio transition from NULL to !NULL
@@ -567,23 +563,6 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static inline bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
-static inline bool is_md_suspended(struct mddev *mddev)
-{
- return percpu_ref_is_dying(&mddev->active_io);
-}
-
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -643,7 +622,6 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
- void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -768,7 +746,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
@@ -795,7 +772,8 @@ extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
-extern void mddev_init(struct mddev *mddev);
+extern int mddev_init(struct mddev *mddev);
+extern void mddev_destroy(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
@@ -806,15 +784,14 @@ extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
-extern void mddev_suspend(struct mddev *mddev);
+extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
-extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
-extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
+extern void mddev_destroy_serial_pool(struct mddev *mddev,
+ struct md_rdev *rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
@@ -852,6 +829,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+static inline int mddev_suspend_and_lock(struct mddev *mddev)
+{
+ int ret;
+
+ ret = mddev_suspend(mddev, true);
+ if (ret)
+ return ret;
+
+ ret = mddev_lock(mddev);
+ if (ret)
+ mddev_resume(mddev);
+
+ return ret;
+}
+
+static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
+{
+ mddev_suspend(mddev, false);
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline void mddev_unlock_and_resume(struct mddev *mddev)
+{
+ mddev_unlock(mddev);
+ mddev_resume(mddev);
+}
+
struct mdu_array_info_s;
struct mdu_disk_info_s;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 911670273..e138922d5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
bool write_behind = false;
+ bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* write-mostly, which means we could allocate write behind
* bio later.
*/
- if (rdev && test_bit(WriteMostly, &rdev->flags))
+ if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -3122,8 +3123,7 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev));
return -EIO;
}
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
+
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 023413120..b7b0a573e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev)
sector_t min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->private == NULL) {
conf = setup_conf(mddev);
if (IS_ERR(conf))
@@ -4310,11 +4307,7 @@ static int raid10_run(struct mddev *mddev)
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- rcu_assign_pointer(mddev->sync_thread,
- md_register_thread(md_do_sync, mddev, "reshape"));
- if (!mddev->sync_thread)
- goto out_free_conf;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
return 0;
@@ -4710,16 +4703,8 @@ out:
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-
- rcu_assign_pointer(mddev->sync_thread,
- md_register_thread(md_do_sync, mddev, "reshape"));
- if (!mddev->sync_thread) {
- ret = -EAGAIN;
- goto abort;
- }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
conf->reshape_checkpoint = jiffies;
- md_wakeup_thread(mddev->sync_thread);
md_new_event();
return 0;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 518b7cfa7..6157f5beb 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
int total_cached;
+ struct r5l_log *log = READ_ONCE(conf->log);
- if (!r5c_is_writeback(conf->log))
+ if (!r5c_is_writeback(log))
return;
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
@@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
- if (!r5c_is_writeback(conf->log))
+ struct r5l_log *log = READ_ONCE(conf->log);
+
+ if (!r5c_is_writeback(log))
return;
/*
@@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!r5c_is_writeback(log))
return 0;
@@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log)
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
BUG_ON(!r5c_is_writeback(log));
@@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh)
*/
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
- struct r5l_log *log = sh->raid_conf->log;
+ struct r5l_log *log = READ_ONCE(sh->raid_conf->log);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
@@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
- int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
@@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
- conf->log == NULL ||
- (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
- (locked = mddev_trylock(mddev))));
- if (locked) {
- mddev_suspend(mddev);
+ !READ_ONCE(conf->log) ||
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
+ log = READ_ONCE(conf->log);
+ if (log) {
+ mddev_suspend(mddev, false);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
- mddev_unlock(mddev);
}
}
@@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
struct stripe_head *sh;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t new_cp;
unsigned long flags;
@@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
return log->next_checkpoint;
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
- if (list_empty(&conf->log->stripe_in_journal_list)) {
+ if (list_empty(&log->stripe_in_journal_list)) {
/* all stripes flushed */
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return log->next_checkpoint;
}
- sh = list_first_entry(&conf->log->stripe_in_journal_list,
+ sh = list_first_entry(&log->stripe_in_journal_list,
struct stripe_head, r5c);
new_cp = sh->log_start;
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
struct stripe_head *sh, *next;
lockdep_assert_held(&conf->device_lock);
- if (!conf->log)
+ if (!READ_ONCE(conf->log))
return;
count = 0;
@@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
static void r5c_do_reclaim(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
struct stripe_head *sh;
int count = 0;
unsigned long flags;
@@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
@@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
/* don't allow write if journal disk is missing */
if (!log)
@@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode)
mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
- mddev_resume(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
if (strlen(r5c_journal_mode_str[mode]) == len &&
!strncmp(page, r5c_journal_mode_str[mode], len))
break;
- ret = mddev_lock(mddev);
+ ret = mddev_suspend_and_lock(mddev);
if (ret)
return ret;
ret = r5c_journal_mode_set(mddev, mode);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return ret ?: length;
}
@@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf,
struct stripe_head_state *s,
int disks)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
struct r5dev *dev;
int to_cache = 0;
@@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
@@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t tree_index;
void *slot;
@@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
- conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
@@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- conf->log = log;
+ WRITE_ONCE(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf)
* 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
* ensure disable_writeback_work wakes up and exits.
*/
- conf->log = NULL;
+ WRITE_ONCE(conf->log, NULL);
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 68d86dbec..6fe334bb9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
+static void raid5_quiesce(struct mddev *mddev, int quiesce);
+
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
@@ -2499,15 +2501,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
unsigned long cpu;
int err = 0;
- /*
- * Never shrink. And mddev_suspend() could deadlock if this is called
- * from raid5d. In that case, scribble_disks and scribble_sectors
- * should equal to new_disks and new_sectors
- */
+ /* Never shrink. */
if (conf->scribble_disks >= new_disks &&
conf->scribble_sectors >= new_sectors)
return 0;
- mddev_suspend(conf->mddev);
+
+ raid5_quiesce(conf->mddev, true);
cpus_read_lock();
for_each_present_cpu(cpu) {
@@ -2521,7 +2520,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
}
cpus_read_unlock();
- mddev_resume(conf->mddev);
+ raid5_quiesce(conf->mddev, false);
+
if (!err) {
conf->scribble_disks = new_disks;
conf->scribble_sectors = new_sectors;
@@ -5960,19 +5960,6 @@ out:
return ret;
}
-static bool reshape_inprogress(struct mddev *mddev)
-{
- return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_INTR, &mddev->recovery);
-}
-
-static bool reshape_disabled(struct mddev *mddev)
-{
- return is_md_suspended(mddev) || !md_is_rdwr(mddev);
-}
-
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
@@ -6004,8 +5991,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
+ return STRIPE_SCHEDULE_AND_RETRY;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6084,15 +6070,6 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
-out:
- if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) &&
- reshape_disabled(mddev)) {
- bi->bi_status = BLK_STS_IOERR;
- ret = STRIPE_FAIL;
- pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n",
- mdname(mddev));
- }
-
return ret;
}
@@ -7032,7 +7009,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
new != roundup_pow_of_two(new))
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
@@ -7056,7 +7033,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
goto out_unlock;
}
- mddev_suspend(mddev);
mutex_lock(&conf->cache_size_mutex);
size = conf->max_nr_stripes;
@@ -7071,10 +7047,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
err = -ENOMEM;
}
mutex_unlock(&conf->cache_size_mutex);
- mddev_resume(mddev);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7160,7 +7135,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
new = !!new;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
@@ -7169,15 +7144,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
- mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7232,15 +7205,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (new > 8192)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf)
err = -ENODEV;
else if (new != conf->worker_cnt_per_group) {
- mddev_suspend(mddev);
-
old_groups = conf->worker_groups;
if (old_groups)
flush_workqueue(raid5_wq);
@@ -7257,9 +7228,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups[0].workers);
kfree(old_groups);
}
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7408,7 +7378,7 @@ static void free_conf(struct r5conf *conf)
log_exit(conf);
- unregister_shrinker(&conf->shrinker);
+ shrinker_free(conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
@@ -7456,7 +7426,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
static unsigned long raid5_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+ struct r5conf *conf = shrink->private_data;
unsigned long ret = SHRINK_STOP;
if (mutex_trylock(&conf->cache_size_mutex)) {
@@ -7477,7 +7447,7 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+ struct r5conf *conf = shrink->private_data;
if (conf->max_nr_stripes < conf->min_nr_stripes)
/* unlikely, but not impossible */
@@ -7712,18 +7682,22 @@ static struct r5conf *setup_conf(struct mddev *mddev)
* it reduces the queue depth and so can hurt throughput.
* So set it rather large, scaled by number of devices.
*/
- conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
- conf->shrinker.scan_objects = raid5_cache_scan;
- conf->shrinker.count_objects = raid5_cache_count;
- conf->shrinker.batch = 128;
- conf->shrinker.flags = 0;
- ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
- if (ret) {
- pr_warn("md/raid:%s: couldn't register shrinker.\n",
+ conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev));
+ if (!conf->shrinker) {
+ ret = -ENOMEM;
+ pr_warn("md/raid:%s: couldn't allocate shrinker.\n",
mdname(mddev));
goto abort;
}
+ conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+ conf->shrinker->scan_objects = raid5_cache_scan;
+ conf->shrinker->count_objects = raid5_cache_count;
+ conf->shrinker->batch = 128;
+ conf->shrinker->private_data = conf;
+
+ shrinker_register(conf->shrinker);
+
sprintf(pers_name, "raid%d", mddev->new_level);
rcu_assign_pointer(conf->thread,
md_register_thread(raid5d, mddev, pers_name));
@@ -7785,9 +7759,6 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -8031,11 +8002,7 @@ static int raid5_run(struct mddev *mddev)
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- rcu_assign_pointer(mddev->sync_thread,
- md_register_thread(md_do_sync, mddev, "reshape"));
- if (!mddev->sync_thread)
- goto abort;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
/* Ok, everything is just fine now */
@@ -8568,8 +8535,8 @@ static int raid5_start_reshape(struct mddev *mddev)
* the reshape wasn't running - like Discard or Read - have
* completed.
*/
- mddev_suspend(mddev);
- mddev_resume(mddev);
+ raid5_quiesce(mddev, true);
+ raid5_quiesce(mddev, false);
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
@@ -8614,29 +8581,8 @@ static int raid5_start_reshape(struct mddev *mddev)
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- rcu_assign_pointer(mddev->sync_thread,
- md_register_thread(md_do_sync, mddev, "reshape"));
- if (!mddev->sync_thread) {
- mddev->recovery = 0;
- spin_lock_irq(&conf->device_lock);
- write_seqcount_begin(&conf->gen_lock);
- mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
- mddev->new_chunk_sectors =
- conf->chunk_sectors = conf->prev_chunk_sectors;
- mddev->new_layout = conf->algorithm = conf->prev_algo;
- rdev_for_each(rdev, mddev)
- rdev->new_data_offset = rdev->data_offset;
- smp_wmb();
- conf->generation --;
- conf->reshape_progress = MaxSector;
- mddev->reshape_position = MaxSector;
- write_seqcount_end(&conf->gen_lock);
- spin_unlock_irq(&conf->device_lock);
- return -EAGAIN;
- }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
conf->reshape_checkpoint = jiffies;
- md_wakeup_thread(mddev->sync_thread);
md_new_event();
return 0;
}
@@ -8984,12 +8930,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
struct r5conf *conf;
int err;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return -ENODEV;
}
@@ -8999,19 +8945,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err) {
- mddev_suspend(mddev);
+ if (err)
log_exit(conf);
- mddev_resume(mddev);
- }
}
} else
err = -EINVAL;
} else if (strncmp(buf, "resync", 6) == 0) {
if (raid5_has_ppl(conf)) {
- mddev_suspend(mddev);
log_exit(conf);
- mddev_resume(mddev);
err = resize_stripes(conf, conf->pool_size);
} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
r5l_log_disk_error(conf)) {
@@ -9024,11 +8965,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
break;
}
- if (!journal_dev_exists) {
- mddev_suspend(mddev);
+ if (!journal_dev_exists)
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- mddev_resume(mddev);
- } else /* need remove journal device first */
+ else /* need remove journal device first */
err = -EBUSY;
} else
err = -EINVAL;
@@ -9039,7 +8978,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
if (!err)
md_update_sb(mddev, 1);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err;
}
@@ -9051,22 +8990,6 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
-static void raid5_prepare_suspend(struct mddev *mddev)
-{
- struct r5conf *conf = mddev->private;
-
- wait_event(mddev->sb_wait, !reshape_inprogress(mddev) ||
- percpu_ref_is_zero(&mddev->active_io));
- if (percpu_ref_is_zero(&mddev->active_io))
- return;
-
- /*
- * Reshape is not in progress, and array is suspended, io that is
- * waiting for reshpape can never be done.
- */
- wake_up(&conf->wait_for_overlap);
-}
-
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -9087,7 +9010,6 @@ static struct md_personality raid6_personality =
.check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9112,7 +9034,6 @@ static struct md_personality raid5_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9138,7 +9059,6 @@ static struct md_personality raid4_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 97a795979..22bea20ec 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -670,7 +670,7 @@ struct r5conf {
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
unsigned long cache_state;
- struct shrinker shrinker;
+ struct shrinker *shrinker;
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;