summaryrefslogtreecommitdiffstats
path: root/kernel/locking
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/locking')
-rw-r--r--kernel/locking/Makefile32
-rw-r--r--kernel/locking/lockdep.c4546
-rw-r--r--kernel/locking/lockdep_internals.h187
-rw-r--r--kernel/locking/lockdep_proc.c665
-rw-r--r--kernel/locking/lockdep_states.h8
-rw-r--r--kernel/locking/locktorture.c1062
-rw-r--r--kernel/locking/mcs_spinlock.h121
-rw-r--r--kernel/locking/mutex-debug.c107
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c1451
-rw-r--r--kernel/locking/mutex.h23
-rw-r--r--kernel/locking/osq_lock.c231
-rw-r--r--kernel/locking/percpu-rwsem.c192
-rw-r--r--kernel/locking/qrwlock.c93
-rw-r--r--kernel/locking/qspinlock.c541
-rw-r--r--kernel/locking/qspinlock_paravirt.h563
-rw-r--r--kernel/locking/qspinlock_stat.h291
-rw-r--r--kernel/locking/rtmutex-debug.c182
-rw-r--r--kernel/locking/rtmutex-debug.h37
-rw-r--r--kernel/locking/rtmutex.c1923
-rw-r--r--kernel/locking/rtmutex.h35
-rw-r--r--kernel/locking/rtmutex_common.h165
-rw-r--r--kernel/locking/rwsem-spinlock.c339
-rw-r--r--kernel/locking/rwsem-xadd.c730
-rw-r--r--kernel/locking/rwsem.c224
-rw-r--r--kernel/locking/rwsem.h87
-rw-r--r--kernel/locking/semaphore.c263
-rw-r--r--kernel/locking/spinlock.c392
-rw-r--r--kernel/locking/spinlock_debug.c226
-rw-r--r--kernel/locking/test-ww_mutex.c647
30 files changed, 15392 insertions, 0 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 000000000..392c7f23a
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
+endif
+
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
+obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
new file mode 100644
index 000000000..4dc79f57a
--- /dev/null
+++ b/kernel/locking/lockdep.c
@@ -0,0 +1,4546 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if anytime in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies runtime.
+ */
+#define DISABLE_BRANCH_PROFILING
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+#include <linux/hash.h>
+#include <linux/ftrace.h>
+#include <linux/stringify.h>
+#include <linux/bitops.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/nmi.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifdef CONFIG_PROVE_LOCKING
+int prove_locking = 1;
+module_param(prove_locking, int, 0644);
+#else
+#define prove_locking 0
+#endif
+
+#ifdef CONFIG_LOCK_STAT
+int lock_stat = 1;
+module_param(lock_stat, int, 0644);
+#else
+#define lock_stat 0
+#endif
+
+/*
+ * lockdep_lock: protects the lockdep graph, the hashes and the
+ * class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really dont want the spinlock
+ * code to recurse back into the lockdep code...
+ */
+static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+static int graph_lock(void)
+{
+ arch_spin_lock(&lockdep_lock);
+ /*
+ * Make sure that if another CPU detected a bug while
+ * walking the graph we dont change it (while the other
+ * CPU is busy printing out stuff with the graph lock
+ * dropped already)
+ */
+ if (!debug_locks) {
+ arch_spin_unlock(&lockdep_lock);
+ return 0;
+ }
+ /* prevent any recursions within lockdep from causing deadlocks */
+ current->lockdep_recursion++;
+ return 1;
+}
+
+static inline int graph_unlock(void)
+{
+ if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
+ /*
+ * The lockdep graph lock isn't locked while we expect it to
+ * be, we're confused now, bye!
+ */
+ return DEBUG_LOCKS_WARN_ON(1);
+ }
+
+ current->lockdep_recursion--;
+ arch_spin_unlock(&lockdep_lock);
+ return 0;
+}
+
+/*
+ * Turn lock debugging off and return with 0 if it was off already,
+ * and also release the graph lock:
+ */
+static inline int debug_locks_off_graph_unlock(void)
+{
+ int ret = debug_locks_off();
+
+ arch_spin_unlock(&lockdep_lock);
+
+ return ret;
+}
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated, once during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ /*
+ * Someone passed in garbage, we give up.
+ */
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
+#ifdef CONFIG_LOCK_STAT
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
+
+static inline u64 lockstat_clock(void)
+{
+ return local_clock();
+}
+
+static int lock_point(unsigned long points[], unsigned long ip)
+{
+ int i;
+
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ if (points[i] == 0) {
+ points[i] = ip;
+ break;
+ }
+ if (points[i] == ip)
+ break;
+ }
+
+ return i;
+}
+
+static void lock_time_inc(struct lock_time *lt, u64 time)
+{
+ if (time > lt->max)
+ lt->max = time;
+
+ if (time < lt->min || !lt->nr)
+ lt->min = time;
+
+ lt->total += time;
+ lt->nr++;
+}
+
+static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
+{
+ if (!src->nr)
+ return;
+
+ if (src->max > dst->max)
+ dst->max = src->max;
+
+ if (src->min < dst->min || !dst->nr)
+ dst->min = src->min;
+
+ dst->total += src->total;
+ dst->nr += src->nr;
+}
+
+struct lock_class_stats lock_stats(struct lock_class *class)
+{
+ struct lock_class_stats stats;
+ int cpu, i;
+
+ memset(&stats, 0, sizeof(struct lock_class_stats));
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *pcs =
+ &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+ for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
+ stats.contention_point[i] += pcs->contention_point[i];
+
+ for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+ stats.contending_point[i] += pcs->contending_point[i];
+
+ lock_time_add(&pcs->read_waittime, &stats.read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+
+ lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+
+ for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
+ stats.bounces[i] += pcs->bounces[i];
+ }
+
+ return stats;
+}
+
+void clear_lock_stats(struct lock_class *class)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *cpu_stats =
+ &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+ memset(cpu_stats, 0, sizeof(struct lock_class_stats));
+ }
+ memset(class->contention_point, 0, sizeof(class->contention_point));
+ memset(class->contending_point, 0, sizeof(class->contending_point));
+}
+
+static struct lock_class_stats *get_lock_stats(struct lock_class *class)
+{
+ return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes];
+}
+
+static void lock_release_holdtime(struct held_lock *hlock)
+{
+ struct lock_class_stats *stats;
+ u64 holdtime;
+
+ if (!lock_stat)
+ return;
+
+ holdtime = lockstat_clock() - hlock->holdtime_stamp;
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (hlock->read)
+ lock_time_inc(&stats->read_holdtime, holdtime);
+ else
+ lock_time_inc(&stats->write_holdtime, holdtime);
+}
+#else
+static inline void lock_release_holdtime(struct held_lock *hlock)
+{
+}
+#endif
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock lock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ */
+#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
+#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
+#define classhashentry(key) (classhash_table + __classhashfn((key)))
+
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ */
+#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
+#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
+#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
+
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+ u32 k0 = key, k1 = key >> 32;
+
+ __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+ return k0 | (u64)k1 << 32;
+}
+
+void lockdep_off(void)
+{
+ current->lockdep_recursion++;
+}
+EXPORT_SYMBOL(lockdep_off);
+
+void lockdep_on(void)
+{
+ current->lockdep_recursion--;
+}
+EXPORT_SYMBOL(lockdep_on);
+
+/*
+ * Debugging switches:
+ */
+
+#define VERBOSE 0
+#define VERY_VERBOSE 0
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE 1
+# define SOFTIRQ_VERBOSE 1
+#else
+# define HARDIRQ_VERBOSE 0
+# define SOFTIRQ_VERBOSE 0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+ /* Example */
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "lockname"))
+ return 1;
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "&struct->lockfield"))
+ return 1;
+#endif
+ /* Filter everything else. 1 would be to allow everything else */
+ return 0;
+}
+#endif
+
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+#ifdef CONFIG_LOCK_STAT
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+#endif
+}
+
+static int save_trace(struct stack_trace *trace)
+{
+ trace->nr_entries = 0;
+ trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->entries = stack_trace + nr_stack_trace_entries;
+
+ trace->skip = 3;
+
+ save_stack_trace(trace);
+
+ /*
+ * Some daft arches put -1 at the end to indicate its a full trace.
+ *
+ * <rant> this is buggy anyway, since it takes a whole extra entry so a
+ * complete trace that maxes out the entries provided will be reported
+ * as incomplete, friggin useless </rant>
+ */
+ if (trace->nr_entries != 0 &&
+ trace->entries[trace->nr_entries-1] == ULONG_MAX)
+ trace->nr_entries--;
+
+ trace->max_entries = trace->nr_entries;
+
+ nr_stack_trace_entries += trace->nr_entries;
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Various lockdep statistics:
+ */
+DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
+#endif
+
+/*
+ * Locking printouts:
+ */
+
+#define __USAGE(__STATE) \
+ [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
+ [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
+ [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
+ [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
+
+static const char *usage_str[] =
+{
+#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ [LOCK_USED] = "INITIAL USE",
+};
+
+const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+{
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
+}
+
+static inline unsigned long lock_flag(enum lock_usage_bit bit)
+{
+ return 1UL << bit;
+}
+
+static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
+{
+ char c = '.';
+
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '+';
+ if (class->usage_mask & lock_flag(bit)) {
+ c = '-';
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '?';
+ }
+
+ return c;
+}
+
+void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
+{
+ int i = 0;
+
+#define LOCKDEP_STATE(__STATE) \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+ usage[i] = '\0';
+}
+
+static void __print_lock_name(struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN];
+ const char *name;
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ printk(KERN_CONT "%s", name);
+ } else {
+ printk(KERN_CONT "%s", name);
+ if (class->name_version > 1)
+ printk(KERN_CONT "#%d", class->name_version);
+ if (class->subclass)
+ printk(KERN_CONT "/%d", class->subclass);
+ }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char usage[LOCK_USAGE_CHARS];
+
+ get_usage_chars(class, usage);
+
+ printk(KERN_CONT " (");
+ __print_lock_name(class);
+ printk(KERN_CONT "){%s}", usage);
+}
+
+static void print_lockdep_cache(struct lockdep_map *lock)
+{
+ const char *name;
+ char str[KSYM_NAME_LEN];
+
+ name = lock->name;
+ if (!name)
+ name = __get_key_name(lock->key->subkeys, str);
+
+ printk(KERN_CONT "%s", name);
+}
+
+static void print_lock(struct held_lock *hlock)
+{
+ /*
+ * We can be called locklessly through debug_show_all_locks() so be
+ * extra careful, the hlock might have been released and cleared.
+ */
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
+ barrier();
+
+ if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ printk(KERN_CONT "<RELEASED>\n");
+ return;
+ }
+
+ printk(KERN_CONT "%p", hlock->instance);
+ print_lock_name(lock_classes + class_idx - 1);
+ printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
+}
+
+static void lockdep_print_held_locks(struct task_struct *p)
+{
+ int i, depth = READ_ONCE(p->lockdep_depth);
+
+ if (!depth)
+ printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+ else
+ printk("%d lock%s held by %s/%d:\n", depth,
+ depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ /*
+ * It's not reliable to print a task's held locks if it's not sleeping
+ * and it's not the current task.
+ */
+ if (p->state == TASK_RUNNING && p != current)
+ return;
+ for (i = 0; i < depth; i++) {
+ printk(" #%d: ", i);
+ print_lock(p->held_locks + i);
+ }
+}
+
+static void print_kernel_ident(void)
+{
+ printk("%s %.*s %s\n", init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version,
+ print_tainted());
+}
+
+static int very_verbose(struct lock_class *class)
+{
+#if VERY_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Is this the address of a static object:
+ */
+#ifdef __KERNEL__
+static int static_obj(void *obj)
+{
+ unsigned long start = (unsigned long) &_stext,
+ end = (unsigned long) &_end,
+ addr = (unsigned long) obj;
+
+ /*
+ * static variable?
+ */
+ if ((addr >= start) && (addr < end))
+ return 1;
+
+ if (arch_is_kernel_data(addr))
+ return 1;
+
+ /*
+ * in-kernel percpu var?
+ */
+ if (is_kernel_percpu_address(addr))
+ return 1;
+
+ /*
+ * module static or percpu var?
+ */
+ return is_module_address(addr) || is_module_percpu_address(addr);
+}
+#endif
+
+/*
+ * To make lock name printouts unique, we calculate a unique
+ * class->name_version generation counter:
+ */
+static int count_matching_names(struct lock_class *new_class)
+{
+ struct lock_class *class;
+ int count = 0;
+
+ if (!new_class->name)
+ return 0;
+
+ list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+ if (new_class->key - new_class->subclass == class->key)
+ return class->name_version;
+ if (class->name && !strcmp(class->name, new_class->name))
+ count = max(count, class->name_version);
+ }
+
+ return count + 1;
+}
+
+static inline struct lock_class *
+look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct hlist_head *hash_head;
+ struct lock_class *class;
+
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk(KERN_ERR
+ "BUG: looking up invalid subclass: %u\n", subclass);
+ printk(KERN_ERR
+ "turning off the locking correctness validator.\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * If it is not initialised then it has never been locked,
+ * so it won't be present in the hash table.
+ */
+ if (unlikely(!lock->key))
+ return NULL;
+
+ /*
+ * NOTE: the class-key must be unique. For dynamic locks, a static
+ * lock_class_key variable is passed in through the mutex_init()
+ * (or spin_lock_init()) call - which acts as the key. For static
+ * locks we use the lock object itself as the key.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) >
+ sizeof(struct lockdep_map));
+
+ key = lock->key->subkeys + subclass;
+
+ hash_head = classhashentry(key);
+
+ /*
+ * We do an RCU walk of the hash, see lockdep_free_key_range().
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
+ if (class->key == key) {
+ /*
+ * Huh! same key, different name? Did someone trample
+ * on some memory? We're most confused.
+ */
+ WARN_ON_ONCE(class->name != lock->name);
+ return class;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Static locks do not have their class-keys yet - for them the key is
+ * the lock object itself. If the lock is in the per cpu area, the
+ * canonical address of the lock (per cpu offset removed) is used.
+ */
+static bool assign_lock_key(struct lockdep_map *lock)
+{
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else {
+ /* Debug-check: all keys must be persistent! */
+ debug_locks_off();
+ pr_err("INFO: trying to register non-static key.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
+ pr_err("turning off the locking correctness validator.\n");
+ dump_stack();
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
+{
+ struct lockdep_subclass_key *key;
+ struct hlist_head *hash_head;
+ struct lock_class *class;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ class = look_up_lock_class(lock, subclass);
+ if (likely(class))
+ goto out_set_class_cache;
+
+ if (!lock->key) {
+ if (!assign_lock_key(lock))
+ return NULL;
+ } else if (!static_obj(lock->key)) {
+ return NULL;
+ }
+
+ key = lock->key->subkeys + subclass;
+ hash_head = classhashentry(key);
+
+ if (!graph_lock()) {
+ return NULL;
+ }
+ /*
+ * We have to do the hash-walk again, to avoid races
+ * with another CPU:
+ */
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ if (class->key == key)
+ goto out_unlock_set;
+ }
+
+ /*
+ * Allocate a new key from the static array, and add it to
+ * the hash:
+ */
+ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ if (!debug_locks_off_graph_unlock()) {
+ return NULL;
+ }
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
+ dump_stack();
+ return NULL;
+ }
+ class = lock_classes + nr_lock_classes++;
+ debug_atomic_inc(nr_unused_locks);
+ class->key = key;
+ class->name = lock->name;
+ class->subclass = subclass;
+ INIT_LIST_HEAD(&class->lock_entry);
+ INIT_LIST_HEAD(&class->locks_before);
+ INIT_LIST_HEAD(&class->locks_after);
+ class->name_version = count_matching_names(class);
+ /*
+ * We use RCU's safe list-add method to make
+ * parallel walking of the hash-list safe:
+ */
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+
+ if (verbose(class)) {
+ graph_unlock();
+
+ printk("\nnew class %px: %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
+ dump_stack();
+
+ if (!graph_lock()) {
+ return NULL;
+ }
+ }
+out_unlock_set:
+ graph_unlock();
+
+out_set_class_cache:
+ if (!subclass || force)
+ lock->class_cache[0] = class;
+ else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ lock->class_cache[subclass] = class;
+
+ /*
+ * Hash collision, did we smoke some? We found a class with a matching
+ * hash but the subclass -- which is hashed in -- didn't match.
+ */
+ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
+ return NULL;
+
+ return class;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * Allocate a lockdep entry. (assumes the graph_lock held, returns
+ * with NULL on failure)
+ */
+static struct lock_list *alloc_list_entry(void)
+{
+ if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ if (!debug_locks_off_graph_unlock())
+ return NULL;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
+ dump_stack();
+ return NULL;
+ }
+ return list_entries + nr_list_entries++;
+}
+
+/*
+ * Add a new dependency to the head of the list:
+ */
+static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+ unsigned long ip, int distance,
+ struct stack_trace *trace)
+{
+ struct lock_list *entry;
+ /*
+ * Lock not present yet - get a new dependency struct and
+ * add it to the list:
+ */
+ entry = alloc_list_entry();
+ if (!entry)
+ return 0;
+
+ entry->class = this;
+ entry->distance = distance;
+ entry->trace = *trace;
+ /*
+ * Both allocation and removal are done under the graph lock; but
+ * iteration is under RCU-sched; see look_up_lock_class() and
+ * lockdep_free_key_range().
+ */
+ list_add_tail_rcu(&entry->entry, head);
+
+ return 1;
+}
+
+/*
+ * For good efficiency of modular, we use power of 2
+ */
+#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/*
+ * The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ */
+struct circular_queue {
+ unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ unsigned int front, rear;
+};
+
+static struct circular_queue lock_cq;
+
+unsigned int max_bfs_queue_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+ cq->front = cq->rear = 0;
+ lockdep_dependency_gen_id++;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+ return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+ return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+ if (__cq_full(cq))
+ return -1;
+
+ cq->element[cq->rear] = elem;
+ cq->rear = (cq->rear + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+ if (__cq_empty(cq))
+ return -1;
+
+ *elem = cq->element[cq->front];
+ cq->front = (cq->front + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
+{
+ return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+ struct lock_list *parent)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ lock->parent = parent;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+ return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+ int depth = 0;
+ struct lock_list *parent;
+
+ while ((parent = get_lock_parent(child))) {
+ child = parent;
+ depth++;
+ }
+ return depth;
+}
+
+static int __bfs(struct lock_list *source_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int forward)
+{
+ struct lock_list *entry;
+ struct list_head *head;
+ struct circular_queue *cq = &lock_cq;
+ int ret = 1;
+
+ if (match(source_entry, data)) {
+ *target_entry = source_entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (forward)
+ head = &source_entry->class->locks_after;
+ else
+ head = &source_entry->class->locks_before;
+
+ if (list_empty(head))
+ goto exit;
+
+ __cq_init(cq);
+ __cq_enqueue(cq, (unsigned long)source_entry);
+
+ while (!__cq_empty(cq)) {
+ struct lock_list *lock;
+
+ __cq_dequeue(cq, (unsigned long *)&lock);
+
+ if (!lock->class) {
+ ret = -2;
+ goto exit;
+ }
+
+ if (forward)
+ head = &lock->class->locks_after;
+ else
+ head = &lock->class->locks_before;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ list_for_each_entry_rcu(entry, head, entry) {
+ if (!lock_accessed(entry)) {
+ unsigned int cq_depth;
+ mark_lock_accessed(entry, lock);
+ if (match(entry, data)) {
+ *target_entry = entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (__cq_enqueue(cq, (unsigned long)entry)) {
+ ret = -1;
+ goto exit;
+ }
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
+ }
+ }
+ }
+exit:
+ return ret;
+}
+
+static inline int __bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 1);
+
+}
+
+static inline int __bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 0);
+
+}
+
+/*
+ * Recursive, forwards-direction lock-dependency checking, used for
+ * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
+ * checking.
+ */
+
+/*
+ * Print a dependency chain entry (this is only done when a deadlock
+ * has been detected):
+ */
+static noinline int
+print_circular_bug_entry(struct lock_list *target, int depth)
+{
+ if (debug_locks_silent)
+ return 0;
+ printk("\n-> #%u", depth);
+ print_lock_name(target->class);
+ printk(KERN_CONT ":\n");
+ print_stack_trace(&target->trace, 6);
+
+ return 0;
+}
+
+static void
+print_circular_lock_scenario(struct held_lock *src,
+ struct held_lock *tgt,
+ struct lock_list *prt)
+{
+ struct lock_class *source = hlock_class(src);
+ struct lock_class *target = hlock_class(tgt);
+ struct lock_class *parent = prt->class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (parent != source) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(source);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(parent);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(target);
+ printk(KERN_CONT "\n\n");
+ }
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+/*
+ * When a circular dependency is detected, print the
+ * header first:
+ */
+static noinline int
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
+{
+ struct task_struct *curr = current;
+
+ if (debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("======================================================\n");
+ pr_warn("WARNING: possible circular locking dependency detected\n");
+ print_kernel_ident();
+ pr_warn("------------------------------------------------------\n");
+ pr_warn("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(check_src);
+
+ pr_warn("\nbut task is already holding lock:\n");
+
+ print_lock(check_tgt);
+ pr_warn("\nwhich lock already depends on the new lock.\n\n");
+ pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
+
+ print_circular_bug_entry(entry, depth);
+
+ return 0;
+}
+
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+ return entry->class == data;
+}
+
+static noinline int print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt,
+ struct stack_trace *trace)
+{
+ struct task_struct *curr = current;
+ struct lock_list *parent;
+ struct lock_list *first_parent;
+ int depth;
+
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ if (!save_trace(&this->trace))
+ return 0;
+
+ depth = get_lock_depth(target);
+
+ print_circular_bug_header(target, depth, check_src, check_tgt);
+
+ parent = get_lock_parent(target);
+ first_parent = parent;
+
+ while (parent) {
+ print_circular_bug_entry(parent, --depth);
+ parent = get_lock_parent(parent);
+ }
+
+ printk("\nother info that might help us debug this:\n\n");
+ print_circular_lock_scenario(check_src, check_tgt,
+ first_parent);
+
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static noinline int print_bfs_bug(int ret)
+{
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ /*
+ * Breadth-first-search failed, graph got corrupted?
+ */
+ WARN(1, "lockdep bfs error:%d\n", ret);
+
+ return 0;
+}
+
+static int noop_count(struct lock_list *entry, void *data)
+{
+ (*(unsigned long *)data)++;
+ return 0;
+}
+
+static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
+
+ __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+
+ return count;
+}
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ arch_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_forward_deps(&this);
+ arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+
+static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
+
+ __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+
+ return count;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ arch_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_backward_deps(&this);
+ arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <entry> can not
+ * lead to <target>. Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_cyclic_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
+static noinline int
+check_redundant(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ */
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+ return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
+
+/*
+ * Find a node in the forwards-direction dependency sub-graph starting
+ * at @root->class that matches @bit.
+ *
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
+ *
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
+ */
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_find_usage_forwards_checks);
+
+ result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
+}
+
+/*
+ * Find a node in the backwards-direction dependency sub-graph starting
+ * at @root->class that matches @bit.
+ *
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
+ *
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
+ */
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_find_usage_backwards_checks);
+
+ result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
+}
+
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
+
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(KERN_CONT " ops: %lu", class->ops);
+ printk(KERN_CONT " {\n");
+
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(KERN_CONT " at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
+ }
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: [<%px>] %pS\n",
+ depth, "", class->key, class->key);
+}
+
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ printk("%*s ... acquired at:\n", depth, "");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+
+ return;
+}
+
+static void
+print_irq_lock_scenario(struct lock_list *safe_entry,
+ struct lock_list *unsafe_entry,
+ struct lock_class *prev_class,
+ struct lock_class *next_class)
+{
+ struct lock_class *safe_class = safe_entry->class;
+ struct lock_class *unsafe_class = unsafe_entry->class;
+ struct lock_class *middle_class = prev_class;
+
+ if (middle_class == safe_class)
+ middle_class = next_class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (middle_class != unsafe_class) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(safe_class);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(middle_class);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(unsafe_class);
+ printk(KERN_CONT "\n\n");
+ }
+
+ printk(" Possible interrupt unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(unsafe_class);
+ printk(KERN_CONT ");\n");
+ printk(" local_irq_disable();\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(middle_class);
+ printk(KERN_CONT ");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+static int
+print_bad_irq_dependency(struct task_struct *curr,
+ struct lock_list *prev_root,
+ struct lock_list *next_root,
+ struct lock_list *backwards_entry,
+ struct lock_list *forwards_entry,
+ struct held_lock *prev,
+ struct held_lock *next,
+ enum lock_usage_bit bit1,
+ enum lock_usage_bit bit2,
+ const char *irqclass)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("=====================================================\n");
+ pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
+ irqclass, irqclass);
+ print_kernel_ident();
+ pr_warn("-----------------------------------------------------\n");
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ curr->comm, task_pid_nr(curr),
+ curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
+ curr->hardirqs_enabled,
+ curr->softirqs_enabled);
+ print_lock(next);
+
+ pr_warn("\nand this task is already holding:\n");
+ print_lock(prev);
+ pr_warn("which would create a new lock dependency:\n");
+ print_lock_name(hlock_class(prev));
+ pr_cont(" ->");
+ print_lock_name(hlock_class(next));
+ pr_cont("\n");
+
+ pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ irqclass);
+ print_lock_name(backwards_entry->class);
+ pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
+
+ print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+
+ pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(forwards_entry->class);
+ pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
+ pr_warn("...");
+
+ print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+
+ pr_warn("\nother info that might help us debug this:\n\n");
+ print_irq_lock_scenario(backwards_entry, forwards_entry,
+ hlock_class(prev), hlock_class(next));
+
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
+ if (!save_trace(&prev_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(backwards_entry, prev_root);
+
+ pr_warn("\nthe dependencies between the lock to be acquired");
+ pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
+ if (!save_trace(&next_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(forwards_entry, next_root);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int
+check_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit_backwards,
+ enum lock_usage_bit bit_forwards, const char *irqclass)
+{
+ int ret;
+ struct lock_list this, that;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *uninitialized_var(target_entry1);
+
+ this.parent = NULL;
+
+ this.class = hlock_class(prev);
+ ret = find_usage_backwards(&this, bit_backwards, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ that.parent = NULL;
+ that.class = hlock_class(next);
+ ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ bit_backwards, bit_forwards, irqclass);
+}
+
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
+{
+ return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+}
+
+static int exclusive_bit(int new_bit)
+{
+ /*
+ * USED_IN
+ * USED_IN_READ
+ * ENABLED
+ * ENABLED_READ
+ *
+ * bit 0 - write/read
+ * bit 1 - used_in/enabled
+ * bit 2+ state
+ */
+
+ int state = new_bit & ~3;
+ int dir = new_bit & 2;
+
+ /*
+ * keep state, bit flip the direction and strip read.
+ */
+ return state | (dir ^ 2);
+}
+
+static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit)
+{
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
+ return 0;
+
+ bit++; /* _READ */
+
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe-read
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
+ return 0;
+
+ return 1;
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+#define LOCKDEP_STATE(__STATE) \
+ if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
+ return 0;
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+ return 1;
+}
+
+static void inc_chains(void)
+{
+ if (current->hardirq_context)
+ nr_hardirq_chains++;
+ else {
+ if (current->softirq_context)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
+ }
+}
+
+#else
+
+static inline int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ return 1;
+}
+
+static inline void inc_chains(void)
+{
+ nr_process_chains++;
+}
+
+#endif
+
+static void
+print_deadlock_scenario(struct held_lock *nxt,
+ struct held_lock *prv)
+{
+ struct lock_class *next = hlock_class(nxt);
+ struct lock_class *prev = hlock_class(prv);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(prev);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(next);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ printk(" May be due to missing lock nesting notation\n\n");
+}
+
+static int
+print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: possible recursive locking detected\n");
+ print_kernel_ident();
+ pr_warn("--------------------------------------------\n");
+ pr_warn("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(next);
+ pr_warn("\nbut task is already holding lock:\n");
+ print_lock(prev);
+
+ pr_warn("\nother info that might help us debug this:\n");
+ print_deadlock_scenario(next, prev);
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Check whether we are holding such a class already.
+ *
+ * (Note that this has to be done separately, because the graph cannot
+ * detect such classes of deadlocks.)
+ *
+ * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+ struct lockdep_map *next_instance, int read)
+{
+ struct held_lock *prev;
+ struct held_lock *nest = NULL;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ prev = curr->held_locks + i;
+
+ if (prev->instance == next->nest_lock)
+ nest = prev;
+
+ if (hlock_class(prev) != hlock_class(next))
+ continue;
+
+ /*
+ * Allow read-after-read recursion of the same
+ * lock class (i.e. read_lock(lock)+read_lock(lock)):
+ */
+ if ((read == 2) && prev->read)
+ return 2;
+
+ /*
+ * We're holding the nest_lock, which serializes this lock's
+ * nesting behaviour.
+ */
+ if (nest)
+ return 2;
+
+ return print_deadlock_bug(curr, prev, next);
+ }
+ return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ * - would the adding of the <prev> -> <next> dependency create a
+ * circular dependency in the graph? [== circular deadlock]
+ *
+ * - does the new prev->next dependency connect any hardirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * hardirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ * - does the new prev->next dependency connect any softirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * softirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, int distance, struct stack_trace *trace,
+ int (*save)(struct stack_trace *trace))
+{
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *entry;
+ struct lock_list this;
+ int ret;
+
+ /*
+ * Prove that the new <prev> -> <next> dependency would not
+ * create a circular dependency in the graph. (We do this by
+ * forward-recursing into the graph starting at <next>, and
+ * checking whether we can reach <prev>.)
+ *
+ * We are using global variables to control the recursion, to
+ * keep the stackframe size of the recursive functions low:
+ */
+ this.class = hlock_class(next);
+ this.parent = NULL;
+ ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+ if (unlikely(!ret)) {
+ if (!trace->entries) {
+ /*
+ * If @save fails here, the printing might trigger
+ * a WARN but because of the !nr_entries it should
+ * not do bad things.
+ */
+ save(trace);
+ }
+ return print_circular_bug(&this, target_entry, next, prev, trace);
+ }
+ else if (unlikely(ret < 0))
+ return print_bfs_bug(ret);
+
+ if (!check_prev_add_irq(curr, prev, next))
+ return 0;
+
+ /*
+ * For recursive read-locks we do all the dependency checks,
+ * but we dont store read-triggered dependencies (only
+ * write-triggered dependencies). This ensures that only the
+ * write-side dependencies matter, and that if for example a
+ * write-lock never takes any other locks, then the reads are
+ * equivalent to a NOP.
+ */
+ if (next->read == 2 || prev->read == 2)
+ return 1;
+ /*
+ * Is the <prev> -> <next> dependency already present?
+ *
+ * (this may occur even though this is a new chain: consider
+ * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+ * chains - the second one will be new, but L1 already has
+ * L2 added to its dependency list, due to the first chain.)
+ */
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
+ if (distance == 1)
+ entry->distance = 1;
+ return 1;
+ }
+ }
+
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ this.class = hlock_class(prev);
+ this.parent = NULL;
+ ret = check_redundant(&this, hlock_class(next), &target_entry);
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ return 2;
+ }
+ if (ret < 0)
+ return print_bfs_bug(ret);
+
+
+ if (!trace->entries && !save(trace))
+ return 0;
+
+ /*
+ * Ok, all validations passed, add the new lock
+ * to the previous lock's dependency list:
+ */
+ ret = add_lock_to_list(hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance, trace);
+
+ if (!ret)
+ return 0;
+
+ ret = add_lock_to_list(hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance, trace);
+ if (!ret)
+ return 0;
+
+ return 2;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+ int depth = curr->lockdep_depth;
+ struct held_lock *hlock;
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = 0,
+ .entries = NULL,
+ .skip = 0,
+ };
+
+ /*
+ * Debugging checks.
+ *
+ * Depth must not be zero for a non-head lock:
+ */
+ if (!depth)
+ goto out_bug;
+ /*
+ * At least two relevant locks must exist for this
+ * to be a head:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ goto out_bug;
+
+ for (;;) {
+ int distance = curr->lockdep_depth - depth + 1;
+ hlock = curr->held_locks + depth - 1;
+
+ /*
+ * Only non-recursive-read entries get new dependencies
+ * added:
+ */
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ if (!ret)
+ return 0;
+
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
+
+ depth--;
+ /*
+ * End of lock-stack?
+ */
+ if (!depth)
+ break;
+ /*
+ * Stop the search if we cross into another context:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ break;
+ }
+ return 1;
+out_bug:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ /*
+ * Clearly we all shouldn't be here, but since we made it we
+ * can reliable say we messed up our state. See the above two
+ * gotos for reasons why we could possibly end up here.
+ */
+ WARN_ON(1);
+
+ return 0;
+}
+
+unsigned long nr_lock_chains;
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+int nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+ return lock_classes + chain_hlocks[chain->base + i];
+}
+
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ int i;
+ struct held_lock *hlock_curr;
+
+ for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+ hlock_curr = curr->held_locks + i;
+ if (hlock_curr->irq_context != hlock->irq_context)
+ break;
+
+ }
+
+ return ++i;
+}
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Returns the next chain_key iteration
+ */
+static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+{
+ u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+
+ printk(" class_idx:%d -> chain_key:%016Lx",
+ class_idx,
+ (unsigned long long)new_chain_key);
+ return new_chain_key;
+}
+
+static void
+print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
+{
+ struct held_lock *hlock;
+ u64 chain_key = 0;
+ int depth = curr->lockdep_depth;
+ int i;
+
+ printk("depth: %u\n", depth + 1);
+ for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ hlock = curr->held_locks + i;
+ chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+
+ print_lock(hlock);
+ }
+
+ print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_lock(hlock_next);
+}
+
+static void print_chain_keys_chain(struct lock_chain *chain)
+{
+ int i;
+ u64 chain_key = 0;
+ int class_id;
+
+ printk("depth: %u\n", chain->depth);
+ for (i = 0; i < chain->depth; i++) {
+ class_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+
+ print_lock_name(lock_classes + class_id);
+ printk("\n");
+ }
+}
+
+static void print_collision(struct task_struct *curr,
+ struct held_lock *hlock_next,
+ struct lock_chain *chain)
+{
+ pr_warn("\n");
+ pr_warn("============================\n");
+ pr_warn("WARNING: chain_key collision\n");
+ print_kernel_ident();
+ pr_warn("----------------------------\n");
+ pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
+ pr_warn("Hash chain already cached but the contents don't match!\n");
+
+ pr_warn("Held locks:");
+ print_chain_keys_held_locks(curr, hlock_next);
+
+ pr_warn("Locks in cached chain:");
+ print_chain_keys_chain(chain);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+}
+#endif
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+ struct held_lock *hlock,
+ struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ int i, j, id;
+
+ i = get_first_held_lock(curr, hlock);
+
+ if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) {
+ print_collision(curr, hlock, chain);
+ return 0;
+ }
+
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ id = curr->held_locks[i].class_idx - 1;
+
+ if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
+ print_collision(curr, hlock, chain);
+ return 0;
+ }
+ }
+#endif
+ return 1;
+}
+
+/*
+ * This is for building a chain between just two different classes,
+ * instead of adding a new hlock upon current, which is done by
+ * add_chain_cache().
+ *
+ * This can be called in any context with two classes, while
+ * add_chain_cache() must be done within the lock owener's context
+ * since it uses hlock which might be racy in another context.
+ */
+static inline int add_chain_cache_classes(unsigned int prev,
+ unsigned int next,
+ unsigned int irq_context,
+ u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+
+ /*
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = irq_context;
+ chain->depth = 2;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
+ chain_hlocks[chain->base] = prev - 1;
+ chain_hlocks[chain->base + 1] = next -1;
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * Important for check_no_collision().
+ */
+ else {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
+ }
+#endif
+
+ hlist_add_head_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ int i, j;
+
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+
+ /*
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = hlock->irq_context;
+ i = get_first_held_lock(curr, hlock);
+ chain->depth = curr->lockdep_depth + 1 - i;
+
+ BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+ BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
+ BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = curr->held_locks[i].class_idx - 1;
+ chain_hlocks[chain->base + j] = lock_id;
+ }
+ chain_hlocks[chain->base + j] = class - lock_classes;
+ }
+
+ if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+ nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * Important for check_no_collision().
+ */
+ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
+ }
+#endif
+
+ hlist_add_head_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+/*
+ * Look up a dependency chain.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
+
+ return 1;
+}
+
+static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
+ struct held_lock *hlock, int chain_head, u64 chain_key)
+{
+ /*
+ * Trylock needs to maintain the stack of held locks, but it
+ * does not add new dependencies, because trylock can be done
+ * in any order.
+ *
+ * We look up the chain_key and do the O(N^2) check and update of
+ * the dependencies only if this is a new dependency chain.
+ * (If lookup_chain_cache_add() return with 1 it acquires
+ * graph_lock for us)
+ */
+ if (!hlock->trylock && hlock->check &&
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
+ /*
+ * Check whether last held lock:
+ *
+ * - is irq-safe, if this lock is irq-unsafe
+ * - is softirq-safe, if this lock is hardirq-unsafe
+ *
+ * And check whether the new lock's dependency graph
+ * could lead back to the previous lock.
+ *
+ * any of these scenarios could lead to a deadlock. If
+ * All validations
+ */
+ int ret = check_deadlock(curr, hlock, lock, hlock->read);
+
+ if (!ret)
+ return 0;
+ /*
+ * Mark recursive read, as we jump over it when
+ * building dependencies (just like we jump over
+ * trylock entries):
+ */
+ if (ret == 2)
+ hlock->read = 2;
+ /*
+ * Add dependency only if this lock is not the head
+ * of the chain, and if it's not a secondary read-lock:
+ */
+ if (!chain_head && ret != 2) {
+ if (!check_prevs_add(curr, hlock))
+ return 0;
+ }
+
+ graph_unlock();
+ } else {
+ /* after lookup_chain_cache_add(): */
+ if (unlikely(!debug_locks))
+ return 0;
+ }
+
+ return 1;
+}
+#else
+static inline int validate_chain(struct task_struct *curr,
+ struct lockdep_map *lock, struct held_lock *hlock,
+ int chain_head, u64 chain_key)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly:
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ struct held_lock *hlock, *prev_hlock = NULL;
+ unsigned int i;
+ u64 chain_key = 0;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+ if (chain_key != hlock->prev_chain_key) {
+ debug_locks_off();
+ /*
+ * We got mighty confused, our chain keys don't match
+ * with what we expect, someone trample on our task state?
+ */
+ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)hlock->prev_chain_key);
+ return;
+ }
+ /*
+ * Whoops ran out of static storage again?
+ */
+ if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+ return;
+
+ if (prev_hlock && (prev_hlock->irq_context !=
+ hlock->irq_context))
+ chain_key = 0;
+ chain_key = iterate_chain_key(chain_key, hlock->class_idx);
+ prev_hlock = hlock;
+ }
+ if (chain_key != curr->curr_chain_key) {
+ debug_locks_off();
+ /*
+ * More smoking hash instead of calculating it, damn see these
+ * numbers float.. I bet that a pink elephant stepped on my memory.
+ */
+ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)curr->curr_chain_key);
+ }
+#endif
+}
+
+static void
+print_usage_bug_scenario(struct held_lock *lock)
+{
+ struct lock_class *class = hlock_class(lock);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(KERN_CONT ");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("================================\n");
+ pr_warn("WARNING: inconsistent lock state\n");
+ print_kernel_ident();
+ pr_warn("--------------------------------\n");
+
+ pr_warn("inconsistent {%s} -> {%s} usage.\n",
+ usage_str[prev_bit], usage_str[new_bit]);
+
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ curr->comm, task_pid_nr(curr),
+ trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ trace_hardirqs_enabled(curr),
+ trace_softirqs_enabled(curr));
+ print_lock(this);
+
+ pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+
+ print_irqtrace_events(curr);
+ pr_warn("\nother info that might help us debug this:\n");
+ print_usage_bug_scenario(this);
+
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
+ return print_usage_bug(curr, this, bad_bit, new_bit);
+ return 1;
+}
+
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+/*
+ * print irq inversion bug:
+ */
+static int
+print_irq_inversion_bug(struct task_struct *curr,
+ struct lock_list *root, struct lock_list *other,
+ struct held_lock *this, int forwards,
+ const char *irqclass)
+{
+ struct lock_list *entry = other;
+ struct lock_list *middle = NULL;
+ int depth;
+
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("========================================================\n");
+ pr_warn("WARNING: possible irq lock inversion dependency detected\n");
+ print_kernel_ident();
+ pr_warn("--------------------------------------------------------\n");
+ pr_warn("%s/%d just changed the state of lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(this);
+ if (forwards)
+ pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+ else
+ pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ print_lock_name(other->class);
+ pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+ pr_warn("\nother info that might help us debug this:\n");
+
+ /* Find a middle lock (if one exists) */
+ depth = get_lock_depth(other);
+ do {
+ if (depth == 0 && (entry != root)) {
+ pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+ middle = entry;
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && entry != root && (depth >= 0));
+ if (forwards)
+ print_irq_lock_scenario(root, other,
+ middle ? middle->class : root->class, other->class);
+ else
+ print_irq_lock_scenario(other, root,
+ middle ? middle->class : other->class, root->class);
+
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ if (!save_trace(&root->trace))
+ return 0;
+ print_shortest_lock_dependencies(other, root);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_forwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_backwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, irqclass);
+}
+
+void print_irqtrace_events(struct task_struct *curr)
+{
+ printk("irq event stamp: %u\n", curr->irq_events);
+ printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
+ curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
+ (void *)curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
+ curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
+ (void *)curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): [<%px>] %pS\n",
+ curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
+ (void *)curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): [<%px>] %pS\n",
+ curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
+ (void *)curr->softirq_disable_ip);
+}
+
+static int HARDIRQ_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int SOFTIRQ_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#define STRICT_READ_CHECKS 1
+
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+ __STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline int state_verbose(enum lock_usage_bit bit,
+ struct lock_class *class)
+{
+ return state_verbose_f[bit >> 2](class);
+}
+
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+ enum lock_usage_bit bit, const char *name);
+
+static int
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ int excl_bit = exclusive_bit(new_bit);
+ int read = new_bit & 1;
+ int dir = new_bit & 2;
+
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ *
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ check_usage_f usage = dir ?
+ check_usage_backwards : check_usage_forwards;
+
+ /*
+ * Validate that this particular lock does not have conflicting
+ * usage states.
+ */
+ if (!valid_state(curr, this, new_bit, excl_bit))
+ return 0;
+
+ /*
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
+ */
+ if ((!read || !dir || STRICT_READ_CHECKS) &&
+ !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ return 0;
+
+ /*
+ * Check for read in write conflicts
+ */
+ if (!read) {
+ if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ return 0;
+
+ if (STRICT_READ_CHECKS &&
+ !usage(curr, this, excl_bit + 1,
+ state_name(new_bit + 1)))
+ return 0;
+ }
+
+ if (state_verbose(new_bit, hlock_class(this)))
+ return 2;
+
+ return 1;
+}
+
+enum mark_type {
+#define LOCKDEP_STATE(__STATE) __STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+/*
+ * Mark all held locks with a usage bit:
+ */
+static int
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
+{
+ enum lock_usage_bit usage_bit;
+ struct held_lock *hlock;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ usage_bit = 2 + (mark << 2); /* ENABLED */
+ if (hlock->read)
+ usage_bit += 1; /* READ */
+
+ BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+
+ if (!hlock->check)
+ continue;
+
+ if (!mark_lock(curr, hlock, usage_bit))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ */
+static void __trace_hardirqs_on_caller(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+
+ /*
+ * We are going to turn hardirqs on, so set the
+ * usage bit for all held locks:
+ */
+ if (!mark_held_locks(curr, HARDIRQ))
+ return;
+ /*
+ * If we have softirqs enabled, then set the usage
+ * bit for all held locks. (disabled hardirqs prevented
+ * this bit from being set before)
+ */
+ if (curr->softirqs_enabled)
+ if (!mark_held_locks(curr, SOFTIRQ))
+ return;
+
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
+}
+
+void lockdep_hardirqs_on(unsigned long ip)
+{
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (unlikely(current->hardirqs_enabled)) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * See the fine text that goes along with this variable definition.
+ */
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ return;
+
+ /*
+ * Can't allow enabling interrupts while in an interrupt handler,
+ * that's general bad form and such. Recursion, limited stack etc..
+ */
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+
+ current->lockdep_recursion = 1;
+ __trace_hardirqs_on_caller(ip);
+ current->lockdep_recursion = 0;
+}
+
+/*
+ * Hardirqs were disabled:
+ */
+void lockdep_hardirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * So we're supposed to get called after you mask local IRQs, but for
+ * some reason the hardware doesn't quite think you did a proper job.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->hardirqs_enabled = 0;
+ curr->hardirq_disable_ip = ip;
+ curr->hardirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_off_events);
+ } else
+ debug_atomic_inc(redundant_hardirqs_off);
+}
+
+/*
+ * Softirqs will be enabled:
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * We fancy IRQs being disabled here, see softirq.c, avoids
+ * funny state and nesting things.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ debug_atomic_inc(redundant_softirqs_on);
+ return;
+ }
+
+ current->lockdep_recursion = 1;
+ /*
+ * We'll do an OFF -> ON transition:
+ */
+ curr->softirqs_enabled = 1;
+ curr->softirq_enable_ip = ip;
+ curr->softirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(softirqs_on_events);
+ /*
+ * We are going to turn softirqs on, so set the
+ * usage bit for all held locks, if hardirqs are
+ * enabled too:
+ */
+ if (curr->hardirqs_enabled)
+ mark_held_locks(curr, SOFTIRQ);
+ current->lockdep_recursion = 0;
+}
+
+/*
+ * Softirqs were disabled:
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * We fancy IRQs being disabled here, see softirq.c
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->softirqs_enabled = 0;
+ curr->softirq_disable_ip = ip;
+ curr->softirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(softirqs_off_events);
+ /*
+ * Whoops, we wanted softirqs off, so why aren't they?
+ */
+ DEBUG_LOCKS_WARN_ON(!softirq_count());
+ } else
+ debug_atomic_inc(redundant_softirqs_off);
+}
+
+static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+{
+ /*
+ * If non-trylock use in a hardirq or softirq context, then
+ * mark the lock as used in these contexts:
+ */
+ if (!hlock->trylock) {
+ if (hlock->read) {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ } else {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ }
+ }
+ if (!hlock->hardirqs_off) {
+ if (hlock->read) {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQ_READ))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQ_READ))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQ))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQ))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 2 * !!task->hardirq_context + !!task->softirq_context;
+}
+
+static int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ unsigned int depth = curr->lockdep_depth;
+
+ /*
+ * Keep track of points where we cross into an interrupt context:
+ */
+ if (depth) {
+ struct held_lock *prev_hlock;
+
+ prev_hlock = curr->held_locks + depth-1;
+ /*
+ * If we cross into another context, reset the
+ * hash key (this also prevents the checking and the
+ * adding of the dependency to 'prev'):
+ */
+ if (prev_hlock->irq_context != hlock->irq_context)
+ return 1;
+ }
+ return 0;
+}
+
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+static inline
+int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
+ return 1;
+}
+
+static inline int mark_irqflags(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ unsigned int new_mask = 1 << new_bit, ret = 1;
+
+ /*
+ * If already set then do not dirty the cacheline,
+ * nor do any checks:
+ */
+ if (likely(hlock_class(this)->usage_mask & new_mask))
+ return 1;
+
+ if (!graph_lock())
+ return 0;
+ /*
+ * Make sure we didn't race:
+ */
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
+ graph_unlock();
+ return 1;
+ }
+
+ hlock_class(this)->usage_mask |= new_mask;
+
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ return 0;
+
+ switch (new_bit) {
+#define LOCKDEP_STATE(__STATE) \
+ case LOCK_USED_IN_##__STATE: \
+ case LOCK_USED_IN_##__STATE##_READ: \
+ case LOCK_ENABLED_##__STATE: \
+ case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
+ return 0;
+ break;
+ case LOCK_USED:
+ debug_atomic_dec(nr_unused_locks);
+ break;
+ default:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+ WARN_ON(1);
+ return 0;
+ }
+
+ graph_unlock();
+
+ /*
+ * We must printk outside of the graph_lock:
+ */
+ if (ret == 2) {
+ printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+ print_lock(this);
+ print_irqtrace_events(curr);
+ dump_stack();
+ }
+
+ return ret;
+}
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ int i;
+
+ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+ lock->class_cache[i] = NULL;
+
+#ifdef CONFIG_LOCK_STAT
+ lock->cpu = raw_smp_processor_id();
+#endif
+
+ /*
+ * Can't be having no nameless bastards around this place!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!name)) {
+ lock->name = "NULL";
+ return;
+ }
+
+ lock->name = name;
+
+ /*
+ * No key, no joy, we need to hash something.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!key))
+ return;
+ /*
+ * Sanity check, the lock-class key must be persistent:
+ */
+ if (!static_obj(key)) {
+ printk("BUG: key %px not in .data!\n", key);
+ /*
+ * What it says above ^^^^^, I suggest you read it.
+ */
+ DEBUG_LOCKS_WARN_ON(1);
+ return;
+ }
+ lock->key = key;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (subclass) {
+ unsigned long flags;
+
+ if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ register_lock_class(lock, subclass, 1);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+ }
+}
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ __lockdep_init_map(lock, name, key, subclass);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+
+struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+
+static int
+print_lock_nested_lock_not_held(struct task_struct *curr,
+ struct held_lock *hlock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("==================================\n");
+ pr_warn("WARNING: Nested lock was not taken\n");
+ print_kernel_ident();
+ pr_warn("----------------------------------\n");
+
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ pr_warn("\nbut this task is not holding:\n");
+ pr_warn("%s\n", hlock->nest_lock->name);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ pr_warn("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int __lock_is_held(const struct lockdep_map *lock, int read);
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, int hardirqs_off,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ int references, int pin_count)
+{
+ struct task_struct *curr = current;
+ struct lock_class *class = NULL;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int chain_head = 0;
+ int class_idx;
+ u64 chain_key;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ /*
+ * Lockdep should run with IRQs disabled, otherwise we could
+ * get an interrupt which would want to take locks, which would
+ * end up in lockdep and have you got a head-ache already?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ check = 0;
+
+ if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ class = lock->class_cache[subclass];
+ /*
+ * Not cached?
+ */
+ if (unlikely(!class)) {
+ class = register_lock_class(lock, subclass, 0);
+ if (!class)
+ return 0;
+ }
+ atomic_inc((atomic_t *)&class->ops);
+ if (very_verbose(class)) {
+ printk("\nacquire class [%px] %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
+ dump_stack();
+ }
+
+ /*
+ * Add the lock to the list of currently held locks.
+ * (we dont increase the depth just yet, up until the
+ * dependency checks are done)
+ */
+ depth = curr->lockdep_depth;
+ /*
+ * Ran out of static storage for our per-task lock stack again have we?
+ */
+ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+ return 0;
+
+ class_idx = class - lock_classes + 1;
+
+ if (depth) {
+ hlock = curr->held_locks + depth - 1;
+ if (hlock->class_idx == class_idx && nest_lock) {
+ if (!references)
+ references++;
+
+ if (!hlock->references)
+ hlock->references++;
+
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 1;
+ }
+ }
+
+ hlock = curr->held_locks + depth;
+ /*
+ * Plain impossible, we just registered it and checked it weren't no
+ * NULL like.. I bet this mushroom I ate was good!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class_idx;
+ hlock->acquire_ip = ip;
+ hlock->instance = lock;
+ hlock->nest_lock = nest_lock;
+ hlock->irq_context = task_irq_context(curr);
+ hlock->trylock = trylock;
+ hlock->read = read;
+ hlock->check = check;
+ hlock->hardirqs_off = !!hardirqs_off;
+ hlock->references = references;
+#ifdef CONFIG_LOCK_STAT
+ hlock->waittime_stamp = 0;
+ hlock->holdtime_stamp = lockstat_clock();
+#endif
+ hlock->pin_count = pin_count;
+
+ if (check && !mark_irqflags(curr, hlock))
+ return 0;
+
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
+ /*
+ * Calculate the chain hash: it's the combined hash of all the
+ * lock keys along the dependency chain. We save the hash value
+ * at every step so that we can get the current hash easily
+ * after unlock. The chain hash is then used to cache dependency
+ * results.
+ *
+ * The 'key ID' is what is the most compact key value to drive
+ * the hash, not class->key.
+ */
+ /*
+ * Whoops, we did it again.. ran straight out of our static allocation.
+ */
+ if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+ return 0;
+
+ chain_key = curr->curr_chain_key;
+ if (!depth) {
+ /*
+ * How can we have a chain hash when we ain't got no keys?!
+ */
+ if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ return 0;
+ chain_head = 1;
+ }
+
+ hlock->prev_chain_key = chain_key;
+ if (separate_irq_context(curr, hlock)) {
+ chain_key = 0;
+ chain_head = 1;
+ }
+ chain_key = iterate_chain_key(chain_key, class_idx);
+
+ if (nest_lock && !__lock_is_held(nest_lock, -1))
+ return print_lock_nested_lock_not_held(curr, hlock, ip);
+
+ if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ return 0;
+
+ curr->curr_chain_key = chain_key;
+ curr->lockdep_depth++;
+ check_chain_key(curr);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return 0;
+#endif
+ if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+ debug_locks_off();
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
+ dump_stack();
+
+ return 0;
+ }
+
+ if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+ max_lockdep_depth = curr->lockdep_depth;
+
+ return 1;
+}
+
+static int
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("=====================================\n");
+ pr_warn("WARNING: bad unlock balance detected!\n");
+ print_kernel_ident();
+ pr_warn("-------------------------------------\n");
+ pr_warn("%s/%d is trying to release lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ pr_cont(") at:\n");
+ print_ip_sym(ip);
+ pr_warn("but there are no more locks to release!\n");
+ pr_warn("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
+{
+ if (hlock->instance == lock)
+ return 1;
+
+ if (hlock->references) {
+ const struct lock_class *class = lock->class_cache[0];
+
+ if (!class)
+ class = look_up_lock_class(lock, 0);
+
+ /*
+ * If look_up_lock_class() failed to find a class, we're trying
+ * to test if we hold a lock that has never yet been acquired.
+ * Clearly if the lock hasn't been acquired _ever_, we're not
+ * holding it either, so report failure.
+ */
+ if (!class)
+ return 0;
+
+ /*
+ * References, but not a lock we're actually ref-counting?
+ * State got messed up, follow the sites that change ->references
+ * and try to make sense of it.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+ return 0;
+
+ if (hlock->class_idx == class - lock_classes + 1)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned int depth, int *idx)
+{
+ struct held_lock *ret, *hlock, *prev_hlock;
+ int i;
+
+ i = depth - 1;
+ hlock = curr->held_locks + i;
+ ret = hlock;
+ if (match_held_lock(hlock, lock))
+ goto out;
+
+ ret = NULL;
+ for (i--, prev_hlock = hlock--;
+ i >= 0;
+ i--, prev_hlock = hlock--) {
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ ret = NULL;
+ break;
+ }
+ if (match_held_lock(hlock, lock)) {
+ ret = hlock;
+ break;
+ }
+ }
+
+out:
+ *idx = i;
+ return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+ int idx)
+{
+ struct held_lock *hlock;
+
+ for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass,
+ hlock->trylock,
+ hlock->read, hlock->check,
+ hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references, hlock->pin_count))
+ return 1;
+ }
+ return 0;
+}
+
+static int
+__lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ struct lock_class *class;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ lockdep_init_map(lock, name, key, 0);
+ class = register_lock_class(lock, subclass, 0);
+ hlock->class_idx = class - lock_classes + 1;
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ WARN(hlock->read, "downgrading a read lock");
+ hlock->read = 1;
+ hlock->acquire_ip = ip;
+
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
+ */
+static int
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ depth = curr->lockdep_depth;
+ /*
+ * So we're all set to release this lock.. wait what lock? We don't
+ * own any locks, you've been drinking again?
+ */
+ if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ if (hlock->instance == lock)
+ lock_release_holdtime(hlock);
+
+ WARN(hlock->pin_count, "releasing a pinned lock\n");
+
+ if (hlock->references) {
+ hlock->references--;
+ if (hlock->references) {
+ /*
+ * We had, and after removing one, still have
+ * references, the current lock stack is still
+ * valid. We're done!
+ */
+ return 1;
+ }
+ }
+
+ /*
+ * We have the right lock to unlock, 'hlock' points to it.
+ * Now we remove it from the stack, and add back the other
+ * entries (if any), recalculating the hash along the way:
+ */
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ if (reacquire_held_locks(curr, depth, i + 1))
+ return 0;
+
+ /*
+ * We had N bottles of beer on the wall, we drank one, but now
+ * there's not N-1 bottles of beer left on the wall...
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+ return 0;
+
+ return 1;
+}
+
+static int __lock_is_held(const struct lockdep_map *lock, int read)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ if (read == -1 || hlock->read == read)
+ return 1;
+
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+ struct pin_cookie cookie = NIL_COOKIE;
+ struct task_struct *curr = current;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return cookie;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ /*
+ * Grab 16bits of randomness; this is sufficient to not
+ * be guessable and still allows some pin nesting in
+ * our u32 pin_count.
+ */
+ cookie.val = 1 + (prandom_u32() >> 16);
+ hlock->pin_count += cookie.val;
+ return cookie;
+ }
+ }
+
+ WARN(1, "pinning an unheld lock\n");
+ return cookie;
+}
+
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ hlock->pin_count += cookie.val;
+ return;
+ }
+ }
+
+ WARN(1, "pinning an unheld lock\n");
+}
+
+static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+ return;
+
+ hlock->pin_count -= cookie.val;
+
+ if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n"))
+ hlock->pin_count = 0;
+
+ return;
+ }
+ }
+
+ WARN(1, "unpinning an unheld lock\n");
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+ defined(CONFIG_TRACE_IRQFLAGS)
+ if (!debug_locks)
+ return;
+
+ if (irqs_disabled_flags(flags)) {
+ if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-off.\n");
+ }
+ } else {
+ if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-on.\n");
+ }
+ }
+
+ /*
+ * We dont accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only
+ * check if not in hardirq contexts:
+ */
+ if (!hardirq_count()) {
+ if (softirq_count()) {
+ /* like the above, but with softirqs */
+ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+ } else {
+ /* lick the above, does it taste good? */
+ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
+ }
+
+ if (!debug_locks)
+ print_irqtrace_events(current);
+#endif
+}
+
+void lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_set_class(lock, name, key, subclass, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_set_class);
+
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_downgrade(lock, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+ __lock_acquire(lock, subclass, trylock, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquire);
+
+void lock_release(struct lockdep_map *lock, int nested,
+ unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ trace_lock_release(lock, ip);
+ if (__lock_release(lock, nested, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_release);
+
+int lock_is_held_type(const struct lockdep_map *lock, int read)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (unlikely(current->lockdep_recursion))
+ return 1; /* avoid false negative lockdep_assert_held() */
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ ret = __lock_is_held(lock, read);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held_type);
+
+struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
+{
+ struct pin_cookie cookie = NIL_COOKIE;
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return cookie;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ cookie = __lock_pin_lock(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return cookie;
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_repin_lock(lock, cookie);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_repin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_unpin_lock(lock, cookie);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("=================================\n");
+ pr_warn("WARNING: bad contention detected!\n");
+ print_kernel_ident();
+ pr_warn("---------------------------------\n");
+ pr_warn("%s/%d is trying to contend lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ pr_cont(") at:\n");
+ print_ip_sym(ip);
+ pr_warn("but there are no locks held!\n");
+ pr_warn("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ int i, contention_point, contending_point;
+
+ depth = curr->lockdep_depth;
+ /*
+ * Whee, we contended on this lock, except it seems we're not
+ * actually trying to acquire anything much at all..
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, ip);
+ return;
+ }
+
+ if (hlock->instance != lock)
+ return;
+
+ hlock->waittime_stamp = lockstat_clock();
+
+ contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+ contending_point = lock_point(hlock_class(hlock)->contending_point,
+ lock->ip);
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (contention_point < LOCKSTAT_POINTS)
+ stats->contention_point[contention_point]++;
+ if (contending_point < LOCKSTAT_POINTS)
+ stats->contending_point[contending_point]++;
+ if (lock->cpu != smp_processor_id())
+ stats->bounces[bounce_contended + !!hlock->read]++;
+}
+
+static void
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ u64 now, waittime = 0;
+ int i, cpu;
+
+ depth = curr->lockdep_depth;
+ /*
+ * Yay, we acquired ownership of this lock we didn't try to
+ * acquire, how the heck did that happen?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
+ }
+
+ if (hlock->instance != lock)
+ return;
+
+ cpu = smp_processor_id();
+ if (hlock->waittime_stamp) {
+ now = lockstat_clock();
+ waittime = now - hlock->waittime_stamp;
+ hlock->holdtime_stamp = now;
+ }
+
+ trace_lock_acquired(lock, ip);
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (waittime) {
+ if (hlock->read)
+ lock_time_inc(&stats->read_waittime, waittime);
+ else
+ lock_time_inc(&stats->write_waittime, waittime);
+ }
+ if (lock->cpu != cpu)
+ stats->bounces[bounce_acquired + !!hlock->read]++;
+
+ lock->cpu = cpu;
+ lock->ip = ip;
+}
+
+void lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat || !debug_locks))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ trace_lock_contended(lock, ip);
+ __lock_contended(lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_contended);
+
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat || !debug_locks))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_acquired(lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquired);
+#endif
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_local_irq_save(flags);
+ current->curr_chain_key = 0;
+ current->lockdep_depth = 0;
+ current->lockdep_recursion = 0;
+ memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+ nr_hardirq_chains = 0;
+ nr_softirq_chains = 0;
+ nr_process_chains = 0;
+ debug_locks = 1;
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_HLIST_HEAD(chainhash_table + i);
+ raw_local_irq_restore(flags);
+}
+
+static void zap_class(struct lock_class *class)
+{
+ int i;
+
+ /*
+ * Remove all dependencies this lock is
+ * involved in:
+ */
+ for (i = 0; i < nr_list_entries; i++) {
+ if (list_entries[i].class == class)
+ list_del_rcu(&list_entries[i].entry);
+ }
+ /*
+ * Unhash the class and remove it from the all_lock_classes list:
+ */
+ hlist_del_rcu(&class->hash_entry);
+ list_del_rcu(&class->lock_entry);
+
+ RCU_INIT_POINTER(class->key, NULL);
+ RCU_INIT_POINTER(class->name, NULL);
+}
+
+static inline int within(const void *addr, void *start, unsigned long size)
+{
+ return addr >= start && addr < start + size;
+}
+
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ unsigned long flags;
+ int i;
+ int locked;
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+
+ /*
+ * Unhash all classes that were created by this module:
+ */
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ if (within(class->key, start, size))
+ zap_class(class);
+ else if (within(class->name, start, size))
+ zap_class(class);
+ }
+ }
+
+ if (locked)
+ graph_unlock();
+ raw_local_irq_restore(flags);
+
+ /*
+ * Wait for any possible iterators from look_up_lock_class() to pass
+ * before continuing to free the memory they refer to.
+ *
+ * sync_sched() is sufficient because the read-side is IRQ disable.
+ */
+ synchronize_sched();
+
+ /*
+ * XXX at this point we could return the resources to the pool;
+ * instead we leak them. We would need to change to bitmap allocators
+ * instead of the linear allocators we have now.
+ */
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ unsigned long flags;
+ int i, j;
+ int locked;
+
+ raw_local_irq_save(flags);
+
+ /*
+ * Remove all classes this lock might have:
+ */
+ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+ /*
+ * If the class exists we look it up and zap it:
+ */
+ class = look_up_lock_class(lock, j);
+ if (class)
+ zap_class(class);
+ }
+ /*
+ * Debug check: in the end all mapped classes should
+ * be gone.
+ */
+ locked = graph_lock();
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ int match = 0;
+
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ match |= class == lock->class_cache[j];
+
+ if (unlikely(match)) {
+ if (debug_locks_off_graph_unlock()) {
+ /*
+ * We all just reset everything, how did it match?
+ */
+ WARN_ON(1);
+ }
+ goto out_restore;
+ }
+ }
+ }
+ if (locked)
+ graph_unlock();
+
+out_restore:
+ raw_local_irq_restore(flags);
+}
+
+void __init lockdep_init(void)
+{
+ printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ printk(" memory used by lock dependency info: %lu kB\n",
+ (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+ sizeof(struct list_head) * CLASSHASH_SIZE +
+ sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+ sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+ sizeof(struct list_head) * CHAINHASH_SIZE
+#ifdef CONFIG_PROVE_LOCKING
+ + sizeof(struct circular_queue)
+#endif
+ ) / 1024
+ );
+
+ printk(" per task-struct memory footprint: %lu bytes\n",
+ sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+}
+
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+ const void *mem_to, struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ pr_warn("\n");
+ pr_warn("=========================\n");
+ pr_warn("WARNING: held lock freed!\n");
+ print_kernel_ident();
+ pr_warn("-------------------------\n");
+ pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
+ curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
+ print_lock(hlock);
+ lockdep_print_held_locks(curr);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+}
+
+static inline int not_in_range(const void* mem_from, unsigned long mem_len,
+ const void* lock_from, unsigned long lock_len)
+{
+ return lock_from + lock_len <= mem_from ||
+ mem_from + mem_len <= lock_from;
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned long flags;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ raw_local_irq_save(flags);
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (not_in_range(mem_from, mem_len, hlock->instance,
+ sizeof(*hlock->instance)))
+ continue;
+
+ print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
+ break;
+ }
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
+
+static void print_held_locks_bug(void)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ pr_warn("\n");
+ pr_warn("====================================\n");
+ pr_warn("WARNING: %s/%d still has locks held!\n",
+ current->comm, task_pid_nr(current));
+ print_kernel_ident();
+ pr_warn("------------------------------------\n");
+ lockdep_print_held_locks(current);
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+}
+
+void debug_check_no_locks_held(void)
+{
+ if (unlikely(current->lockdep_depth > 0))
+ print_held_locks_bug();
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
+
+#ifdef __KERNEL__
+void debug_show_all_locks(void)
+{
+ struct task_struct *g, *p;
+
+ if (unlikely(!debug_locks)) {
+ pr_warn("INFO: lockdep is turned off.\n");
+ return;
+ }
+ pr_warn("\nShowing all locks held in the system:\n");
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (!p->lockdep_depth)
+ continue;
+ lockdep_print_held_locks(p);
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ }
+ rcu_read_unlock();
+
+ pr_warn("\n");
+ pr_warn("=============================================\n\n");
+}
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+#endif
+
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void debug_show_held_locks(struct task_struct *task)
+{
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
+asmlinkage __visible void lockdep_sys_exit(void)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(curr->lockdep_depth)) {
+ if (!debug_locks_off())
+ return;
+ pr_warn("\n");
+ pr_warn("================================================\n");
+ pr_warn("WARNING: lock held when returning to user space!\n");
+ print_kernel_ident();
+ pr_warn("------------------------------------------------\n");
+ pr_warn("%s/%d is leaving the kernel with locks still held!\n",
+ curr->comm, curr->pid);
+ lockdep_print_held_locks(curr);
+ }
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
+}
+
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
+{
+ struct task_struct *curr = current;
+
+ /* Note: the following can be executed concurrently, so be careful. */
+ pr_warn("\n");
+ pr_warn("=============================\n");
+ pr_warn("WARNING: suspicious RCU usage\n");
+ print_kernel_ident();
+ pr_warn("-----------------------------\n");
+ pr_warn("%s:%d %s!\n", file, line, s);
+ pr_warn("\nother info that might help us debug this:\n\n");
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ !rcu_lockdep_current_cpu_online()
+ ? "RCU used illegally from offline CPU!\n"
+ : !rcu_is_watching()
+ ? "RCU used illegally from idle CPU!\n"
+ : "",
+ rcu_scheduler_active, debug_locks);
+
+ /*
+ * If a CPU is in the RCU-free window in idle (ie: in the section
+ * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * considers that CPU to be in an "extended quiescent state",
+ * which means that RCU will be completely ignoring that CPU.
+ * Therefore, rcu_read_lock() and friends have absolutely no
+ * effect on a CPU running in that state. In other words, even if
+ * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
+ * delete data structures out from under it. RCU really has no
+ * choice here: we need to keep an RCU-free window in idle where
+ * the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run
+ * in the idle task.
+ *
+ * So complain bitterly if someone does call rcu_read_lock(),
+ * rcu_read_lock_bh() and so on from extended quiescent states.
+ */
+ if (!rcu_is_watching())
+ pr_warn("RCU used illegally from extended quiescent state!\n");
+
+ lockdep_print_held_locks(curr);
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+}
+EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
new file mode 100644
index 000000000..d459d624b
--- /dev/null
+++ b/kernel/locking/lockdep_internals.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit {
+#define LOCKDEP_STATE(__STATE) \
+ LOCK_USED_IN_##__STATE, \
+ LOCK_USED_IN_##__STATE##_READ, \
+ LOCK_ENABLED_##__STATE, \
+ LOCK_ENABLED_##__STATE##_READ,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ LOCK_USED,
+ LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
+
+enum {
+#define LOCKDEP_STATE(__STATE) \
+ __LOCKF(USED_IN_##__STATE) \
+ __LOCKF(USED_IN_##__STATE##_READ) \
+ __LOCKF(ENABLED_##__STATE) \
+ __LOCKF(ENABLED_##__STATE##_READ)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ __LOCKF(USED)
+};
+
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_ENABLED_IRQ_READ \
+ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
+/*
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
+ * .data and .bss to fit in required 32MB limit for the kernel. With
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
+ * So, reduce the static allocations for lockdeps related structures so that
+ * everything fits in current required size limit.
+ */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
+ *
+ * We use the per-lock dependency maps in two ways: we grow it by adding
+ * every to-be-taken lock to all currently held lock's own dependency
+ * table (if it's not there yet), and we check it for lock order
+ * conflicts and deadlocks.
+ */
+#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+#else
+#define MAX_LOCKDEP_ENTRIES 32768UL
+
+#define MAX_LOCKDEP_CHAINS_BITS 16
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+#define MAX_STACK_TRACE_ENTRIES 524288UL
+#endif
+
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+
+extern struct list_head all_lock_classes;
+extern struct lock_chain lock_chains[];
+
+#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+
+extern void get_usage_chars(struct lock_class *class,
+ char usage[LOCK_USAGE_CHARS]);
+
+extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
+
+extern unsigned long nr_lock_classes;
+extern unsigned long nr_list_entries;
+extern unsigned long nr_lock_chains;
+extern int nr_chain_hlocks;
+extern unsigned long nr_stack_trace_entries;
+
+extern unsigned int nr_hardirq_chains;
+extern unsigned int nr_softirq_chains;
+extern unsigned int nr_process_chains;
+extern unsigned int max_lockdep_depth;
+extern unsigned int max_recursion_depth;
+
+extern unsigned int max_bfs_queue_depth;
+
+#ifdef CONFIG_PROVE_LOCKING
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+#include <asm/local.h>
+/*
+ * Various lockdep statistics.
+ * We want them per cpu as they are often accessed in fast path
+ * and we want to avoid too much cache bouncing.
+ */
+struct lockdep_stats {
+ int chain_lookup_hits;
+ int chain_lookup_misses;
+ int hardirqs_on_events;
+ int hardirqs_off_events;
+ int redundant_hardirqs_on;
+ int redundant_hardirqs_off;
+ int softirqs_on_events;
+ int softirqs_off_events;
+ int redundant_softirqs_on;
+ int redundant_softirqs_off;
+ int nr_unused_locks;
+ int nr_redundant_checks;
+ int nr_redundant;
+ int nr_cyclic_checks;
+ int nr_cyclic_check_recursions;
+ int nr_find_usage_forwards_checks;
+ int nr_find_usage_forwards_recursions;
+ int nr_find_usage_backwards_checks;
+ int nr_find_usage_backwards_recursions;
+};
+
+DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
+
+#define __debug_atomic_inc(ptr) \
+ this_cpu_inc(lockdep_stats.ptr);
+
+#define debug_atomic_inc(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_inc(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_dec(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_dec(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_read(ptr) ({ \
+ struct lockdep_stats *__cpu_lockdep_stats; \
+ unsigned long long __total = 0; \
+ int __cpu; \
+ for_each_possible_cpu(__cpu) { \
+ __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
+ __total += __cpu_lockdep_stats->ptr; \
+ } \
+ __total; \
+})
+#else
+# define __debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
new file mode 100644
index 000000000..53cc3bb70
--- /dev/null
+++ b/kernel/locking/lockdep_proc.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kernel/lockdep_proc.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Code for /proc/lockdep and /proc/lockdep_stats:
+ *
+ */
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <linux/uaccess.h>
+#include <asm/div64.h>
+
+#include "lockdep_internals.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &all_lock_classes, pos);
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start_head(&all_lock_classes, *pos);
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static void print_name(struct seq_file *m, struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN];
+ const char *name = class->name;
+
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ seq_printf(m, "%s", name);
+ } else{
+ seq_printf(m, "%s", name);
+ if (class->name_version > 1)
+ seq_printf(m, "#%d", class->name_version);
+ if (class->subclass)
+ seq_printf(m, "/%d", class->subclass);
+ }
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_list *entry;
+ char usage[LOCK_USAGE_CHARS];
+
+ if (v == &all_lock_classes) {
+ seq_printf(m, "all lock classes:\n");
+ return 0;
+ }
+
+ seq_printf(m, "%p", class->key);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ seq_printf(m, " OPS:%8ld", class->ops);
+#endif
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+#endif
+
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+
+ seq_printf(m, ": ");
+ print_name(m, class);
+ seq_puts(m, "\n");
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+#ifdef CONFIG_PROVE_LOCKING
+static void *lc_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ if (*pos - 1 < nr_lock_chains)
+ return lock_chains + (*pos - 1);
+
+ return NULL;
+}
+
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return lc_start(m, pos);
+}
+
+static void lc_stop(struct seq_file *m, void *v)
+{
+}
+
+static int lc_show(struct seq_file *m, void *v)
+{
+ struct lock_chain *chain = v;
+ struct lock_class *class;
+ int i;
+
+ if (v == SEQ_START_TOKEN) {
+ if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ seq_printf(m, "(buggered) ");
+ seq_printf(m, "all lock chains:\n");
+ return 0;
+ }
+
+ seq_printf(m, "irq_context: %d\n", chain->irq_context);
+
+ for (i = 0; i < chain->depth; i++) {
+ class = lock_chain_get_class(chain, i);
+ if (!class->key)
+ continue;
+
+ seq_printf(m, "[%p] ", class->key);
+ print_name(m, class);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_chains_ops = {
+ .start = lc_start,
+ .next = lc_next,
+ .stop = lc_stop,
+ .show = lc_show,
+};
+#endif /* CONFIG_PROVE_LOCKING */
+
+static void lockdep_stats_debug_show(struct seq_file *m)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
+ hi2 = debug_atomic_read(hardirqs_off_events),
+ hr1 = debug_atomic_read(redundant_hardirqs_on),
+ hr2 = debug_atomic_read(redundant_hardirqs_off),
+ si1 = debug_atomic_read(softirqs_on_events),
+ si2 = debug_atomic_read(softirqs_off_events),
+ sr1 = debug_atomic_read(redundant_softirqs_on),
+ sr2 = debug_atomic_read(redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11llu\n",
+ debug_atomic_read(chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11llu\n",
+ debug_atomic_read(chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11llu\n",
+ debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
+ seq_printf(m, " find-mask forwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask backwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_backwards_checks));
+
+ seq_printf(m, " hardirq on events: %11llu\n", hi1);
+ seq_printf(m, " hardirq off events: %11llu\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
+ seq_printf(m, " softirq on events: %11llu\n", si1);
+ seq_printf(m, " softirq off events: %11llu\n", si2);
+ seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
+#endif
+}
+
+static int lockdep_stats_show(struct seq_file *m, void *v)
+{
+ unsigned long nr_unused = 0, nr_uncategorized = 0,
+ nr_irq_safe = 0, nr_irq_unsafe = 0,
+ nr_softirq_safe = 0, nr_softirq_unsafe = 0,
+ nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
+ nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
+ nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
+ nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
+ sum_forward_deps = 0;
+
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+
+ if (class->usage_mask == 0)
+ nr_unused++;
+ if (class->usage_mask == LOCKF_USED)
+ nr_uncategorized++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ)
+ nr_irq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQ)
+ nr_irq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ nr_softirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
+ nr_softirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ nr_hardirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
+ nr_hardirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
+ nr_irq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
+ nr_irq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
+ nr_softirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
+ nr_softirq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
+ nr_hardirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
+ nr_hardirq_read_unsafe++;
+
+ sum_forward_deps += lockdep_count_forward_deps(class);
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
+#endif
+
+#endif
+ seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
+ nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
+ nr_list_entries, MAX_LOCKDEP_ENTRIES);
+ seq_printf(m, " indirect dependencies: %11lu\n",
+ sum_forward_deps);
+
+ /*
+ * Total number of dependencies:
+ *
+ * All irq-safe locks may nest inside irq-unsafe locks,
+ * plus all the other known dependencies:
+ */
+ seq_printf(m, " all direct dependencies: %11lu\n",
+ nr_irq_unsafe * nr_irq_safe +
+ nr_hardirq_unsafe * nr_hardirq_safe +
+ nr_list_entries);
+
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
+ nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
+ nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ seq_printf(m, " in-hardirq chains: %11u\n",
+ nr_hardirq_chains);
+ seq_printf(m, " in-softirq chains: %11u\n",
+ nr_softirq_chains);
+#endif
+ seq_printf(m, " in-process chains: %11u\n",
+ nr_process_chains);
+ seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
+ nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+ seq_printf(m, " combined max dependencies: %11u\n",
+ (nr_hardirq_chains + 1) *
+ (nr_softirq_chains + 1) *
+ (nr_process_chains + 1)
+ );
+ seq_printf(m, " hardirq-safe locks: %11lu\n",
+ nr_hardirq_safe);
+ seq_printf(m, " hardirq-unsafe locks: %11lu\n",
+ nr_hardirq_unsafe);
+ seq_printf(m, " softirq-safe locks: %11lu\n",
+ nr_softirq_safe);
+ seq_printf(m, " softirq-unsafe locks: %11lu\n",
+ nr_softirq_unsafe);
+ seq_printf(m, " irq-safe locks: %11lu\n",
+ nr_irq_safe);
+ seq_printf(m, " irq-unsafe locks: %11lu\n",
+ nr_irq_unsafe);
+
+ seq_printf(m, " hardirq-read-safe locks: %11lu\n",
+ nr_hardirq_read_safe);
+ seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
+ nr_hardirq_read_unsafe);
+ seq_printf(m, " softirq-read-safe locks: %11lu\n",
+ nr_softirq_read_safe);
+ seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
+ nr_softirq_read_unsafe);
+ seq_printf(m, " irq-read-safe locks: %11lu\n",
+ nr_irq_read_safe);
+ seq_printf(m, " irq-read-unsafe locks: %11lu\n",
+ nr_irq_read_unsafe);
+
+ seq_printf(m, " uncategorized locks: %11lu\n",
+ nr_uncategorized);
+ seq_printf(m, " unused locks: %11lu\n",
+ nr_unused);
+ seq_printf(m, " max locking depth: %11u\n",
+ max_lockdep_depth);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " max bfs queue depth: %11u\n",
+ max_bfs_queue_depth);
+#endif
+ lockdep_stats_debug_show(m);
+ seq_printf(m, " debug_locks: %11u\n",
+ debug_locks);
+
+ return 0;
+}
+
+#ifdef CONFIG_LOCK_STAT
+
+struct lock_stat_data {
+ struct lock_class *class;
+ struct lock_class_stats stats;
+};
+
+struct lock_stat_seq {
+ struct lock_stat_data *iter_end;
+ struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
+};
+
+/*
+ * sort on absolute number of contentions
+ */
+static int lock_stat_cmp(const void *l, const void *r)
+{
+ const struct lock_stat_data *dl = l, *dr = r;
+ unsigned long nl, nr;
+
+ nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
+ nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
+
+ return nr - nl;
+}
+
+static void seq_line(struct seq_file *m, char c, int offset, int length)
+{
+ int i;
+
+ for (i = 0; i < offset; i++)
+ seq_puts(m, " ");
+ for (i = 0; i < length; i++)
+ seq_printf(m, "%c", c);
+ seq_puts(m, "\n");
+}
+
+static void snprint_time(char *buf, size_t bufsiz, s64 nr)
+{
+ s64 div;
+ s32 rem;
+
+ nr += 5; /* for display rounding */
+ div = div_s64_rem(nr, 1000, &rem);
+ snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
+}
+
+static void seq_time(struct seq_file *m, s64 time)
+{
+ char num[15];
+
+ snprint_time(num, sizeof(num), time);
+ seq_printf(m, " %14s", num);
+}
+
+static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
+{
+ seq_printf(m, "%14lu", lt->nr);
+ seq_time(m, lt->min);
+ seq_time(m, lt->max);
+ seq_time(m, lt->total);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
+}
+
+static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
+{
+ struct lockdep_subclass_key *ckey;
+ struct lock_class_stats *stats;
+ struct lock_class *class;
+ const char *cname;
+ int i, namelen;
+ char name[39];
+
+ class = data->class;
+ stats = &data->stats;
+
+ namelen = 38;
+ if (class->name_version > 1)
+ namelen -= 2; /* XXX truncates versions > 9 */
+ if (class->subclass)
+ namelen -= 2;
+
+ rcu_read_lock_sched();
+ cname = rcu_dereference_sched(class->name);
+ ckey = rcu_dereference_sched(class->key);
+
+ if (!cname && !ckey) {
+ rcu_read_unlock_sched();
+ return;
+
+ } else if (!cname) {
+ char str[KSYM_NAME_LEN];
+ const char *key_name;
+
+ key_name = __get_key_name(ckey, str);
+ snprintf(name, namelen, "%s", key_name);
+ } else {
+ snprintf(name, namelen, "%s", cname);
+ }
+ rcu_read_unlock_sched();
+
+ namelen = strlen(name);
+ if (class->name_version > 1) {
+ snprintf(name+namelen, 3, "#%d", class->name_version);
+ namelen += 2;
+ }
+ if (class->subclass) {
+ snprintf(name+namelen, 3, "/%d", class->subclass);
+ namelen += 2;
+ }
+
+ if (stats->write_holdtime.nr) {
+ if (stats->read_holdtime.nr)
+ seq_printf(m, "%38s-W:", name);
+ else
+ seq_printf(m, "%40s:", name);
+
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
+ seq_lock_time(m, &stats->write_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
+ seq_lock_time(m, &stats->write_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_holdtime.nr) {
+ seq_printf(m, "%38s-R:", name);
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
+ seq_lock_time(m, &stats->read_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
+ seq_lock_time(m, &stats->read_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
+ return;
+
+ if (stats->read_holdtime.nr)
+ namelen += 2;
+
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ char ip[32];
+
+ if (class->contention_point[i] == 0)
+ break;
+
+ if (!i)
+ seq_line(m, '-', 40-namelen, namelen);
+
+ snprintf(ip, sizeof(ip), "[<%p>]",
+ (void *)class->contention_point[i]);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contention_point[i],
+ ip, (void *)class->contention_point[i]);
+ }
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ char ip[32];
+
+ if (class->contending_point[i] == 0)
+ break;
+
+ if (!i)
+ seq_line(m, '-', 40-namelen, namelen);
+
+ snprintf(ip, sizeof(ip), "[<%p>]",
+ (void *)class->contending_point[i]);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contending_point[i],
+ ip, (void *)class->contending_point[i]);
+ }
+ if (i) {
+ seq_puts(m, "\n");
+ seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
+ seq_puts(m, "\n");
+ }
+}
+
+static void seq_header(struct seq_file *m)
+{
+ seq_puts(m, "lock_stat version 0.4\n");
+
+ if (unlikely(!debug_locks))
+ seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
+ "%14s %14s\n",
+ "class name",
+ "con-bounces",
+ "contentions",
+ "waittime-min",
+ "waittime-max",
+ "waittime-total",
+ "waittime-avg",
+ "acq-bounces",
+ "acquisitions",
+ "holdtime-min",
+ "holdtime-max",
+ "holdtime-total",
+ "holdtime-avg");
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ seq_printf(m, "\n");
+}
+
+static void *ls_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_stat_seq *data = m->private;
+ struct lock_stat_data *iter;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ iter = data->stats + (*pos - 1);
+ if (iter >= data->iter_end)
+ iter = NULL;
+
+ return iter;
+}
+
+static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ls_start(m, pos);
+}
+
+static void ls_stop(struct seq_file *m, void *v)
+{
+}
+
+static int ls_show(struct seq_file *m, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_header(m);
+ else
+ seq_stats(m, v);
+
+ return 0;
+}
+
+static const struct seq_operations lockstat_ops = {
+ .start = ls_start,
+ .next = ls_next,
+ .stop = ls_stop,
+ .show = ls_show,
+};
+
+static int lock_stat_open(struct inode *inode, struct file *file)
+{
+ int res;
+ struct lock_class *class;
+ struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
+
+ if (!data)
+ return -ENOMEM;
+
+ res = seq_open(file, &lockstat_ops);
+ if (!res) {
+ struct lock_stat_data *iter = data->stats;
+ struct seq_file *m = file->private_data;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iter->class = class;
+ iter->stats = lock_stats(class);
+ iter++;
+ }
+ data->iter_end = iter;
+
+ sort(data->stats, data->iter_end - data->stats,
+ sizeof(struct lock_stat_data),
+ lock_stat_cmp, NULL);
+
+ m->private = data;
+ } else
+ vfree(data);
+
+ return res;
+}
+
+static ssize_t lock_stat_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct lock_class *class;
+ char c;
+
+ if (count) {
+ if (get_user(c, buf))
+ return -EFAULT;
+
+ if (c != '0')
+ return count;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry)
+ clear_lock_stats(class);
+ }
+ return count;
+}
+
+static int lock_stat_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ vfree(seq->private);
+ return seq_release(inode, file);
+}
+
+static const struct file_operations proc_lock_stat_operations = {
+ .open = lock_stat_open,
+ .write = lock_stat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = lock_stat_release,
+};
+#endif /* CONFIG_LOCK_STAT */
+
+static int __init lockdep_proc_init(void)
+{
+ proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
+#ifdef CONFIG_PROVE_LOCKING
+ proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
+#endif
+ proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
+#ifdef CONFIG_LOCK_STAT
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+ &proc_lock_stat_operations);
+#endif
+
+ return 0;
+}
+
+__initcall(lockdep_proc_init);
+
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
new file mode 100644
index 000000000..35ca09f2e
--- /dev/null
+++ b/kernel/locking/lockdep_states.h
@@ -0,0 +1,8 @@
+/*
+ * Lockdep states,
+ *
+ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
+ * you add one, or come up with a nice dynamic solution.
+ */
+LOCKDEP_STATE(HARDIRQ)
+LOCKDEP_STATE(SOFTIRQ)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000..95395ef59
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,1062 @@
+/*
+ * Module-based torture test facility for locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Davidlohr Bueso <dave@stgolabs.net>
+ * Based on kernel/rcu/torture.c.
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/sched/rt.h>
+#include <linux/spinlock.h>
+#include <linux/rwlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/rtmutex.h>
+#include <linux/atomic.h>
+#include <linux/moduleparam.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+torture_param(int, nwriters_stress, -1,
+ "Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1,
+ "Number of read-locking stress-test threads");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3,
+ "Number of jiffies between shuffles, 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(int, verbose, 1,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "spin_lock";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type,
+ "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+
+static struct task_struct *stats_task;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+
+static bool lock_is_write_held;
+static bool lock_is_read_held;
+
+struct lock_stress_stats {
+ long n_lock_fail;
+ long n_lock_acquired;
+};
+
+/* Forward reference. */
+static void lock_torture_cleanup(void);
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+struct lock_torture_ops {
+ void (*init)(void);
+ int (*writelock)(void);
+ void (*write_delay)(struct torture_random_state *trsp);
+ void (*task_boost)(struct torture_random_state *trsp);
+ void (*writeunlock)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *trsp);
+ void (*readunlock)(void);
+
+ unsigned long flags; /* for irq spinlocks */
+ const char *name;
+};
+
+struct lock_torture_cxt {
+ int nrealwriters_stress;
+ int nrealreaders_stress;
+ bool debug_lock;
+ atomic_t n_lock_torture_errors;
+ struct lock_torture_ops *cur_ops;
+ struct lock_stress_stats *lwsa; /* writer statistics */
+ struct lock_stress_stats *lrsa; /* reader statistics */
+};
+static struct lock_torture_cxt cxt = { 0, 0, false,
+ ATOMIC_INIT(0),
+ NULL, NULL};
+/*
+ * Definitions for lock torture testing.
+ */
+
+static int torture_lock_busted_write_lock(void)
+{
+ return 0; /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_lock_busted_write_unlock(void)
+{
+ /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_boost_dummy(struct torture_random_state *trsp)
+{
+ /* Only rtmutexes care about priority */
+}
+
+static struct lock_torture_ops lock_busted_ops = {
+ .writelock = torture_lock_busted_write_lock,
+ .write_delay = torture_lock_busted_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_lock_busted_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "lock_busted"
+};
+
+static DEFINE_SPINLOCK(torture_spinlock);
+
+static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+{
+ spin_lock(&torture_spinlock);
+ return 0;
+}
+
+static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+{
+ spin_unlock(&torture_spinlock);
+}
+
+static struct lock_torture_ops spin_lock_ops = {
+ .writelock = torture_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "spin_lock"
+};
+
+static int torture_spin_lock_write_lock_irq(void)
+__acquires(torture_spinlock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&torture_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_lock_spin_write_unlock_irq(void)
+__releases(torture_spinlock)
+{
+ spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops spin_lock_irq_ops = {
+ .writelock = torture_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_lock_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "spin_lock_irq"
+};
+
+static DEFINE_RWLOCK(torture_rwlock);
+
+static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+{
+ write_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+{
+ write_unlock(&torture_rwlock);
+}
+
+static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+{
+ read_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 10;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+{
+ read_unlock(&torture_rwlock);
+}
+
+static struct lock_torture_ops rw_lock_ops = {
+ .writelock = torture_rwlock_write_lock,
+ .write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_rwlock_write_unlock,
+ .readlock = torture_rwlock_read_lock,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock,
+ .name = "rw_lock"
+};
+
+static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_write_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_read_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops rw_lock_irq_ops = {
+ .writelock = torture_rwlock_write_lock_irq,
+ .write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_rwlock_write_unlock_irq,
+ .readlock = torture_rwlock_read_lock_irq,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock_irq,
+ .name = "rw_lock_irq"
+};
+
+static DEFINE_MUTEX(torture_mutex);
+
+static int torture_mutex_lock(void) __acquires(torture_mutex)
+{
+ mutex_lock(&torture_mutex);
+ return 0;
+}
+
+static void torture_mutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 5);
+ else
+ mdelay(longdelay_ms / 5);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_mutex_unlock(void) __releases(torture_mutex)
+{
+ mutex_unlock(&torture_mutex);
+}
+
+static struct lock_torture_ops mutex_lock_ops = {
+ .writelock = torture_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "mutex_lock"
+};
+
+#include <linux/ww_mutex.h>
+static DEFINE_WD_CLASS(torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
+
+static int torture_ww_mutex_lock(void)
+__acquires(torture_ww_mutex_0)
+__acquires(torture_ww_mutex_1)
+__acquires(torture_ww_mutex_2)
+{
+ LIST_HEAD(list);
+ struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+ } locks[3], *ll, *ln;
+ struct ww_acquire_ctx ctx;
+
+ locks[0].lock = &torture_ww_mutex_0;
+ list_add(&locks[0].link, &list);
+
+ locks[1].lock = &torture_ww_mutex_1;
+ list_add(&locks[1].link, &list);
+
+ locks[2].lock = &torture_ww_mutex_2;
+ list_add(&locks[2].link, &list);
+
+ ww_acquire_init(&ctx, &torture_ww_class);
+
+ list_for_each_entry(ll, &list, link) {
+ int err;
+
+ err = ww_mutex_lock(ll->lock, &ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &list, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK)
+ return err;
+
+ ww_mutex_lock_slow(ll->lock, &ctx);
+ list_move(&ll->link, &list);
+ }
+
+ ww_acquire_fini(&ctx);
+ return 0;
+}
+
+static void torture_ww_mutex_unlock(void)
+__releases(torture_ww_mutex_0)
+__releases(torture_ww_mutex_1)
+__releases(torture_ww_mutex_2)
+{
+ ww_mutex_unlock(&torture_ww_mutex_0);
+ ww_mutex_unlock(&torture_ww_mutex_1);
+ ww_mutex_unlock(&torture_ww_mutex_2);
+}
+
+static struct lock_torture_ops ww_mutex_lock_ops = {
+ .writelock = torture_ww_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_ww_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "ww_mutex_lock"
+};
+
+#ifdef CONFIG_RT_MUTEXES
+static DEFINE_RT_MUTEX(torture_rtmutex);
+
+static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
+}
+
+static void torture_rtmutex_boost(struct torture_random_state *trsp)
+{
+ int policy;
+ struct sched_param param;
+ const unsigned int factor = 50000; /* yes, quite arbitrary */
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every ~50k operations. When the
+ * task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ policy = SCHED_FIFO;
+ param.sched_priority = MAX_RT_PRIO - 1;
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another ~500k operations,
+ * then restored back to its original prio, and so forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ policy = SCHED_NORMAL;
+ param.sched_priority = 0;
+ } else /* common case, do nothing */
+ return;
+ }
+
+ sched_setscheduler_nocheck(current, policy, &param);
+}
+
+static void torture_rtmutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /*
+ * We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+{
+ rt_mutex_unlock(&torture_rtmutex);
+}
+
+static struct lock_torture_ops rtmutex_lock_ops = {
+ .writelock = torture_rtmutex_lock,
+ .write_delay = torture_rtmutex_delay,
+ .task_boost = torture_rtmutex_boost,
+ .writeunlock = torture_rtmutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "rtmutex_lock"
+};
+#endif
+
+static DECLARE_RWSEM(torture_rwsem);
+static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+{
+ down_write(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 10);
+ else
+ mdelay(longdelay_ms / 10);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+{
+ up_write(&torture_rwsem);
+}
+
+static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+{
+ down_read(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 2);
+ else
+ mdelay(longdelay_ms / 2);
+ if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+{
+ up_read(&torture_rwsem);
+}
+
+static struct lock_torture_ops rwsem_lock_ops = {
+ .writelock = torture_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_rwsem_up_write,
+ .readlock = torture_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_rwsem_up_read,
+ .name = "rwsem_lock"
+};
+
+#include <linux/percpu-rwsem.h>
+static struct percpu_rw_semaphore pcpu_rwsem;
+
+void torture_percpu_rwsem_init(void)
+{
+ BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
+}
+
+static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+{
+ percpu_down_write(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+{
+ percpu_up_write(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+{
+ percpu_down_read(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+{
+ percpu_up_read(&pcpu_rwsem);
+}
+
+static struct lock_torture_ops percpu_rwsem_lock_ops = {
+ .init = torture_percpu_rwsem_init,
+ .writelock = torture_percpu_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_percpu_rwsem_up_write,
+ .readlock = torture_percpu_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_percpu_rwsem_up_read,
+ .name = "percpu_rwsem_lock"
+};
+
+/*
+ * Lock torture writer kthread. Repeatedly acquires and releases
+ * the lock, checking for duplicate acquisitions.
+ */
+static int lock_torture_writer(void *arg)
+{
+ struct lock_stress_stats *lwsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_writer task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->task_boost(&rand);
+ cxt.cur_ops->writelock();
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = 1;
+ if (WARN_ON_ONCE(lock_is_read_held))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ cxt.cur_ops->write_delay(&rand);
+ lock_is_write_held = 0;
+ cxt.cur_ops->writeunlock();
+
+ stutter_wait("lock_torture_writer");
+ } while (!torture_must_stop());
+
+ cxt.cur_ops->task_boost(NULL); /* reset prio */
+ torture_kthread_stopping("lock_torture_writer");
+ return 0;
+}
+
+/*
+ * Lock torture reader kthread. Repeatedly acquires and releases
+ * the reader lock.
+ */
+static int lock_torture_reader(void *arg)
+{
+ struct lock_stress_stats *lrsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->readlock();
+ lock_is_read_held = 1;
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lrsp->n_lock_fail++; /* rare, but... */
+
+ lrsp->n_lock_acquired++;
+ cxt.cur_ops->read_delay(&rand);
+ lock_is_read_held = 0;
+ cxt.cur_ops->readunlock();
+
+ stutter_wait("lock_torture_reader");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_reader");
+ return 0;
+}
+
+/*
+ * Create an lock-torture-statistics message in the specified buffer.
+ */
+static void __torture_print_stats(char *page,
+ struct lock_stress_stats *statp, bool write)
+{
+ bool fail = 0;
+ int i, n_stress;
+ long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long long sum = 0;
+
+ n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
+ for (i = 0; i < n_stress; i++) {
+ if (statp[i].n_lock_fail)
+ fail = true;
+ sum += statp[i].n_lock_acquired;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
+ }
+ page += sprintf(page,
+ "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ write ? "Writes" : "Reads ",
+ sum, max, min, max / 2 > min ? "???" : "",
+ fail, fail ? "!!!" : "");
+ if (fail)
+ atomic_inc(&cxt.n_lock_torture_errors);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only one
+ * call to this function at a given time!!! This is normally accomplished
+ * by relying on the module system to only have one copy of the module
+ * loaded, and then by giving the lock_torture_stats kthread full control
+ * (or the init/cleanup functions when lock_torture_stats thread is not
+ * running).
+ */
+static void lock_torture_stats_print(void)
+{
+ int size = cxt.nrealwriters_stress * 200 + 8192;
+ char *buf;
+
+ if (cxt.cur_ops->readlock)
+ size += cxt.nrealreaders_stress * 200 + 8192;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lwsa, true);
+ pr_alert("%s", buf);
+ kfree(buf);
+
+ if (cxt.cur_ops->readlock) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lrsa, false);
+ pr_alert("%s", buf);
+ kfree(buf);
+ }
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int lock_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("lock_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ lock_torture_stats_print();
+ torture_shutdown_absorb("lock_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_stats");
+ return 0;
+}
+
+static inline void
+lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
+ const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, cxt.debug_lock ? " [debug]": "",
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
+ verbose, shuffle_interval, stutter, shutdown_secs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void lock_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ /*
+ * Indicates early cleanup, meaning that the test has not run,
+ * such as when passing bogus args when loading the module. As
+ * such, only perform the underlying torture-specific cleanups,
+ * and avoid anything related to locktorture.
+ */
+ if (!cxt.lwsa && !cxt.lrsa)
+ goto end;
+
+ if (writer_tasks) {
+ for (i = 0; i < cxt.nrealwriters_stress; i++)
+ torture_stop_kthread(lock_torture_writer,
+ writer_tasks[i]);
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < cxt.nrealreaders_stress; i++)
+ torture_stop_kthread(lock_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+
+ torture_stop_kthread(lock_torture_stats, stats_task);
+ lock_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&cxt.n_lock_torture_errors))
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: LOCK_HOTPLUG");
+ else
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
+
+end:
+ torture_cleanup_end();
+}
+
+static int __init lock_torture_init(void)
+{
+ int i, j;
+ int firsterr = 0;
+ static struct lock_torture_ops *torture_ops[] = {
+ &lock_busted_ops,
+ &spin_lock_ops, &spin_lock_irq_ops,
+ &rw_lock_ops, &rw_lock_irq_ops,
+ &mutex_lock_ops,
+ &ww_mutex_lock_ops,
+#ifdef CONFIG_RT_MUTEXES
+ &rtmutex_lock_ops,
+#endif
+ &rwsem_lock_ops,
+ &percpu_rwsem_lock_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cxt.cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cxt.cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("lock-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("lock-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+
+ if (nwriters_stress == 0 && nreaders_stress == 0) {
+ pr_alert("lock-torture: must run at least one locking thread\n");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+
+ if (cxt.cur_ops->init)
+ cxt.cur_ops->init();
+
+ if (nwriters_stress >= 0)
+ cxt.nrealwriters_stress = nwriters_stress;
+ else
+ cxt.nrealwriters_stress = 2 * num_online_cpus();
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ if (strncmp(torture_type, "mutex", 5) == 0)
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ if (strncmp(torture_type, "rtmutex", 7) == 0)
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if ((strncmp(torture_type, "spin", 4) == 0) ||
+ (strncmp(torture_type, "rw_lock", 7) == 0))
+ cxt.debug_lock = true;
+#endif
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+ if (nwriters_stress) {
+ lock_is_write_held = 0;
+ cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*cxt.lwsa),
+ GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
+ }
+
+ if (cxt.cur_ops->readlock) {
+ if (nreaders_stress >= 0)
+ cxt.nrealreaders_stress = nreaders_stress;
+ else {
+ /*
+ * By default distribute evenly the number of
+ * readers and writers. We still run the same number
+ * of threads as the writer-only locks default.
+ */
+ if (nwriters_stress < 0) /* user doesn't care */
+ cxt.nrealwriters_stress = num_online_cpus();
+ cxt.nrealreaders_stress = cxt.nrealwriters_stress;
+ }
+
+ if (nreaders_stress) {
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
+ sizeof(*cxt.lrsa),
+ GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
+ }
+ }
+
+ lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
+
+ /* Prepare torture context. */
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ,
+ onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shuffle_interval > 0) {
+ firsterr = torture_shuffle_init(shuffle_interval);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs,
+ lock_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter);
+ if (firsterr)
+ goto unwind;
+ }
+
+ if (nwriters_stress) {
+ writer_tasks = kcalloc(cxt.nrealwriters_stress,
+ sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ }
+
+ if (cxt.cur_ops->readlock) {
+ reader_tasks = kcalloc(cxt.nrealreaders_stress,
+ sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ }
+
+ /*
+ * Create the kthreads and start torturing (oh, those poor little locks).
+ *
+ * TODO: Note that we interleave writers with readers, giving writers a
+ * slight advantage, by creating its kthread first. This can be modified
+ * for very specific needs, or even let the user choose the policy, if
+ * ever wanted.
+ */
+ for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
+ j < cxt.nrealreaders_stress; i++, j++) {
+ if (i >= cxt.nrealwriters_stress)
+ goto create_reader;
+
+ /* Create writer. */
+ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i]);
+ if (firsterr)
+ goto unwind;
+
+ create_reader:
+ if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
+ continue;
+ /* Create reader. */
+ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
+ reader_tasks[j]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(lock_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ lock_torture_cleanup();
+ return firsterr;
+}
+
+module_init(lock_torture_init);
+module_exit(lock_torture_cleanup);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000..5e10153b4
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+#include <asm/mcs_spinlock.h>
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+ int count; /* nesting count, see qspinlock.c */
+};
+
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ smp_cond_load_acquire(l, VAL); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section has been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * We rely on the full barrier with global transitivity implied by the
+ * below xchg() to order the initialization stores above against any
+ * observation of @node. And to provide the ACQUIRE ordering associated
+ * with a LOCK primitive.
+ */
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /*
+ * Lock acquired, don't need to set node->locked to 1. Threads
+ * only spin on its own node->locked value for lock acquisition.
+ * However, since this thread can immediately acquire the lock
+ * and does not proceed to spin on its own node->locked, this
+ * value won't be used. If a debug mode is needed to
+ * audit lock status, then set node->locked value here.
+ */
+ return;
+ }
+ WRITE_ONCE(prev->next, node);
+
+ /* Wait until the lock holder passes the lock down. */
+ arch_mcs_spin_lock_contended(&node->locked);
+}
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = READ_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg_release(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
+
+ /* Pass lock to next waiter. */
+ arch_mcs_spin_unlock_contended(&next->locked);
+}
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
new file mode 100644
index 000000000..839df4383
--- /dev/null
+++ b/kernel/locking/mutex-debug.c
@@ -0,0 +1,107 @@
+/*
+ * kernel/mutex-debug.c
+ *
+ * Debugging code for mutexes
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * lock debugging, locking tree, deadlock detection started by:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ */
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+
+#include "mutex-debug.h"
+
+/*
+ * Must be called with lock->wait_lock held.
+ */
+void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
+ waiter->magic = waiter;
+ INIT_LIST_HEAD(&waiter->list);
+}
+
+void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+}
+
+void debug_mutex_free_waiter(struct mutex_waiter *waiter)
+{
+ DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
+ memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
+}
+
+void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+
+ /* Mark the current thread as blocked on the lock: */
+ task->blocked_on = waiter;
+}
+
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(waiter->task != task);
+ DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+ task->blocked_on = NULL;
+
+ INIT_LIST_HEAD(&waiter->list);
+ waiter->task = NULL;
+}
+
+void debug_mutex_unlock(struct mutex *lock)
+{
+ if (likely(debug_locks)) {
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ }
+}
+
+void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->magic = lock;
+}
+
+/***
+ * mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void mutex_destroy(struct mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
+ lock->magic = NULL;
+}
+
+EXPORT_SYMBOL_GPL(mutex_destroy);
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
new file mode 100644
index 000000000..53e631e1d
--- /dev/null
+++ b/kernel/locking/mutex-debug.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal declarations,
+ * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
+ * More details are in kernel/mutex-debug.c.
+ */
+
+/*
+ * This must be called with lock->wait_lock held.
+ */
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
new file mode 100644
index 000000000..fbc62d360
--- /dev/null
+++ b/kernel/locking/mutex.c
@@ -0,0 +1,1451 @@
+/*
+ * kernel/locking/mutex.c
+ *
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
+ * David Howells for suggestions and improvements.
+ *
+ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ * from the -rt tree, where it was originally implemented for rtmutexes
+ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ * and Sven Dietrich.
+ *
+ * Also see Documentation/locking/mutex-design.txt.
+ */
+#include <linux/mutex.h>
+#include <linux/ww_mutex.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/osq_lock.h>
+
+#ifdef CONFIG_DEBUG_MUTEXES
+# include "mutex-debug.h"
+#else
+# include "mutex.h"
+#endif
+
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ atomic_long_set(&lock->owner, 0);
+ spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ osq_lock_init(&lock->osq);
+#endif
+
+ debug_mutex_init(lock, name, key);
+}
+EXPORT_SYMBOL(__mutex_init);
+
+/*
+ * @owner: contains: 'struct task_struct *' to the current lock owner,
+ * NULL means not owned. Since task_struct pointers are aligned at
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
+ *
+ * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
+ */
+#define MUTEX_FLAG_WAITERS 0x01
+#define MUTEX_FLAG_HANDOFF 0x02
+#define MUTEX_FLAG_PICKUP 0x04
+
+#define MUTEX_FLAGS 0x07
+
+static inline struct task_struct *__owner_task(unsigned long owner)
+{
+ return (struct task_struct *)(owner & ~MUTEX_FLAGS);
+}
+
+static inline unsigned long __owner_flags(unsigned long owner)
+{
+ return owner & MUTEX_FLAGS;
+}
+
+/*
+ * Trylock variant that retuns the owning task on failure.
+ */
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+{
+ unsigned long owner, curr = (unsigned long)current;
+
+ owner = atomic_long_read(&lock->owner);
+ for (;;) { /* must loop, can race against a flag */
+ unsigned long old, flags = __owner_flags(owner);
+ unsigned long task = owner & ~MUTEX_FLAGS;
+
+ if (task) {
+ if (likely(task != curr))
+ break;
+
+ if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ break;
+
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
+#endif
+ }
+
+ /*
+ * We set the HANDOFF bit, we must make sure it doesn't live
+ * past the point where we acquire it. This would be possible
+ * if we (accidentally) set the bit on an unlocked mutex.
+ */
+ flags &= ~MUTEX_FLAG_HANDOFF;
+
+ old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
+ if (old == owner)
+ return NULL;
+
+ owner = old;
+ }
+
+ return __owner_task(owner);
+}
+
+/*
+ * Actual trylock that will work on any unlocked state.
+ */
+static inline bool __mutex_trylock(struct mutex *lock)
+{
+ return !__mutex_trylock_or_owner(lock);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Lockdep annotations are contained to the slow paths for simplicity.
+ * There is nothing that would stop spreading the lockdep annotations outwards
+ * except more code.
+ */
+
+/*
+ * Optimistic trylock that only works in the uncontended case. Make sure to
+ * follow with a __mutex_trylock() before failing.
+ */
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;
+
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
+ return true;
+
+ return false;
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+
+ if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+ return true;
+
+ return false;
+}
+#endif
+
+static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_or(flag, &lock->owner);
+}
+
+static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_andnot(flag, &lock->owner);
+}
+
+static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
+}
+
+/*
+ * Add @waiter to a given location in the lock wait_list and set the
+ * FLAG_WAITERS flag if it's the first waiter.
+ */
+static void
+__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct list_head *list)
+{
+ debug_mutex_add_waiter(lock, waiter, current);
+
+ list_add_tail(&waiter->list, list);
+ if (__mutex_waiter_is_first(lock, waiter))
+ __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+}
+
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+}
+
+/*
+ * Give up ownership to a specific task, when @task = NULL, this is equivalent
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * WAITERS. Provides RELEASE semantics like a regular unlock, the
+ * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
+ */
+static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ for (;;) {
+ unsigned long old, new;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+ DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+#endif
+
+ new = (owner & MUTEX_FLAG_WAITERS);
+ new |= (unsigned long)task;
+ if (task)
+ new |= MUTEX_FLAG_PICKUP;
+
+ old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
+ if (old == owner)
+ break;
+
+ owner = old;
+ }
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * We split the mutex lock/unlock logic into separate fastpath and
+ * slowpath functions, to reduce the register pressure on the fastpath.
+ * We also put the fastpath first in the kernel image, to make sure the
+ * branch is predicted by the CPU as default-untaken.
+ */
+static void __sched __mutex_lock_slowpath(struct mutex *lock);
+
+/**
+ * mutex_lock - acquire the mutex
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex exclusively for this task. If the mutex is not
+ * available right now, it will sleep until it can get it.
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. Recursive locking is not allowed. The task
+ * may not exit without first unlocking the mutex. Also, kernel
+ * memory where the mutex resides must not be freed with
+ * the mutex still locked. The mutex must first be initialized
+ * (or statically defined) before it can be locked. memset()-ing
+ * the mutex to 0 is not allowed.
+ *
+ * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging)
+ *
+ * This function is similar to (but not equivalent to) down().
+ */
+void __sched mutex_lock(struct mutex *lock)
+{
+ might_sleep();
+
+ if (!__mutex_trylock_fast(lock))
+ __mutex_lock_slowpath(lock);
+}
+EXPORT_SYMBOL(mutex_lock);
+#endif
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if context @a is 'after' context @b. IOW, @a is a younger
+ * transaction than @b and depending on algorithm either needs to wait for
+ * @b or die.
+ */
+static inline bool __sched
+__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a younger waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool __sched
+__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 &&
+ __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
+ debug_mutex_wake_waiter(lock, waiter);
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with older transactions than
+ * the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __mutex_owner(lock);
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are later contexts waiting
+ * behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void __sched
+__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct mutex_waiter *cur;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ list_for_each_entry(cur, &lock->wait_list, list) {
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ spin_lock(&lock->base.wait_lock);
+ __ww_mutex_check_waiters(&lock->base, ctx);
+ spin_unlock(&lock->base.wait_lock);
+}
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline
+bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
+{
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ *
+ * Check this in every inner iteration because we may
+ * be racing against another thread's ww_mutex_lock.
+ */
+ if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx))
+ return false;
+
+ /*
+ * If we aren't on the wait list yet, cancel the spin
+ * if there are waiters. We want to avoid stealing the
+ * lock from a waiter with an earlier stamp, since the
+ * other thread may already own a lock that we also
+ * need.
+ */
+ if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS))
+ return false;
+
+ /*
+ * Similarly, stop spinning if we are no longer the
+ * first waiter.
+ */
+ if (waiter && !__mutex_waiter_is_first(lock, waiter))
+ return false;
+
+ return true;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer access and not
+ * reliable.
+ *
+ * "noinline" so that this function shows up on perf profiles.
+ */
+static noinline
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
+ struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter)
+{
+ bool ret = true;
+
+ rcu_read_lock();
+ while (__mutex_owner(lock) == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /*
+ * Use vcpu_is_preempted to detect lock holder preemption issue.
+ */
+ if (!owner->on_cpu || need_resched() ||
+ vcpu_is_preempted(task_cpu(owner))) {
+ ret = false;
+ break;
+ }
+
+ if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) {
+ ret = false;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = __mutex_owner(lock);
+
+ /*
+ * As lock holder preemption issue, we both skip spinning if task is not
+ * on cpu or its cpu is preempted
+ */
+ if (owner)
+ retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+ rcu_read_unlock();
+
+ /*
+ * If lock->owner is not set, the mutex has been released. Return true
+ * such that we'll trylock in the spin path, which is a faster option
+ * than the blocking slow path.
+ */
+ return retval;
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ *
+ * The waiter flag is set to true if the spinner is a waiter in the wait
+ * queue. The waiter-spinner will spin on the lock directly and concurrently
+ * with the spinner at the head of the OSQ, if present, until the owner is
+ * changed to itself.
+ */
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
+{
+ if (!waiter) {
+ /*
+ * The purpose of the mutex_can_spin_on_owner() function is
+ * to eliminate the overhead of osq_lock() and osq_unlock()
+ * in case spinning isn't possible. As a waiter-spinner
+ * is not going to take OSQ lock anyway, there is no need
+ * to call mutex_can_spin_on_owner().
+ */
+ if (!mutex_can_spin_on_owner(lock))
+ goto fail;
+
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
+ if (!osq_lock(&lock->osq))
+ goto fail;
+ }
+
+ for (;;) {
+ struct task_struct *owner;
+
+ /* Try to acquire the mutex... */
+ owner = __mutex_trylock_or_owner(lock);
+ if (!owner)
+ break;
+
+ /*
+ * There's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter))
+ goto fail_unlock;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+ return true;
+
+
+fail_unlock:
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+fail:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
+ schedule_preempt_disabled();
+ }
+
+ return false;
+}
+#else
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
+{
+ return false;
+}
+#endif
+
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
+
+/**
+ * mutex_unlock - release the mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a not locked mutex is not allowed.
+ *
+ * This function is similar to (but not equivalent to) up().
+ */
+void __sched mutex_unlock(struct mutex *lock)
+{
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+ if (__mutex_unlock_fast(lock))
+ return;
+#endif
+ __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+/**
+ * ww_mutex_unlock - release the w/w mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously with any of the
+ * ww_mutex_lock* functions (with or without an acquire context). It is
+ * forbidden to release the locks after releasing the acquire context.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a unlocked mutex is not allowed.
+ */
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+
+ mutex_unlock(&lock->base);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+
+static __always_inline int __sched
+__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int __sched
+__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct mutex_waiter *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ cur = waiter;
+ list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int __sched
+__ww_mutex_add_waiter(struct mutex_waiter *waiter,
+ struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct mutex_waiter *cur;
+ struct list_head *pos;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __mutex_add_waiter(lock, waiter, &lock->wait_list);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ pos = &lock->wait_list;
+ list_for_each_entry_reverse(cur, &lock->wait_list, list) {
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = &cur->list;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __mutex_add_waiter(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Lock a mutex (possibly interruptible), slowpath:
+ */
+static __always_inline int __sched
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct mutex_waiter waiter;
+ struct ww_mutex *ww;
+ int ret;
+
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
+ might_sleep();
+
+ ww = container_of(lock, struct ww_mutex, base);
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+ }
+
+ preempt_disable();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+
+ if (__mutex_trylock(lock) ||
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
+ /* got the lock, yay! */
+ lock_acquired(&lock->dep_map, ip);
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ preempt_enable();
+ return 0;
+ }
+
+ spin_lock(&lock->wait_lock);
+ /*
+ * After waiting to acquire the wait_lock, try again.
+ */
+ if (__mutex_trylock(lock)) {
+ if (ww_ctx)
+ __ww_mutex_check_waiters(lock, ww_ctx);
+
+ goto skip_wait;
+ }
+
+ debug_mutex_lock_common(lock, &waiter);
+
+ lock_contended(&lock->dep_map, ip);
+
+ if (!use_ww_ctx) {
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ __mutex_add_waiter(lock, &waiter, &lock->wait_list);
+
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ waiter.ww_ctx = MUTEX_POISON_WW_CTX;
+#endif
+ } else {
+ /*
+ * Add in stamp order, waking up waiters that must kill
+ * themselves.
+ */
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+ if (ret)
+ goto err_early_kill;
+
+ waiter.ww_ctx = ww_ctx;
+ }
+
+ waiter.task = current;
+
+ set_current_state(state);
+ for (;;) {
+ bool first;
+
+ /*
+ * Once we hold wait_lock, we're serialized against
+ * mutex_unlock() handing the lock off to us, do a trylock
+ * before testing the error conditions to make sure we pick up
+ * the handoff.
+ */
+ if (__mutex_trylock(lock))
+ goto acquired;
+
+ /*
+ * Check for signals and kill conditions while holding
+ * wait_lock. This ensures the lock cancellation is ordered
+ * against mutex_unlock() and wake-ups do not go missing.
+ */
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ if (ww_ctx) {
+ ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
+ if (ret)
+ goto err;
+ }
+
+ spin_unlock(&lock->wait_lock);
+ schedule_preempt_disabled();
+
+ first = __mutex_waiter_is_first(lock, &waiter);
+ if (first)
+ __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+
+ set_current_state(state);
+ /*
+ * Here we order against unlock; we must either see it change
+ * state back to RUNNING and fall through the next schedule(),
+ * or we must see its unlock and acquire.
+ */
+ if (__mutex_trylock(lock) ||
+ (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+ break;
+
+ spin_lock(&lock->wait_lock);
+ }
+ spin_lock(&lock->wait_lock);
+acquired:
+ __set_current_state(TASK_RUNNING);
+
+ if (ww_ctx) {
+ /*
+ * Wound-Wait; we stole the lock (!first_waiter), check the
+ * waiters as anyone might want to wound us.
+ */
+ if (!ww_ctx->is_wait_die &&
+ !__mutex_waiter_is_first(lock, &waiter))
+ __ww_mutex_check_waiters(lock, ww_ctx);
+ }
+
+ __mutex_remove_waiter(lock, &waiter);
+
+ debug_mutex_free_waiter(&waiter);
+
+skip_wait:
+ /* got the lock - cleanup and rejoice! */
+ lock_acquired(&lock->dep_map, ip);
+
+ if (ww_ctx)
+ ww_mutex_lock_acquired(ww, ww_ctx);
+
+ spin_unlock(&lock->wait_lock);
+ preempt_enable();
+ return 0;
+
+err:
+ __set_current_state(TASK_RUNNING);
+ __mutex_remove_waiter(lock, &waiter);
+err_early_kill:
+ spin_unlock(&lock->wait_lock);
+ debug_mutex_free_waiter(&waiter);
+ mutex_release(&lock->dep_map, 1, ip);
+ preempt_enable();
+ return ret;
+}
+
+static int __sched
+__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
+}
+
+static int __sched
+__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched
+mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+{
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) {
+ tmp = ctx->deadlock_inject_interval;
+ if (tmp > UINT_MAX/4)
+ tmp = UINT_MAX;
+ else
+ tmp = tmp*2 + tmp + tmp/2;
+
+ ctx->deadlock_inject_interval = tmp;
+ ctx->deadlock_inject_countdown = tmp;
+ ctx->contending_lock = lock;
+
+ ww_mutex_unlock(lock);
+
+ return -EDEADLK;
+ }
+#endif
+
+ return 0;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
+ 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+ ctx);
+ if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
+ 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+ ctx);
+
+ if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
+
+#endif
+
+/*
+ * Release the lock, slowpath:
+ */
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+{
+ struct task_struct *next = NULL;
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long owner;
+
+ mutex_release(&lock->dep_map, 1, ip);
+
+ /*
+ * Release the lock before (potentially) taking the spinlock such that
+ * other contenders can get on with things ASAP.
+ *
+ * Except when HANDOFF, in that case we must not clear the owner field,
+ * but instead set it to the top waiter.
+ */
+ owner = atomic_long_read(&lock->owner);
+ for (;;) {
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+ DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+#endif
+
+ if (owner & MUTEX_FLAG_HANDOFF)
+ break;
+
+ old = atomic_long_cmpxchg_release(&lock->owner, owner,
+ __owner_flags(owner));
+ if (old == owner) {
+ if (owner & MUTEX_FLAG_WAITERS)
+ break;
+
+ return;
+ }
+
+ owner = old;
+ }
+
+ spin_lock(&lock->wait_lock);
+ debug_mutex_unlock(lock);
+ if (!list_empty(&lock->wait_list)) {
+ /* get the first entry from the wait-list: */
+ struct mutex_waiter *waiter =
+ list_first_entry(&lock->wait_list,
+ struct mutex_waiter, list);
+
+ next = waiter->task;
+
+ debug_mutex_wake_waiter(lock, waiter);
+ wake_q_add(&wake_q, next);
+ }
+
+ if (owner & MUTEX_FLAG_HANDOFF)
+ __mutex_handoff(lock, next);
+
+ spin_unlock(&lock->wait_lock);
+
+ wake_up_q(&wake_q);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Here come the less common (and hence less performance-critical) APIs:
+ * mutex_lock_interruptible() and mutex_trylock().
+ */
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock);
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock);
+
+/**
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
+ */
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ might_sleep();
+
+ if (__mutex_trylock_fast(lock))
+ return 0;
+
+ return __mutex_lock_interruptible_slowpath(lock);
+}
+
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ might_sleep();
+
+ if (__mutex_trylock_fast(lock))
+ return 0;
+
+ return __mutex_lock_killable_slowpath(lock);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
+
+static noinline void __sched
+__mutex_lock_slowpath(struct mutex *lock)
+{
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock)
+{
+ return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock)
+{
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+
+static noinline int __sched
+__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ _RET_IP_, ctx);
+}
+
+static noinline int __sched
+__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ _RET_IP_, ctx);
+}
+
+#endif
+
+/**
+ * mutex_trylock - try to acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated from the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+int __sched mutex_trylock(struct mutex *lock)
+{
+ bool locked = __mutex_trylock(lock);
+
+ if (locked)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return locked;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ might_sleep();
+
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
+
+ return __ww_mutex_lock_slowpath(lock, ctx);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ might_sleep();
+
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
+
+ return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+#endif
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
new file mode 100644
index 000000000..f0c710b1d
--- /dev/null
+++ b/kernel/locking/mutex.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal prototypes, for the
+ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
+ */
+
+#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+#define debug_mutex_free_waiter(waiter) do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+#define debug_mutex_unlock(lock) do { } while (0)
+#define debug_mutex_init(lock, name, key) do { } while (0)
+
+static inline void
+debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+}
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
new file mode 100644
index 000000000..6ef600aa0
--- /dev/null
+++ b/kernel/locking/osq_lock.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/osq_lock.h>
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+ return node->cpu - 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
+
+ for (;;) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ break;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+
+ cpu_relax();
+ }
+
+ return next;
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+
+ /*
+ * We need both ACQUIRE (pairs with corresponding RELEASE in
+ * unlock() uncontended, or fastpath) and RELEASE (to publish
+ * the node fields we just initialised) semantics when updating
+ * the lock tail.
+ */
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
+ return true;
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(prev->next, node);
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ while (!READ_ONCE(node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
+ * Use vcpu_is_preempted() to avoid waiting for a preempted
+ * lock holder:
+ */
+ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
+ goto unqueue;
+
+ cpu_relax();
+ }
+ return true;
+
+unqueue:
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ if (prev->next == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becomming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ cpu_relax();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = READ_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next(lock, node, prev);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return false;
+}
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
+
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (likely(atomic_cmpxchg_release(&lock->tail, curr,
+ OSQ_UNLOCKED_VAL) == curr))
+ return;
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, 1);
+ return;
+ }
+
+ next = osq_wait_next(lock, node, NULL);
+ if (next)
+ WRITE_ONCE(next->locked, 1);
+}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 000000000..883cf1b92
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,192 @@
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/percpu.h>
+#include <linux/lockdep.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
+ const char *name, struct lock_class_key *rwsem_key)
+{
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
+ return -ENOMEM;
+
+ /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ rcuwait_init(&sem->writer);
+ sem->readers_block = 0;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
+
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
+{
+ /*
+ * XXX: temporary kludge. The error path in alloc_super()
+ * assumes that percpu_free_rwsem() is safe after kzalloc().
+ */
+ if (!sem->read_count)
+ return;
+
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
+}
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
+
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+{
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
+ */
+
+ smp_mb(); /* A matches D */
+
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;
+
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
+
+ if (try)
+ return 0;
+
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
+
+ /*
+ * Avoid lockdep for the down/up_read() we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ this_cpu_inc(*sem->read_count);
+ __up_read(&sem->rw_sem);
+
+ preempt_disable();
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to recheck readers_active */
+ rcuwait_wake_up(&sem->writer);
+}
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
+{
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
+
+ smp_mb(); /* C matches B */
+
+ return true;
+}
+
+void percpu_down_write(struct percpu_rw_semaphore *sem)
+{
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ down_write(&sem->rw_sem);
+
+ /*
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
+ */
+ WRITE_ONCE(sem->readers_block, 1);
+
+ smp_mb(); /* D matches A */
+
+ /*
+ * If they don't see our writer of readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
+ */
+
+ /* Wait for all now active readers to complete. */
+ rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+}
+EXPORT_SYMBOL_GPL(percpu_down_write);
+
+void percpu_up_write(struct percpu_rw_semaphore *sem)
+{
+ /*
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
+ */
+ smp_store_release(&sem->readers_block, 0);
+
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
+
+ /*
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because its counting.
+ */
+ rcu_sync_exit(&sem->rss);
+}
+EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000..16c09cda3
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,93 @@
+/*
+ * Queued read/write locks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <asm/qrwlock.h>
+
+/**
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queued_read_lock_slowpath(struct qrwlock *lock)
+{
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
+ */
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
+
+ /*
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
+ */
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL(queued_read_lock_slowpath);
+
+/**
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queued_write_lock_slowpath(struct qrwlock *lock)
+{
+ int cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_add(_QW_WAITING, &lock->cnts);
+
+ /* When no more readers or writers, set the locked flag */
+ do {
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
+unlock:
+ arch_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000..edd75e0c7
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,541 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES 8
+#else
+#define MAX_NODES 4
+#endif
+
+/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline __pure u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(idx > 3);
+#endif
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ /*
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
+ */
+ return (u32)xchg_relaxed(&lock->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new, val = atomic_read(&lock->val);
+
+ for (;;) {
+ new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ /*
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (pv_enabled())
+ goto pv_queue;
+
+ if (virt_spin_lock(lock))
+ return;
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe any contention; undo and queue.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+ goto queue;
+ }
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> 0,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK)
+ atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ qstat_inc(qstat_lock_pending, true);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ qstat_inc(qstat_lock_slowpath, true);
+pv_queue:
+ node = this_cpu_ptr(&mcs_nodes[0]);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ node += idx;
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node, prev);
+ arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+ * been designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
+ */
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+
+ val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
+
+locked:
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * In the PV case we might already have _Q_LOCKED_VAL set.
+ *
+ * The atomic_cond_read_acquire() call above has provided the
+ * necessary acquire semantics required for locking.
+ */
+ if (((val & _Q_TAIL_MASK) == tail) &&
+ atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+
+ /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(lock, next);
+
+release:
+ /*
+ * release the node
+ */
+ __this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head_or_lock
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000..82104d3dd
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,563 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+#include <linux/debug_locks.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ * pv_kick(cpu) -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+
+/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
+enum vcpu_state {
+ vcpu_running = 0,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
+};
+
+struct pv_node {
+ struct mcs_spinlock mcs;
+ struct mcs_spinlock __res[3];
+
+ int cpu;
+ u8 state;
+};
+
+/*
+ * Hybrid PV queued/unfair lock
+ *
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
+ */
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
+{
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
+ }
+
+ return false;
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 1);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ return !READ_ONCE(lock->locked) &&
+ (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
+ _Q_LOCKED_VAL) == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_or(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg_acquire(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+ struct qspinlock *lock;
+ struct pv_node *node;
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+ int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+ if (pv_hash_size < PV_HE_MIN)
+ pv_hash_size = PV_HE_MIN;
+
+ /*
+ * Allocate space from bootmem which should be page-size aligned
+ * and hence cacheline aligned.
+ */
+ pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+ sizeof(struct pv_hash_entry),
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
+ &pv_lock_hash_bits, NULL,
+ pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash) \
+ for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \
+ offset < (1 << pv_lock_hash_bits); \
+ offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ int hopcnt = 0;
+
+ for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
+ if (!cmpxchg(&he->lock, NULL, lock)) {
+ WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
+ return &he->lock;
+ }
+ }
+ /*
+ * Hard assume there is a free entry for us.
+ *
+ * This is guaranteed by ensuring every blocked lock only ever consumes
+ * a single entry, and since we only have 4 nesting levels per CPU
+ * and allocated 4*nr_possible_cpus(), this must be so.
+ *
+ * The single entry is guaranteed by having the lock owner unhash
+ * before it releases.
+ */
+ BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ struct pv_node *node;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ node = READ_ONCE(he->node);
+ WRITE_ONCE(he->lock, NULL);
+ return node;
+ }
+ }
+ /*
+ * Hard assume we'll find an entry.
+ *
+ * This guarantees a limited lookup time and is itself guaranteed by
+ * having the lock owner do the unhash -- IFF the unlock sees the
+ * SLOW flag, there MUST be a hash entry.
+ */
+ BUG();
+}
+
+/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+ pn->cpu = smp_processor_id();
+ pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
+ */
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int loop;
+ bool wait_early;
+
+ for (;;) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
+ if (READ_ONCE(node->locked))
+ return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * Order pn->state vs pn->locked thusly:
+ *
+ * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * MB MB
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
+ *
+ * Matches the cmpxchg() from pv_kick_node().
+ */
+ smp_store_mb(pn->state, vcpu_halted);
+
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_early, wait_early);
+ pv_wait(&pn->state, vcpu_halted);
+ }
+
+ /*
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
+ */
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+
+ /*
+ * If the locked flag is still not set after wakeup, it is a
+ * spurious wakeup and the vCPU should wait again. However,
+ * there is a pretty high overhead for CPU halting and kicking.
+ * So it is better to spin for a while in the hope that the
+ * MCS lock will be released soon.
+ */
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
+ }
+
+ /*
+ * By now our node->locked should be 1 and our caller will not actually
+ * spin-wait for it. We do however rely on our caller to do a
+ * load-acquire for us.
+ */
+}
+
+/*
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
+ */
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ /*
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
+ *
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
+ */
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
+ != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
+ *
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
+ */
+ WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
+}
+
+/*
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
+ */
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct qspinlock **lp = NULL;
+ int waitcnt = 0;
+ int loop;
+
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
+ /*
+ * Tracking # of slowpath locking operations
+ */
+ qstat_inc(qstat_lock_slowpath, true);
+
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (trylock_clear_pending(lock))
+ goto gotlock;
+ cpu_relax();
+ }
+ clear_pending(lock);
+
+
+ if (!lp) { /* ONCE */
+ lp = pv_hash(lock, pn);
+
+ /*
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
+ *
+ * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ *
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
+ */
+ if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
+ /*
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
+ */
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(*lp, NULL);
+ goto gotlock;
+ }
+ }
+ WRITE_ONCE(pn->state, vcpu_hashed);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ pv_wait(&lock->locked, _Q_SLOW_VAL);
+
+ /*
+ * Because of lock stealing, the queue head vCPU may not be
+ * able to acquire the lock before it has to wait again.
+ */
+ }
+
+ /*
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nozero to enable better code optimization.
+ */
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
+}
+
+/*
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
+ */
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
+{
+ struct pv_node *node;
+
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
+ return;
+ }
+
+ /*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
+ * Since the above failed to release, this must be the SLOW path.
+ * Therefore start by looking up the blocked node and unhashing it.
+ */
+ node = pv_unhash(lock);
+
+ /*
+ * Now that we have a reference to the (likely) blocked pv_node,
+ * release the lock.
+ */
+ smp_store_release(&lock->locked, 0);
+
+ /*
+ * At this point the memory pointed at by lock can be freed/reused,
+ * however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
+ */
+ qstat_inc(qstat_pv_kick_unlock, true);
+ pv_kick(node->cpu);
+}
+
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000..6bd78c074
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,291 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
+ * pv_wait_again - # of wait's after a queue head vCPU kick
+ * pv_wait_early - # of early vCPU wait's
+ * pv_wait_head - # of vCPU wait's at the queue head
+ * pv_wait_node - # of vCPU wait's at a non-head queue node
+ * lock_pending - # of locking operations via pending code
+ * lock_slowpath - # of locking operations via MCS lock queue
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_lock_pending,
+ qstat_lock_slowpath,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+ [qstat_pv_hash_hops] = "pv_hash_hops",
+ [qstat_pv_kick_unlock] = "pv_kick_unlock",
+ [qstat_pv_kick_wake] = "pv_kick_wake",
+ [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+ [qstat_pv_latency_kick] = "pv_latency_kick",
+ [qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
+ [qstat_pv_wait_again] = "pv_wait_again",
+ [qstat_pv_wait_early] = "pv_wait_early",
+ [qstat_pv_wait_head] = "pv_wait_head",
+ [qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_lock_pending] = "lock_pending",
+ [qstat_lock_slowpath] = "lock_slowpath",
+ [qstat_reset_cnts] = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, counter, len;
+ u64 stat = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ counter = (long)file_inode(file)->i_private;
+
+ if (counter >= qstat_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ stat += per_cpu(qstats[counter], cpu);
+ /*
+ * Need to sum additional counter for some of them
+ */
+ switch (counter) {
+
+ case qstat_pv_latency_kick:
+ case qstat_pv_hash_hops:
+ kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ break;
+
+ case qstat_pv_latency_wake:
+ kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ break;
+ }
+ }
+
+ if (counter == qstat_pv_hash_hops) {
+ u64 frac = 0;
+
+ if (kicks) {
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ }
+
+ /*
+ * Return a X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((counter == qstat_pv_latency_kick) ||
+ (counter == qstat_pv_latency_wake)) {
+ if (kicks)
+ stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if ((long)file_inode(file)->i_private != qstat_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+ struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+ int i;
+
+ if (!d_qstat)
+ goto out;
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < qstat_num; i++)
+ if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat))
+ goto fail_undo;
+
+ if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat))
+ goto fail_undo;
+
+ return 0;
+fail_undo:
+ debugfs_remove_recursive(d_qstat);
+out:
+ pr_warn("Could not create 'qlockstat' debugfs entries\n");
+ return -ENOMEM;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+ if (cond)
+ this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(qstats[qstat_pv_latency_wake],
+ sched_clock() - *pkick_time);
+ qstat_inc(qstat_pv_kick_wake, true);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
new file mode 100644
index 000000000..fd4fe1f5b
--- /dev/null
+++ b/kernel/locking/rtmutex-debug.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/debug_locks.h>
+
+#include "rtmutex_common.h"
+
+static void printk_task(struct task_struct *p)
+{
+ if (p)
+ printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+ if (lock->name)
+ printk(" [%p] {%s}\n",
+ lock, lock->name);
+ else
+ printk(" [%p] {%s:%d}\n",
+ lock, lock->file, lock->line);
+
+ if (print_owner && rt_mutex_owner(lock)) {
+ printk(".. ->owner: %p\n", lock->owner);
+ printk(".. held by: ");
+ printk_task(rt_mutex_owner(lock));
+ printk("\n");
+ }
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *act_waiter,
+ struct rt_mutex *lock)
+{
+ struct task_struct *task;
+
+ if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
+ return;
+
+ task = rt_mutex_owner(act_waiter->lock);
+ if (task && task != current) {
+ act_waiter->deadlock_task_pid = get_pid(task_pid(task));
+ act_waiter->deadlock_lock = lock;
+ }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+ struct task_struct *task;
+
+ if (!waiter->deadlock_lock || !debug_locks)
+ return;
+
+ rcu_read_lock();
+ task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (!debug_locks_off()) {
+ rcu_read_unlock();
+ return;
+ }
+
+ pr_warn("\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: circular locking deadlock detected!\n");
+ pr_warn("%s\n", print_tainted());
+ pr_warn("--------------------------------------------\n");
+ printk("%s/%d is deadlocking current task %s/%d\n\n",
+ task->comm, task_pid_nr(task),
+ current->comm, task_pid_nr(current));
+
+ printk("\n1) %s/%d is trying to acquire this lock:\n",
+ current->comm, task_pid_nr(current));
+ printk_lock(waiter->lock, 1);
+
+ printk("\n2) %s/%d is blocked on this lock:\n",
+ task->comm, task_pid_nr(task));
+ printk_lock(waiter->deadlock_lock, 1);
+
+ debug_show_held_locks(current);
+ debug_show_held_locks(task);
+
+ printk("\n%s/%d's [blocked] stackdump:\n\n",
+ task->comm, task_pid_nr(task));
+ show_stack(task, NULL);
+ printk("\n%s/%d's [current] stackdump:\n\n",
+ current->comm, task_pid_nr(current));
+ dump_stack();
+ debug_show_all_locks();
+ rcu_read_unlock();
+
+ printk("[ turning off deadlock detection."
+ "Please report this trace. ]\n\n");
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock)
+{
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
+
+void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
+{
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ memset(waiter, 0x11, sizeof(*waiter));
+ waiter->deadlock_task_pid = NULL;
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ put_pid(waiter->deadlock_task_pid);
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
+{
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lock->name = name;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
new file mode 100644
index 000000000..fc549713b
--- /dev/null
+++ b/kernel/locking/rtmutex-debug.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w) \
+ do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk walk)
+{
+ return (waiter != NULL);
+}
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
new file mode 100644
index 000000000..a5ec4f685
--- /dev/null
+++ b/kernel/locking/rtmutex.c
@@ -0,0 +1,1923 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ * Copyright (C) 2006 Esben Nielsen
+ *
+ * See Documentation/locking/rt-mutex-design.txt for details.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0
+ * is used to keep track of the "lock has waiters" state.
+ *
+ * owner bit0
+ * NULL 0 lock is free (fast acquire possible)
+ * NULL 1 lock is free and has waiters and the top waiter
+ * is going to take the lock*
+ * taskpointer 0 lock is held (fast release possible)
+ * taskpointer 1 lock is held and has waiters**
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
+ *
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
+ */
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ if (rt_mutex_has_waiters(lock))
+ return;
+
+ /*
+ * The rbtree has no waiters enqueued, now make sure that the
+ * lock->owner still has the waiters bit set, otherwise the
+ * following can happen:
+ *
+ * CPU 0 CPU 1 CPU2
+ * l->owner=T1
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T2)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ *
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T3)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ * signal(->T2) signal(->T3)
+ * lock(l->lock)
+ * dequeue(T2)
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * dequeue(T3)
+ * ==> wait list is empty
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * l->owner = owner
+ * owner = l->owner & ~HAS_WAITERS;
+ * ==> l->owner = T1
+ * }
+ * lock(l->lock)
+ * rt_mutex_unlock(l) fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * owner = l->owner & ~HAS_WAITERS;
+ * cmpxchg(l->owner, T1, NULL)
+ * ===> Success (l->owner = NULL)
+ *
+ * l->owner = owner
+ * ==> l->owner = T1
+ * }
+ *
+ * With the check for the waiter bit in place T3 on CPU2 will not
+ * overwrite. All tasks fiddling with the waiters bit are
+ * serialized by l->lock, so nothing else can modify the waiters
+ * bit. If the bit is set then nothing can change l->owner either
+ * so the simple RMW is safe. The cmpxchg() will simply fail if it
+ * happens in the middle of the RMW because the waiters bit is
+ * still set.
+ */
+ owner = READ_ONCE(*p);
+ if (owner & RT_MUTEX_HAS_WAITERS)
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+/*
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
+ */
+#ifndef CONFIG_DEBUG_RT_MUTEXES
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg_relaxed(p, owner,
+ owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
+ __releases(lock->wait_lock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ clear_rt_mutex_waiters(lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ /*
+ * If a new waiter comes in between the unlock and the cmpxchg
+ * we have two situations:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * cmpxchg(p, owner, 0) == owner
+ * mark_rt_mutex_waiters(lock);
+ * acquire(lock);
+ * or:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * mark_rt_mutex_waiters(lock);
+ *
+ * cmpxchg(p, owner, 0) != owner
+ * enqueue_waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * wake waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * acquire(lock);
+ */
+ return rt_mutex_cmpxchg_release(lock, owner, NULL);
+}
+
+#else
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
+# define rt_mutex_cmpxchg_release(l,c,n) (0)
+
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
+ __releases(lock->wait_lock)
+{
+ lock->owner = NULL;
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return true;
+}
+#endif
+
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
+static inline int
+rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio < right->prio)
+ return 1;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 1 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return dl_time_before(left->deadline, right->deadline);
+
+ return 0;
+}
+
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio != right->prio)
+ return 0;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
+
+ return 1;
+}
+
+static void
+rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &lock->waiters.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&waiter->tree_entry, parent, link);
+ rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+}
+
+static void
+rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->tree_entry))
+ return;
+
+ rb_erase_cached(&waiter->tree_entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+}
+
+static void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&waiter->pi_tree_entry, parent, link);
+ rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+}
+
+static void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ return;
+
+ rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+}
+
+static void rt_mutex_adjust_prio(struct task_struct *p)
+{
+ struct task_struct *pi_task = NULL;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
+
+ rt_mutex_setprio(p, pi_task);
+}
+
+/*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the detect argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the detect argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+{
+ /*
+ * This is just a wrapper function for the following call,
+ * because debug_rt_mutex_detect_deadlock() smells like a magic
+ * debug feature and I wanted to keep the cond function in the
+ * main source file along with the comments instead of having
+ * two of the same in the headers.
+ */
+ return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+ return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ *
+ * @task: the task owning the mutex (owner) for which a chain walk is
+ * probably needed
+ * @chwalk: do we have to carry out deadlock detection?
+ * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * things for a task that has just got its priority adjusted, and
+ * is waiting on a mutex)
+ * @next_lock: the mutex on which the owner of @orig_lock was blocked before
+ * we dropped its pi_lock. Is never dereferenced, only used for
+ * comparison to detect lock chain changes.
+ * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
+ * its priority to the mutex owner (can be NULL in the case
+ * depicted above or if the top waiter is gone away and we are
+ * actually deboosting the owner)
+ * @top_task: the current top waiter
+ *
+ * Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+ * @orig_lock if != NULL @top_task is blocked on it
+ * @next_lock Unprotected. Cannot be
+ * dereferenced. Only used for
+ * comparison.
+ * @orig_waiter if != NULL @top_task is blocked on it
+ * @top_task current, or in case of proxy
+ * locking protected by calling
+ * code
+ * again:
+ * loop_sanity_check();
+ * retry:
+ * [1] lock(task->pi_lock); [R] acquire [P]
+ * [2] waiter = task->pi_blocked_on; [P]
+ * [3] check_exit_conditions_1(); [P]
+ * [4] lock = waiter->lock; [P]
+ * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+ * unlock(task->pi_lock); release [P]
+ * goto retry;
+ * }
+ * [6] check_exit_conditions_2(); [P] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+ * [8] unlock(task->pi_lock); release [P]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+ * lock(task->pi_lock); [L] acquire [P]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12] check_exit_conditions_4(); [P] + [L]
+ * [13] unlock(task->pi_lock); release [P]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
+ */
+static int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
+{
+ struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+ struct rt_mutex_waiter *prerequeue_top_waiter;
+ int ret = 0, depth = 0;
+ struct rt_mutex *lock;
+ bool detect_deadlock;
+ bool requeue = true;
+
+ detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
+
+ /*
+ * The (de)boosting is a step by step approach with a lot of
+ * pitfalls. We want this to be preemptible and we want hold a
+ * maximum of two locks per step. So we have to check
+ * carefully whether things change under us.
+ */
+ again:
+ /*
+ * We limit the lock chain length for each invocation.
+ */
+ if (++depth > max_lock_depth) {
+ static int prev_max;
+
+ /*
+ * Print this only once. If the admin changes the limit,
+ * print a new message when reaching the limit again.
+ */
+ if (prev_max != max_lock_depth) {
+ prev_max = max_lock_depth;
+ printk(KERN_WARNING "Maximum lock depth %d reached "
+ "task: %s (%d)\n", max_lock_depth,
+ top_task->comm, task_pid_nr(top_task));
+ }
+ put_task_struct(task);
+
+ return -EDEADLK;
+ }
+
+ /*
+ * We are fully preemptible here and only hold the refcount on
+ * @task. So everything can have changed under us since the
+ * caller or our own code below (goto retry/again) dropped all
+ * locks.
+ */
+ retry:
+ /*
+ * [1] Task cannot go away as we did a get_task() before !
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+
+ /*
+ * [2] Get the waiter on which @task is blocked on.
+ */
+ waiter = task->pi_blocked_on;
+
+ /*
+ * [3] check_exit_conditions_1() protected by task->pi_lock.
+ */
+
+ /*
+ * Check whether the end of the boosting chain has been
+ * reached or the state of the chain has changed while we
+ * dropped the locks.
+ */
+ if (!waiter)
+ goto out_unlock_pi;
+
+ /*
+ * Check the orig_waiter state. After we dropped the locks,
+ * the previous owner of the lock might have released the lock.
+ */
+ if (orig_waiter && !rt_mutex_owner(orig_lock))
+ goto out_unlock_pi;
+
+ /*
+ * We dropped all locks after taking a refcount on @task, so
+ * the task might have moved on in the lock chain or even left
+ * the chain completely and blocks now on an unrelated lock or
+ * on @orig_lock.
+ *
+ * We stored the lock on which @task was blocked in @next_lock,
+ * so we can detect the chain change.
+ */
+ if (next_lock != waiter->lock)
+ goto out_unlock_pi;
+
+ /*
+ * Drop out, when the task has no waiters. Note,
+ * top_waiter can be NULL, when we are in the deboosting
+ * mode!
+ */
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task. If deadlock
+ * detection is enabled we continue, but stop the
+ * requeueing in the chain walk.
+ */
+ if (top_waiter != task_top_pi_waiter(task)) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ }
+
+ /*
+ * If the waiter priority is the same as the task priority
+ * then there is no further priority adjustment necessary. If
+ * deadlock detection is off, we stop the chain walk. If its
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
+ */
+ if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+
+ /*
+ * [4] Get the next lock
+ */
+ lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
+ */
+ if (!raw_spin_trylock(&lock->wait_lock)) {
+ raw_spin_unlock_irq(&task->pi_lock);
+ cpu_relax();
+ goto retry;
+ }
+
+ /*
+ * [6] check_exit_conditions_2() protected by task->pi_lock and
+ * lock->wait_lock.
+ *
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
+ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+ debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
+ raw_spin_unlock(&lock->wait_lock);
+ ret = -EDEADLK;
+ goto out_unlock_pi;
+ }
+
+ /*
+ * If we just follow the lock chain for deadlock detection, no
+ * need to do all the requeue operations. To avoid a truckload
+ * of conditionals around the various places below, just do the
+ * minimum chain walk checks.
+ */
+ if (!requeue) {
+ /*
+ * No requeue[7] here. Just release @task [8]
+ */
+ raw_spin_unlock(&task->pi_lock);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ * If there is no owner of the lock, end of chain.
+ */
+ if (!rt_mutex_owner(lock)) {
+ raw_spin_unlock_irq(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock(&task->pi_lock);
+
+ /*
+ * No requeue [11] here. We just do deadlock detection.
+ *
+ * [12] Store whether owner is blocked
+ * itself. Decision is made after dropping the locks
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Get the top waiter for the next iteration
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop locks */
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ /* If owner is not blocked, end of chain. */
+ if (!next_lock)
+ goto out_put_task;
+ goto again;
+ }
+
+ /*
+ * Store the current top waiter before doing the requeue
+ * operation on @lock. We need it for the boost/deboost
+ * decision below.
+ */
+ prerequeue_top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [7] Requeue the waiter in the lock waiter tree. */
+ rt_mutex_dequeue(lock, waiter);
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ *
+ * Even though pi_waiters also uses these fields, and that tree is only
+ * updated in [11], we can do this here, since we hold [L], which
+ * serializes all pi_waiters access and rb_erase() does not care about
+ * the values of the node being removed.
+ */
+ waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
+ rt_mutex_enqueue(lock, waiter);
+
+ /* [8] Release the task */
+ raw_spin_unlock(&task->pi_lock);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ *
+ * We must abort the chain walk if there is no lock owner even
+ * in the dead lock detection case, as we have nothing to
+ * follow here. This is the end of the chain we are walking.
+ */
+ if (!rt_mutex_owner(lock)) {
+ /*
+ * If the requeue [7] above changed the top waiter,
+ * then we need to wake the new top waiter up to try
+ * to get the lock.
+ */
+ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
+ wake_up_process(rt_mutex_top_waiter(lock)->task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. the owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock(&task->pi_lock);
+
+ /* [11] requeue the pi waiters if necessary */
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ /*
+ * The waiter became the new top (highest priority)
+ * waiter on the lock. Replace the previous top waiter
+ * in the owner tasks pi waiters tree with this waiter
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ rt_mutex_enqueue_pi(task, waiter);
+ rt_mutex_adjust_prio(task);
+
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+ * The waiter was the top waiter on the lock, but is
+ * no longer the top prority waiter. Replace waiter in
+ * the owner tasks pi waiters tree with the new top
+ * (highest priority) waiter and adjust the priority
+ * of the owner.
+ * The new top waiter is stored in @waiter so that
+ * @waiter == @top_waiter evaluates to true below and
+ * we continue to deboost the rest of the chain.
+ */
+ rt_mutex_dequeue_pi(task, waiter);
+ waiter = rt_mutex_top_waiter(lock);
+ rt_mutex_enqueue_pi(task, waiter);
+ rt_mutex_adjust_prio(task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+ * adjustment.
+ */
+ }
+
+ /*
+ * [12] check_exit_conditions_4() protected by task->pi_lock
+ * and lock->wait_lock. The actual decisions are made after we
+ * dropped the locks.
+ *
+ * Check whether the task which owns the current lock is pi
+ * blocked itself. If yes we store a pointer to the lock for
+ * the lock chain change detection above. After we dropped
+ * task->pi_lock next_lock cannot be dereferenced anymore.
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Store the top waiter of @lock for the end of chain walk
+ * decision below.
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop the locks */
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ /*
+ * Make the actual exit decisions [12], based on the stored
+ * values.
+ *
+ * We reached the end of the lock chain. Stop right here. No
+ * point to go back just to figure that out.
+ */
+ if (!next_lock)
+ goto out_put_task;
+
+ /*
+ * If the current waiter is not the top waiter on the lock,
+ * then we can stop the chain walk here if we are not in full
+ * deadlock detection mode.
+ */
+ if (!detect_deadlock && waiter != top_waiter)
+ goto out_put_task;
+
+ goto again;
+
+ out_unlock_pi:
+ raw_spin_unlock_irq(&task->pi_lock);
+ out_put_task:
+ put_task_struct(task);
+
+ return ret;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled
+ *
+ * @lock: The lock to be acquired.
+ * @task: The task which wants to acquire the lock
+ * @waiter: The waiter that is queued to the lock's wait tree if the
+ * callsite called task_blocked_on_lock(), otherwise NULL
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ /*
+ * Before testing whether we can acquire @lock, we set the
+ * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+ * other tasks which try to modify @lock into the slow path
+ * and they serialize on @lock->wait_lock.
+ *
+ * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+ * as explained at the top of this file if and only if:
+ *
+ * - There is a lock owner. The caller must fixup the
+ * transient state if it does a trylock or leaves the lock
+ * function due to a signal or timeout.
+ *
+ * - @task acquires the lock and there are no other
+ * waiters. This is undone in rt_mutex_set_owner(@task) at
+ * the end of this function.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ /*
+ * If @lock has an owner, give up.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * If @waiter != NULL, @task has already enqueued the waiter
+ * into @lock waiter tree. If @waiter == NULL then this is a
+ * trylock attempt.
+ */
+ if (waiter) {
+ /*
+ * If waiter is not the highest priority waiter of
+ * @lock, give up.
+ */
+ if (waiter != rt_mutex_top_waiter(lock))
+ return 0;
+
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+
+ } else {
+ /*
+ * If the lock has waiters already we check whether @task is
+ * eligible to take over the lock.
+ *
+ * If there are no other waiters, @task can acquire
+ * the lock. @task->pi_blocked_on is NULL, so it does
+ * not need to be dequeued.
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ /*
+ * If @task->prio is greater than or equal to
+ * the top waiter priority (kernel view),
+ * @task lost.
+ */
+ if (!rt_mutex_waiter_less(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
+ return 0;
+
+ /*
+ * The current top waiter stays enqueued. We
+ * don't have to change anything in the lock
+ * waiters order.
+ */
+ } else {
+ /*
+ * No waiters. Take the lock without the
+ * pi_lock dance.@task->pi_blocked_on is NULL
+ * and we have no waiters to enqueue in @task
+ * pi waiters tree.
+ */
+ goto takeit;
+ }
+ }
+
+ /*
+ * Clear @task->pi_blocked_on. Requires protection by
+ * @task->pi_lock. Redundant operation for the @waiter == NULL
+ * case, but conditionals are more expensive than a redundant
+ * store.
+ */
+ raw_spin_lock(&task->pi_lock);
+ task->pi_blocked_on = NULL;
+ /*
+ * Finish the lock acquisition. @task is the new owner. If
+ * other waiters exist we have to insert the highest priority
+ * waiter into @task->pi_waiters tree.
+ */
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+ raw_spin_unlock(&task->pi_lock);
+
+takeit:
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock);
+
+ /*
+ * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+ * are still waiters or clears it.
+ */
+ rt_mutex_set_owner(lock, task);
+
+ return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held and interrupts disabled
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ enum rtmutex_chainwalk chwalk)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *top_waiter = waiter;
+ struct rt_mutex *next_lock;
+ int chain_walk = 0, res;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (owner == task)
+ return -EDEADLK;
+
+ raw_spin_lock(&task->pi_lock);
+ waiter->task = task;
+ waiter->lock = lock;
+ waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ rt_mutex_enqueue(lock, waiter);
+
+ task->pi_blocked_on = waiter;
+
+ raw_spin_unlock(&task->pi_lock);
+
+ if (!owner)
+ return 0;
+
+ raw_spin_lock(&owner->pi_lock);
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ rt_mutex_dequeue_pi(owner, top_waiter);
+ rt_mutex_enqueue_pi(owner, waiter);
+
+ rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
+ chain_walk = 1;
+ }
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+
+ raw_spin_unlock(&owner->pi_lock);
+ /*
+ * Even if full deadlock detection is on, if the owner is not
+ * blocked itself, we can avoid finding this out in the chain
+ * walk.
+ */
+ if (!chain_walk || !next_lock)
+ return 0;
+
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
+ next_lock, waiter, task);
+
+ raw_spin_lock_irq(&lock->wait_lock);
+
+ return res;
+}
+
+/*
+ * Remove the top waiter from the current tasks pi waiter tree and
+ * queue it up.
+ *
+ * Called with lock->wait_lock held and interrupts disabled.
+ */
+static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+ struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+
+ raw_spin_lock(&current->pi_lock);
+
+ waiter = rt_mutex_top_waiter(lock);
+
+ /*
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
+ */
+ rt_mutex_dequeue_pi(current, waiter);
+ rt_mutex_adjust_prio(current);
+
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+ * queued on the lock until it gets the lock, this lock
+ * obviously has waiters. Just set the bit here and this has
+ * the added benefit of forcing all new tasks into the
+ * slow path making sure no task of lower priority than
+ * the top waiter can steal this lock.
+ */
+ lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
+
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
+ wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(&current->pi_lock);
+}
+
+/*
+ * Remove a waiter from a lock and give up
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
+ * have just failed to try_to_take_rt_mutex().
+ */
+static void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex *next_lock;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ raw_spin_lock(&current->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock(&current->pi_lock);
+
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
+ return;
+
+ raw_spin_lock(&owner->pi_lock);
+
+ rt_mutex_dequeue_pi(owner, waiter);
+
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
+
+ rt_mutex_adjust_prio(owner);
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+
+ raw_spin_unlock(&owner->pi_lock);
+
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
+ return;
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
+
+ raw_spin_lock_irq(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled
+ */
+static int __sched
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret = 0;
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (likely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ schedule();
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
+
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
+{
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ /*
+ * Yell lowdly and stop the task right here.
+ */
+ rt_mutex_print_deadlock(w);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk)
+{
+ struct rt_mutex_waiter waiter;
+ unsigned long flags;
+ int ret = 0;
+
+ rt_mutex_init_waiter(&waiter);
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout))
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+
+ if (likely(!ret))
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
+
+ if (unlikely(ret)) {
+ __set_current_state(TASK_RUNNING);
+ remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ }
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout))
+ hrtimer_cancel(&timeout->timer);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ ret = __rt_mutex_slowtrylock(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
+ */
+static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
+{
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock, flags) == true)
+ return false;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ }
+
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
+ */
+ mark_wakeup_next_waiter(wake_q, lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return true; /* call rt_mutex_postunlock() */
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+{
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+
+ return slowfn(lock, state, timeout, chwalk);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return slowfn(lock);
+}
+
+/*
+ * Performs the wakeup of the the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
+ preempt_enable();
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ bool (*slowfn)(struct rt_mutex *lock,
+ struct wake_q_head *wqh))
+{
+ DEFINE_WAKE_Q(wake_q);
+
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
+}
+
+static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock(lock, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+#endif
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock(lock, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ int ret;
+
+ might_sleep();
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/*
+ * Futex variant, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEDOUT when the timeout expired
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+{
+ int ret;
+
+ might_sleep();
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_MIN_CHAINWALK,
+ rt_mutex_slowlock);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
+ return 0;
+
+ ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/**
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wake_q, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
+}
+
+/**
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ lock->owner = NULL;
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT_CACHED;
+
+ if (name && key)
+ debug_rt_mutex_init(lock, name, key);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ __rt_mutex_init(lock, NULL, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This merrily cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ return NULL;
+
+ return rt_mutex_top_waiter(lock)->task;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
new file mode 100644
index 000000000..732f96abf
--- /dev/null
+++ b/kernel/locking/rtmutex.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define rt_mutex_deadlock_check(l) (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n, k) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ WARN(1, "rtmutex deadlock detected\n");
+}
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+ enum rtmutex_chainwalk walk)
+{
+ return walk == RT_MUTEX_FULL_CHAINWALK;
+}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
new file mode 100644
index 000000000..ca6fb4890
--- /dev/null
+++ b/kernel/locking/rtmutex_common.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+ * @tree_entry: pi node to enqueue into the mutex waiters tree
+ * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct rb_node tree_entry;
+ struct rb_node pi_tree_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ struct pid *deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+ int prio;
+ u64 deadline;
+};
+
+/*
+ * Various helpers to access the waiters-tree:
+ */
+
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
+
+ if (leftmost) {
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ BUG_ON(w->lock != lock);
+ }
+ return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return rb_entry(p->pi_waiters.rb_leftmost,
+ struct rt_mutex_waiter, pi_tree_entry);
+}
+
+#else
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ return NULL;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_HAS_WAITERS 1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+ return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+/*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
+ * no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
+ * walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+ RT_MUTEX_MIN_CHAINWALK,
+ RT_MUTEX_FULL_CHAINWALK,
+};
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+#endif
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 000000000..a7ffb2a96
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0
+/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
+ * generic spinlock implementation
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ */
+#include <linux/rwsem.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+#include <linux/export.h>
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ int ret = 1;
+ unsigned long flags;
+
+ if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
+ ret = (sem->count != 0);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rwsem_is_locked);
+
+/*
+ * initialise the semaphore
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ sem->count = 0;
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here, then:
+ * - the 'active count' _reached_ zero
+ * - the 'waiting count' is non-zero
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only woken if wakewrite is non-zero
+ */
+static inline struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
+{
+ struct rwsem_waiter *waiter;
+ struct task_struct *tsk;
+ int woken;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wakewrite)
+ /* Wake up a writer. Note that we do not grant it the
+ * lock - it will have to acquire it when it runs. */
+ wake_up_process(waiter->task);
+ goto out;
+ }
+
+ /* grant an infinite number of read locks to the front of the queue */
+ woken = 0;
+ do {
+ struct list_head *next = waiter->list.next;
+
+ list_del(&waiter->list);
+ tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
+ smp_mb();
+ waiter->task = NULL;
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ woken++;
+ if (next == &sem->wait_list)
+ break;
+ waiter = list_entry(next, struct rwsem_waiter, list);
+ } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+
+ sem->count += woken;
+
+ out:
+ return sem;
+}
+
+/*
+ * wake a single writer
+ */
+static inline struct rw_semaphore *
+__rwsem_wake_one_writer(struct rw_semaphore *sem)
+{
+ struct rwsem_waiter *waiter;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ wake_up_process(waiter->task);
+
+ return sem;
+}
+
+/*
+ * get a read lock on the semaphore
+ */
+int __sched __down_read_common(struct rw_semaphore *sem, int state)
+{
+ struct rwsem_waiter waiter;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ goto out;
+ }
+
+ /* set up my own style of waitqueue */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ get_task_struct(current);
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* wait to be given the lock */
+ for (;;) {
+ if (!waiter.task)
+ break;
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ schedule();
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ out:
+ return 0;
+
+out_nolock:
+ /*
+ * We didn't take the lock, so that there is a writer, which
+ * is owner or the first waiter of the sem. If it's a waiter,
+ * it will be woken by current owner. Not need to wake anybody.
+ */
+ list_del(&waiter.list);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ return -EINTR;
+}
+
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int __down_read_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * get a write lock on the semaphore
+ */
+int __sched __down_write_common(struct rw_semaphore *sem, int state)
+{
+ struct rwsem_waiter waiter;
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* set up my own style of waitqueue */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* wait for someone to release the lock */
+ for (;;) {
+ /*
+ * That is the key to support write lock stealing: allows the
+ * task already on CPU to get the lock soon rather than put
+ * itself into sleep and waiting for system woke it or someone
+ * else in the head of the wait list up.
+ */
+ if (sem->count == 0)
+ break;
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ schedule();
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ }
+ /* got the lock */
+ sem->count = -1;
+ list_del(&waiter.list);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+
+out_nolock:
+ list_del(&waiter.list);
+ if (!list_empty(&sem->wait_list) && sem->count >= 0)
+ __rwsem_do_wake(sem, 0);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return -EINTR;
+}
+
+void __sched __down_write(struct rw_semaphore *sem)
+{
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int __down_write_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count == 0) {
+ /* got the lock */
+ sem->count = -1;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * release a read lock on the semaphore
+ */
+void __up_read(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
+ sem = __rwsem_wake_one_writer(sem);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * release a write lock on the semaphore
+ */
+void __up_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 0;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 1);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - just wake up any readers at the front of the queue
+ */
+void __downgrade_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 1;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 0);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 000000000..e41e4b4b5
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,730 @@
+// SPDX-License-Identifier: GPL-2.0
+/* rwsem.c: R/W semaphores: contention handling functions
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ */
+#include <linux/rwsem.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
+#include <linux/osq_lock.h>
+
+#include "rwsem.h"
+
+/*
+ * Guide to the rw_semaphore's count field for common values.
+ * (32-bit case illustrated, similar for 64-bit)
+ *
+ * 0x0000000X (1) X readers active or attempting lock, no writer waiting
+ * X = #active_readers + #readers attempting to lock
+ * (X*ACTIVE_BIAS)
+ *
+ * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
+ * attempting to read lock or write lock.
+ *
+ * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
+ * X = #active readers + # readers attempting lock
+ * (X*ACTIVE_BIAS + WAITING_BIAS)
+ * (2) 1 writer attempting lock, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ * (3) 1 writer active, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
+ * (WAITING_BIAS + ACTIVE_BIAS)
+ * (2) 1 writer active or attempting lock, no waiters for lock
+ * (ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0000 (1) There are writers or readers queued but none active
+ * or in the process of attempting lock.
+ * (WAITING_BIAS)
+ * Note: writer can attempt to steal lock for this count by adding
+ * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
+ *
+ * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
+ * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
+ *
+ * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
+ * the count becomes more than 0 for successful lock acquisition,
+ * i.e. the case where there are only readers or nobody has lock.
+ * (1st and 2nd case above).
+ *
+ * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
+ * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
+ * acquisition (i.e. nobody else has lock or attempts lock). If
+ * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
+ * are only waiters but none active (5th case above), and attempt to
+ * steal the lock.
+ *
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ sem->owner = NULL;
+ osq_lock_init(&sem->osq);
+#endif
+}
+
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then:
+ * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
+ * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ */
+static void __rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+ struct list_head wlist;
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ }
+
+ return;
+ }
+
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ adjustment = RWSEM_ACTIVE_READ_BIAS;
+ try_reader_grant:
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
+ /*
+ * If the count is still less than RWSEM_WAITING_BIAS
+ * after removing the adjustment, it is assumed that
+ * a writer has stolen the lock. We have to undo our
+ * reader grant.
+ */
+ if (atomic_long_add_return(-adjustment, &sem->count) <
+ RWSEM_WAITING_BIAS)
+ return;
+
+ /* Last active locker left. Retry waking readers. */
+ goto try_reader_grant;
+ }
+ /*
+ * It is not really necessary to set it to reader-owned here,
+ * but it gives the spinners an early indication that the
+ * readers now have the lock.
+ */
+ rwsem_set_reader_owned(sem);
+ }
+
+ /*
+ * Grant an infinite number of read locks to the readers at the front
+ * of the queue. We know that woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
+ *
+ * We have to do wakeup in 2 passes to prevent the possibility that
+ * the reader count may be decremented before it is incremented. It
+ * is because the to-be-woken waiter may not have slept yet. So it
+ * may see waiter->task got cleared, finish its critical section and
+ * do an unlock before the reader count increment.
+ *
+ * 1) Collect the read-waiters in a separate list, count them and
+ * fully increment the reader count in rwsem.
+ * 2) For each waiters in the new list, clear waiter->task and
+ * put them into wake_q to be woken up later.
+ */
+ list_for_each_entry(waiter, &sem->wait_list, list) {
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+ break;
+
+ woken++;
+ }
+ list_cut_before(&wlist, &sem->wait_list, &waiter->list);
+
+ adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+ if (list_empty(&sem->wait_list)) {
+ /* hit end of list above */
+ adjustment -= RWSEM_WAITING_BIAS;
+ }
+
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
+
+ /* 2nd pass */
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_failed() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add(wake_q, tsk);
+ /* wake_q_add() already take the task ref */
+ put_task_struct(tsk);
+ }
+}
+
+/*
+ * Wait for the read lock to be granted
+ */
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
+{
+ long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list))
+ adjustment += RWSEM_WAITING_BIAS;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = atomic_long_add_return(adjustment, &sem->count);
+
+ /*
+ * If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (count == RWSEM_WAITING_BIAS ||
+ (count > RWSEM_WAITING_BIAS &&
+ adjustment != -RWSEM_ACTIVE_READ_BIAS))
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_current_state(state);
+ if (!waiter.task)
+ break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed);
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ /*
+ * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
+ */
+ if (count != RWSEM_WAITING_BIAS)
+ return false;
+
+ /*
+ * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+ * are other tasks on the wait list, we need to add on WAITING_BIAS.
+ */
+ count = list_is_singular(&sem->wait_list) ?
+ RWSEM_ACTIVE_WRITE_BIAS :
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+ if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+ == RWSEM_WAITING_BIAS) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = atomic_long_read(&sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = atomic_long_cmpxchg_acquire(&sem->count, count,
+ count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ count = old;
+ }
+}
+
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+ /*
+ * As lock holder preemption issue, we both skip spinning if
+ * task is not on cpu or its cpu is preempted
+ */
+ return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool ret = true;
+
+ BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = READ_ONCE(sem->owner);
+ if (owner) {
+ ret = is_rwsem_owner_spinnable(owner) &&
+ owner_on_cpu(owner);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ if (!is_rwsem_owner_spinnable(owner))
+ return false;
+
+ rcu_read_lock();
+ while (owner && (READ_ONCE(sem->owner) == owner)) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /*
+ * abort spinning when need_resched or owner is not running or
+ * owner's cpu is preempted.
+ */
+ if (need_resched() || !owner_on_cpu(owner)) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * If there is a new owner or the owner is not set, we continue
+ * spinning.
+ */
+ return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock as we can't determine if they are
+ * actively running or not.
+ */
+ while (rwsem_spin_on_owner(sem)) {
+ /*
+ * Try to acquire the lock
+ */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!sem->owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return osq_is_locked(&sem->osq);
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+static inline struct rw_semaphore *
+__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
+{
+ long count;
+ bool waiting = true; /* any queued threads before us */
+ struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
+ DEFINE_WAKE_Q(wake_q);
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
+ if (list_empty(&sem->wait_list))
+ waiting = false;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ if (waiting) {
+ count = atomic_long_read(&sem->count);
+
+ /*
+ * If there were already threads queued before us and there are
+ * no active writers, the lock must be read owned; so we try to
+ * wake any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS) {
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ /*
+ * The wakeup is normally called _after_ the wait_lock
+ * is released, but given that we are proactively waking
+ * readers we can deal with the wake_q overhead as it is
+ * similar to releasing and taking the wait_lock again
+ * for attempting rwsem_try_write_lock().
+ */
+ wake_up_q(&wake_q);
+
+ /*
+ * Reinitialize wake_q after use.
+ */
+ wake_q_init(&wake_q);
+ }
+
+ } else
+ count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
+
+ /* wait until we successfully acquire the lock */
+ set_current_state(state);
+ while (true) {
+ if (rwsem_try_write_lock(count, sem))
+ break;
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* Block until there are no active lockers. */
+ do {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ schedule();
+ set_current_state(state);
+ } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ __set_current_state(TASK_RUNNING);
+ list_del(&waiter.list);
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ else
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed);
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed_killable);
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+__visible
+struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ /*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
+ * If a spinner is present, it is not necessary to do the wakeup.
+ * Try to do wakeup only if the trylock succeeds to minimize
+ * spinlock contention which may introduce too much delay in the
+ * unlock operation.
+ *
+ * spinning writer up_write/up_read caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * MB RMB
+ * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+ *
+ * Here, it is important to make sure that there won't be a missed
+ * wakeup while the rwsem is free and the only spinning writer goes
+ * to sleep without taking the rwsem. Even when the spinning writer
+ * is just going to break out of the waiting loop, it will still do
+ * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+ * rwsem_has_spinner() is true, it will guarantee at least one
+ * trylock attempt on the rwsem later on.
+ */
+ if (rwsem_has_spinner(sem)) {
+ /*
+ * The smp_rmb() here is to make sure that the spinner
+ * state is consulted before reading the wait_lock.
+ */
+ smp_rmb();
+ if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+ return sem;
+ goto locked;
+ }
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
+
+ if (!list_empty(&sem->wait_list))
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_wake);
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+__visible
+struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
new file mode 100644
index 000000000..776308d2f
--- /dev/null
+++ b/kernel/locking/rwsem.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/export.h>
+#include <linux/rwsem.h>
+#include <linux/atomic.h>
+
+#include "rwsem.h"
+
+/*
+ * lock for reading
+ */
+void __sched down_read(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
+}
+
+EXPORT_SYMBOL(down_read);
+
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_read_trylock(sem);
+
+ if (ret == 1) {
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_reader_owned(sem);
+ }
+ return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing
+ */
+void __sched down_write(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * lock for writing
+ */
+int __sched down_write_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_write_trylock(sem);
+
+ if (ret == 1) {
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
+ return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock
+ */
+void up_read(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
+
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock
+ */
+void up_write(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
+
+ rwsem_clear_owner(sem);
+ __up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+ lock_downgrade(&sem->dep_map, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
+
+ rwsem_set_reader_owned(sem);
+ __downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+ might_sleep();
+ rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+ rwsem_set_reader_owned(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000..b9d0e72aa
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 4 states:
+ * 1) 0
+ * - lock is free or the owner hasn't set the field yet
+ * 2) RWSEM_READER_OWNED
+ * - lock is currently or previously owned by readers (lock is free
+ * or not set by owner yet)
+ * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ * - lock is owned by an anonymous writer, so spinning on the lock
+ * owner should be disabled.
+ * 4) Other non-zero value
+ * - a writer owns the lock and other writers can spin on the lock owner.
+ */
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
+#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c)
+#endif
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ WRITE_ONCE(sem->owner, current);
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ WRITE_ONCE(sem->owner, NULL);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * We check the owner value first to make sure that we will only
+ * do a write to the rwsem cacheline when it is really necessary
+ * to minimize cacheline contention.
+ */
+ if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
+ WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+}
+
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
+{
+ return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
+}
+
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
+{
+ return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
+}
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
new file mode 100644
index 000000000..561acdd39
--- /dev/null
+++ b/kernel/locking/semaphore.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks which enforce
+ * rules which allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/ftrace.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_interruptible(struct semaphore *sem);
+static noinline int __down_killable(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated, please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_interruptible - acquire the semaphore unless interrupted
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a signal, this function will return -EINTR.
+ * If the semaphore is successfully acquired, this function returns 0.
+ */
+int down_interruptible(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_interruptible(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/**
+ * down_killable - acquire the semaphore unless killed
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a fatal signal, this function will return
+ * -EINTR. If the semaphore is successfully acquired, this function returns
+ * 0.
+ */
+int down_killable(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_killable(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_killable);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @timeout: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long timeout)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, timeout);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ bool up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = current;
+ waiter.up = false;
+
+ for (;;) {
+ if (signal_pending_state(state, current))
+ goto interrupted;
+ if (unlikely(timeout <= 0))
+ goto timed_out;
+ __set_current_state(state);
+ raw_spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ raw_spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -ETIME;
+
+ interrupted:
+ list_del(&waiter.list);
+ return -EINTR;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_interruptible(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_killable(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = true;
+ wake_up_process(waiter->task);
+}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
new file mode 100644
index 000000000..936f3d14d
--- /dev/null
+++ b/kernel/locking/spinlock.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (2004) Linus Torvalds
+ *
+ * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame contact the architecture maintainers.
+ */
+
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/export.h>
+
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+/*
+ * The __lock_function inlines are taken from
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
+ */
+#else
+
+/*
+ * Some architectures can relax in favour of the CPU owning the lock.
+ */
+#ifndef arch_read_relax
+# define arch_read_relax(l) cpu_relax()
+#endif
+#ifndef arch_write_relax
+# define arch_write_relax(l) cpu_relax()
+#endif
+#ifndef arch_spin_relax
+# define arch_spin_relax(l) cpu_relax()
+#endif
+
+/*
+ * We build the __lock_function inlines here. They are too large for
+ * inlining all over the place, but here is only one user per function
+ * which embedds them into the calling _lock_function below.
+ *
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ */
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+{ \
+ for (;;) { \
+ preempt_disable(); \
+ if (likely(do_raw_##op##_trylock(lock))) \
+ break; \
+ preempt_enable(); \
+ \
+ arch_##op##_relax(&lock->raw_lock); \
+ } \
+} \
+ \
+unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ for (;;) { \
+ preempt_disable(); \
+ local_irq_save(flags); \
+ if (likely(do_raw_##op##_trylock(lock))) \
+ break; \
+ local_irq_restore(flags); \
+ preempt_enable(); \
+ \
+ arch_##op##_relax(&lock->raw_lock); \
+ } \
+ \
+ return flags; \
+} \
+ \
+void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+{ \
+ _raw_##op##_lock_irqsave(lock); \
+} \
+ \
+void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ /* */ \
+ /* Careful: we must exclude softirqs too, hence the */ \
+ /* irq-disabling. We use the generic preemption-aware */ \
+ /* function: */ \
+ /**/ \
+ flags = _raw_##op##_lock_irqsave(lock); \
+ local_bh_disable(); \
+ local_irq_restore(flags); \
+} \
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ * __[spin|read|write]_lock()
+ * __[spin|read|write]_lock_irq()
+ * __[spin|read|write]_lock_irqsave()
+ * __[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(read, rwlock);
+BUILD_LOCK_OPS(write, rwlock);
+
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __raw_spin_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+{
+ return __raw_spin_trylock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_trylock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+{
+ __raw_spin_lock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+{
+ return __raw_spin_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+{
+ __raw_spin_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_BH
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+{
+ __raw_spin_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh);
+#endif
+
+#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+{
+ __raw_spin_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_READ_TRYLOCK
+int __lockfunc _raw_read_trylock(rwlock_t *lock)
+{
+ return __raw_read_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_read_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK
+void __lockfunc _raw_read_lock(rwlock_t *lock)
+{
+ __raw_read_lock(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+{
+ return __raw_read_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_IRQ
+void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+{
+ __raw_read_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_BH
+void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+{
+ __raw_read_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK
+void __lockfunc _raw_read_unlock(rwlock_t *lock)
+{
+ __raw_read_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
+void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ __raw_read_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
+void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+{
+ __raw_read_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_BH
+void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+{
+ __raw_read_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_TRYLOCK
+int __lockfunc _raw_write_trylock(rwlock_t *lock)
+{
+ return __raw_write_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_write_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK
+void __lockfunc _raw_write_lock(rwlock_t *lock)
+{
+ __raw_write_lock(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+{
+ return __raw_write_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
+void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+{
+ __raw_write_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_BH
+void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+{
+ __raw_write_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK
+void __lockfunc _raw_write_unlock(rwlock_t *lock)
+{
+ __raw_write_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
+void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ __raw_write_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
+void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+{
+ __raw_write_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
+void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+{
+ __raw_write_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_bh);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nested);
+
+unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
+ int subclass)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
+ do_raw_spin_lock_flags, &flags);
+ return flags;
+}
+EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
+
+void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ preempt_disable();
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
+
+#endif
+
+notrace int in_lock_functions(unsigned long addr)
+{
+ /* Linker adds these: start and end of __lockfunc functions */
+ extern char __lock_text_start[], __lock_text_end[];
+
+ return addr >= (unsigned long)__lock_text_start
+ && addr < (unsigned long)__lock_text_end;
+}
+EXPORT_SYMBOL(in_lock_functions);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 000000000..03595c29c
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ *
+ * This file contains the spinlock/rwlock implementations for
+ * DEBUG_SPINLOCK.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/nmi.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+
+void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ lock->magic = SPINLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__raw_spin_lock_init);
+
+void __rwlock_init(rwlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
+ lock->magic = RWLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__rwlock_init);
+
+static void spin_dump(raw_spinlock_t *lock, const char *msg)
+{
+ struct task_struct *owner = READ_ONCE(lock->owner);
+
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
+ printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
+ msg, raw_smp_processor_id(),
+ current->comm, task_pid_nr(current));
+ printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
+ ".owner_cpu: %d\n",
+ lock, READ_ONCE(lock->magic),
+ owner ? owner->comm : "<none>",
+ owner ? task_pid_nr(owner) : -1,
+ READ_ONCE(lock->owner_cpu));
+ dump_stack();
+}
+
+static void spin_bug(raw_spinlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ spin_dump(lock, msg);
+}
+
+#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
+
+static inline void
+debug_spin_lock_before(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_spin_lock_after(raw_spinlock_t *lock)
+{
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
+}
+
+static inline void debug_spin_unlock(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
+ SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
+ SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
+}
+
+/*
+ * We are now relying on the NMI watchdog to detect lockup instead of doing
+ * the detection here with an unfair lock which can cause problem of its own.
+ */
+void do_raw_spin_lock(raw_spinlock_t *lock)
+{
+ debug_spin_lock_before(lock);
+ arch_spin_lock(&lock->raw_lock);
+ debug_spin_lock_after(lock);
+}
+
+int do_raw_spin_trylock(raw_spinlock_t *lock)
+{
+ int ret = arch_spin_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_spin_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_spin_unlock(raw_spinlock_t *lock)
+{
+ debug_spin_unlock(lock);
+ arch_spin_unlock(&lock->raw_lock);
+}
+
+static void rwlock_bug(rwlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
+ msg, raw_smp_processor_id(), current->comm,
+ task_pid_nr(current), lock);
+ dump_stack();
+}
+
+#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
+
+void do_raw_read_lock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_lock(&lock->raw_lock);
+}
+
+int do_raw_read_trylock(rwlock_t *lock)
+{
+ int ret = arch_read_trylock(&lock->raw_lock);
+
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_read_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_unlock(&lock->raw_lock);
+}
+
+static inline void debug_write_lock_before(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
+ RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_write_lock_after(rwlock_t *lock)
+{
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
+}
+
+static inline void debug_write_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
+ RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
+}
+
+void do_raw_write_lock(rwlock_t *lock)
+{
+ debug_write_lock_before(lock);
+ arch_write_lock(&lock->raw_lock);
+ debug_write_lock_after(lock);
+}
+
+int do_raw_write_trylock(rwlock_t *lock)
+{
+ int ret = arch_write_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_write_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_write_unlock(rwlock_t *lock)
+{
+ debug_write_unlock(lock);
+ arch_write_unlock(&lock->raw_lock);
+}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
new file mode 100644
index 000000000..65a3b7e55
--- /dev/null
+++ b/kernel/locking/test-ww_mutex.c
@@ -0,0 +1,647 @@
+/*
+ * Module-based API test facility for ww_mutexes
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ */
+
+#include <linux/kernel.h>
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/ww_mutex.h>
+
+static DEFINE_WD_CLASS(ww_class);
+struct workqueue_struct *wq;
+
+struct test_mutex {
+ struct work_struct work;
+ struct ww_mutex mutex;
+ struct completion ready, go, done;
+ unsigned int flags;
+};
+
+#define TEST_MTX_SPIN BIT(0)
+#define TEST_MTX_TRY BIT(1)
+#define TEST_MTX_CTX BIT(2)
+#define __TEST_MTX_LAST BIT(3)
+
+static void test_mutex_work(struct work_struct *work)
+{
+ struct test_mutex *mtx = container_of(work, typeof(*mtx), work);
+
+ complete(&mtx->ready);
+ wait_for_completion(&mtx->go);
+
+ if (mtx->flags & TEST_MTX_TRY) {
+ while (!ww_mutex_trylock(&mtx->mutex))
+ cond_resched();
+ } else {
+ ww_mutex_lock(&mtx->mutex, NULL);
+ }
+ complete(&mtx->done);
+ ww_mutex_unlock(&mtx->mutex);
+}
+
+static int __test_mutex(unsigned int flags)
+{
+#define TIMEOUT (HZ / 16)
+ struct test_mutex mtx;
+ struct ww_acquire_ctx ctx;
+ int ret;
+
+ ww_mutex_init(&mtx.mutex, &ww_class);
+ ww_acquire_init(&ctx, &ww_class);
+
+ INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
+ init_completion(&mtx.ready);
+ init_completion(&mtx.go);
+ init_completion(&mtx.done);
+ mtx.flags = flags;
+
+ schedule_work(&mtx.work);
+
+ wait_for_completion(&mtx.ready);
+ ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL);
+ complete(&mtx.go);
+ if (flags & TEST_MTX_SPIN) {
+ unsigned long timeout = jiffies + TIMEOUT;
+
+ ret = 0;
+ do {
+ if (completion_done(&mtx.done)) {
+ ret = -EINVAL;
+ break;
+ }
+ cond_resched();
+ } while (time_before(jiffies, timeout));
+ } else {
+ ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
+ }
+ ww_mutex_unlock(&mtx.mutex);
+ ww_acquire_fini(&ctx);
+
+ if (ret) {
+ pr_err("%s(flags=%x): mutual exclusion failure\n",
+ __func__, flags);
+ ret = -EINVAL;
+ }
+
+ flush_work(&mtx.work);
+ destroy_work_on_stack(&mtx.work);
+ return ret;
+#undef TIMEOUT
+}
+
+static int test_mutex(void)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < __TEST_MTX_LAST; i++) {
+ ret = __test_mutex(i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int test_aa(void)
+{
+ struct ww_mutex mutex;
+ struct ww_acquire_ctx ctx;
+ int ret;
+
+ ww_mutex_init(&mutex, &ww_class);
+ ww_acquire_init(&ctx, &ww_class);
+
+ ww_mutex_lock(&mutex, &ctx);
+
+ if (ww_mutex_trylock(&mutex)) {
+ pr_err("%s: trylocked itself!\n", __func__);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret != -EALREADY) {
+ pr_err("%s: missed deadlock for recursing, ret=%d\n",
+ __func__, ret);
+ if (!ret)
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ww_mutex_unlock(&mutex);
+ ww_acquire_fini(&ctx);
+ return ret;
+}
+
+struct test_abba {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex b_mutex;
+ struct completion a_ready;
+ struct completion b_ready;
+ bool resolve;
+ int result;
+};
+
+static void test_abba_work(struct work_struct *work)
+{
+ struct test_abba *abba = container_of(work, typeof(*abba), work);
+ struct ww_acquire_ctx ctx;
+ int err;
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+
+ complete(&abba->b_ready);
+ wait_for_completion(&abba->a_ready);
+
+ err = ww_mutex_lock(&abba->a_mutex, &ctx);
+ if (abba->resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_mutex_lock_slow(&abba->a_mutex, &ctx);
+ err = ww_mutex_lock(&abba->b_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba->a_mutex);
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_acquire_fini(&ctx);
+
+ abba->result = err;
+}
+
+static int test_abba(bool resolve)
+{
+ struct test_abba abba;
+ struct ww_acquire_ctx ctx;
+ int err, ret;
+
+ ww_mutex_init(&abba.a_mutex, &ww_class);
+ ww_mutex_init(&abba.b_mutex, &ww_class);
+ INIT_WORK_ONSTACK(&abba.work, test_abba_work);
+ init_completion(&abba.a_ready);
+ init_completion(&abba.b_ready);
+ abba.resolve = resolve;
+
+ schedule_work(&abba.work);
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+
+ complete(&abba.a_ready);
+ wait_for_completion(&abba.b_ready);
+
+ err = ww_mutex_lock(&abba.b_mutex, &ctx);
+ if (resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_mutex_lock_slow(&abba.b_mutex, &ctx);
+ err = ww_mutex_lock(&abba.a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba.b_mutex);
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_acquire_fini(&ctx);
+
+ flush_work(&abba.work);
+ destroy_work_on_stack(&abba.work);
+
+ ret = 0;
+ if (resolve) {
+ if (err || abba.result) {
+ pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ } else {
+ if (err != -EDEADLK && abba.result != -EDEADLK) {
+ pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
+struct test_cycle {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex *b_mutex;
+ struct completion *a_signal;
+ struct completion b_signal;
+ int result;
+};
+
+static void test_cycle_work(struct work_struct *work)
+{
+ struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
+ struct ww_acquire_ctx ctx;
+ int err, erra = 0;
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&cycle->a_mutex, &ctx);
+
+ complete(cycle->a_signal);
+ wait_for_completion(&cycle->b_signal);
+
+ err = ww_mutex_lock(cycle->b_mutex, &ctx);
+ if (err == -EDEADLK) {
+ err = 0;
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_mutex_lock_slow(cycle->b_mutex, &ctx);
+ erra = ww_mutex_lock(&cycle->a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(cycle->b_mutex);
+ if (!erra)
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_acquire_fini(&ctx);
+
+ cycle->result = err ?: erra;
+}
+
+static int __test_cycle(unsigned int nthreads)
+{
+ struct test_cycle *cycles;
+ unsigned int n, last = nthreads - 1;
+ int ret;
+
+ cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL);
+ if (!cycles)
+ return -ENOMEM;
+
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ ww_mutex_init(&cycle->a_mutex, &ww_class);
+ if (n == last)
+ cycle->b_mutex = &cycles[0].a_mutex;
+ else
+ cycle->b_mutex = &cycles[n + 1].a_mutex;
+
+ if (n == 0)
+ cycle->a_signal = &cycles[last].b_signal;
+ else
+ cycle->a_signal = &cycles[n - 1].b_signal;
+ init_completion(&cycle->b_signal);
+
+ INIT_WORK(&cycle->work, test_cycle_work);
+ cycle->result = 0;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ queue_work(wq, &cycles[n].work);
+
+ flush_workqueue(wq);
+
+ ret = 0;
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ if (!cycle->result)
+ continue;
+
+ pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n",
+ n, nthreads, cycle->result);
+ ret = -EINVAL;
+ break;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ ww_mutex_destroy(&cycles[n].a_mutex);
+ kfree(cycles);
+ return ret;
+}
+
+static int test_cycle(unsigned int ncpus)
+{
+ unsigned int n;
+ int ret;
+
+ for (n = 2; n <= ncpus + 1; n++) {
+ ret = __test_cycle(n);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct stress {
+ struct work_struct work;
+ struct ww_mutex *locks;
+ unsigned long timeout;
+ int nlocks;
+};
+
+static int *get_random_order(int count)
+{
+ int *order;
+ int n, r, tmp;
+
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
+ if (!order)
+ return order;
+
+ for (n = 0; n < count; n++)
+ order[n] = n;
+
+ for (n = count - 1; n > 1; n--) {
+ r = get_random_int() % (n + 1);
+ if (r != n) {
+ tmp = order[n];
+ order[n] = order[r];
+ order[r] = tmp;
+ }
+ }
+
+ return order;
+}
+
+static void dummy_load(struct stress *stress)
+{
+ usleep_range(1000, 2000);
+}
+
+static void stress_inorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *locks = stress->locks;
+ struct ww_acquire_ctx ctx;
+ int *order;
+
+ order = get_random_order(nlocks);
+ if (!order)
+ return;
+
+ do {
+ int contended = -1;
+ int n, err;
+
+ ww_acquire_init(&ctx, &ww_class);
+retry:
+ err = 0;
+ for (n = 0; n < nlocks; n++) {
+ if (n == contended)
+ continue;
+
+ err = ww_mutex_lock(&locks[order[n]], &ctx);
+ if (err < 0)
+ break;
+ }
+ if (!err)
+ dummy_load(stress);
+
+ if (contended > n)
+ ww_mutex_unlock(&locks[order[contended]]);
+ contended = n;
+ while (n--)
+ ww_mutex_unlock(&locks[order[n]]);
+
+ if (err == -EDEADLK) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
+
+ if (err) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
+
+ kfree(order);
+ kfree(stress);
+}
+
+struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+};
+
+static void stress_reorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ LIST_HEAD(locks);
+ struct ww_acquire_ctx ctx;
+ struct reorder_lock *ll, *ln;
+ int *order;
+ int n, err;
+
+ order = get_random_order(stress->nlocks);
+ if (!order)
+ return;
+
+ for (n = 0; n < stress->nlocks; n++) {
+ ll = kmalloc(sizeof(*ll), GFP_KERNEL);
+ if (!ll)
+ goto out;
+
+ ll->lock = &stress->locks[order[n]];
+ list_add(&ll->link, &locks);
+ }
+ kfree(order);
+ order = NULL;
+
+ do {
+ ww_acquire_init(&ctx, &ww_class);
+
+ list_for_each_entry(ll, &locks, link) {
+ err = ww_mutex_lock(ll->lock, &ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &locks, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+
+ ww_mutex_lock_slow(ll->lock, &ctx);
+ list_move(&ll->link, &locks); /* restarts iteration */
+ }
+
+ dummy_load(stress);
+ list_for_each_entry(ll, &locks, link)
+ ww_mutex_unlock(ll->lock);
+
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
+
+out:
+ list_for_each_entry_safe(ll, ln, &locks, link)
+ kfree(ll);
+ kfree(order);
+ kfree(stress);
+}
+
+static void stress_one_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ int err;
+
+ do {
+ err = ww_mutex_lock(lock, NULL);
+ if (!err) {
+ dummy_load(stress);
+ ww_mutex_unlock(lock);
+ } else {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+ } while (!time_after(jiffies, stress->timeout));
+
+ kfree(stress);
+}
+
+#define STRESS_INORDER BIT(0)
+#define STRESS_REORDER BIT(1)
+#define STRESS_ONE BIT(2)
+#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
+
+static int stress(int nlocks, int nthreads, unsigned int flags)
+{
+ struct ww_mutex *locks;
+ int n;
+
+ locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
+ if (!locks)
+ return -ENOMEM;
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_init(&locks[n], &ww_class);
+
+ for (n = 0; nthreads; n++) {
+ struct stress *stress;
+ void (*fn)(struct work_struct *work);
+
+ fn = NULL;
+ switch (n & 3) {
+ case 0:
+ if (flags & STRESS_INORDER)
+ fn = stress_inorder_work;
+ break;
+ case 1:
+ if (flags & STRESS_REORDER)
+ fn = stress_reorder_work;
+ break;
+ case 2:
+ if (flags & STRESS_ONE)
+ fn = stress_one_work;
+ break;
+ }
+
+ if (!fn)
+ continue;
+
+ stress = kmalloc(sizeof(*stress), GFP_KERNEL);
+ if (!stress)
+ break;
+
+ INIT_WORK(&stress->work, fn);
+ stress->locks = locks;
+ stress->nlocks = nlocks;
+ stress->timeout = jiffies + 2*HZ;
+
+ queue_work(wq, &stress->work);
+ nthreads--;
+ }
+
+ flush_workqueue(wq);
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_destroy(&locks[n]);
+ kfree(locks);
+
+ return 0;
+}
+
+static int __init test_ww_mutex_init(void)
+{
+ int ncpus = num_online_cpus();
+ int ret;
+
+ wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+
+ ret = test_mutex();
+ if (ret)
+ return ret;
+
+ ret = test_aa();
+ if (ret)
+ return ret;
+
+ ret = test_abba(false);
+ if (ret)
+ return ret;
+
+ ret = test_abba(true);
+ if (ret)
+ return ret;
+
+ ret = test_cycle(ncpus);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, STRESS_INORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, STRESS_REORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void __exit test_ww_mutex_exit(void)
+{
+ destroy_workqueue(wq);
+}
+
+module_init(test_ww_mutex_init);
+module_exit(test_ww_mutex_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");