summary refs log tree commit diff stats
path: root/fs/dlm
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dlm')
-rw-r--r-- fs/dlm/Kconfig 17
-rw-r--r-- fs/dlm/Makefile 21
-rw-r--r-- fs/dlm/ast.c 271
-rw-r--r-- fs/dlm/ast.h 37
-rw-r--r-- fs/dlm/config.c 966
-rw-r--r-- fs/dlm/config.h 56
-rw-r--r-- fs/dlm/debug_fs.c 1052
-rw-r--r-- fs/dlm/dir.c 307
-rw-r--r-- fs/dlm/dir.h 23
-rw-r--r-- fs/dlm/dlm_internal.h 829
-rw-r--r-- fs/dlm/lock.c 6153
-rw-r--r-- fs/dlm/lock.h 80
-rw-r--r-- fs/dlm/lockspace.c 937
-rw-r--r-- fs/dlm/lockspace.h 37
-rw-r--r-- fs/dlm/lowcomms.c 1997
-rw-r--r-- fs/dlm/lowcomms.h 56
-rw-r--r-- fs/dlm/lvb_table.h 16
-rw-r--r-- fs/dlm/main.c 93
-rw-r--r-- fs/dlm/member.c 752
-rw-r--r-- fs/dlm/member.h 31
-rw-r--r-- fs/dlm/memory.c 175
-rw-r--r-- fs/dlm/memory.h 33
-rw-r--r-- fs/dlm/midcomms.c 1514
-rw-r--r-- fs/dlm/midcomms.h 43
-rw-r--r-- fs/dlm/plock.c 640
-rw-r--r-- fs/dlm/rcom.c 692
-rw-r--r-- fs/dlm/rcom.h 26
-rw-r--r-- fs/dlm/recover.c 958
-rw-r--r-- fs/dlm/recover.h 32
-rw-r--r-- fs/dlm/recoverd.c 378
-rw-r--r-- fs/dlm/recoverd.h 21
-rw-r--r-- fs/dlm/requestqueue.c 174
-rw-r--r-- fs/dlm/requestqueue.h 21
-rw-r--r-- fs/dlm/user.c 1046
-rw-r--r-- fs/dlm/user.h 17
-rw-r--r-- fs/dlm/util.c 66
-rw-r--r-- fs/dlm/util.h 18
37 files changed, 19585 insertions, 0 deletions
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 0000000000..f82a495276
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig DLM
+ tristate "Distributed Lock Manager (DLM)"
+ depends on INET
+ depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
+ select IP_SCTP
+ help
+ A general purpose distributed lock manager for kernel or userspace
+ applications.
+
+config DLM_DEBUG
+ bool "DLM debugging"
+ depends on DLM
+ help
+ Under the debugfs mount point, the name of each lockspace will
+ appear as a file in the "dlm" directory. The output is the
+ list of resources and locks the local node knows about.
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 0000000000..5a471af1d1
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_DLM) += dlm.o
+dlm-y := ast.o \
+ config.o \
+ dir.o \
+ lock.o \
+ lockspace.o \
+ main.o \
+ member.o \
+ memory.o \
+ midcomms.o \
+ lowcomms.o \
+ plock.o \
+ rcom.o \
+ recover.o \
+ recoverd.o \
+ requestqueue.o \
+ user.o \
+ util.o
+dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 0000000000..1f2f70a1b8
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <trace/events/dlm.h>
+
+#include "dlm_internal.h"
+#include "memory.h"
+#include "lock.h"
+#include "user.h"
+#include "ast.h"
+
+/* kref release hook: free the dlm_callback that embeds @ref. */
+void dlm_release_callback(struct kref *ref)
+{
+	dlm_free_cb(container_of(ref, struct dlm_callback, ref));
+}
+
+/*
+ * Repoint the callback slot *from at @to.
+ *
+ * The reference held on the callback currently stored in the slot (if any)
+ * is dropped, and a reference is taken on @to (if not NULL) before it is
+ * stored. Passing to == NULL simply clears the slot.
+ */
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+			       struct dlm_callback *to)
+{
+	struct dlm_callback *prev = *from;
+
+	if (prev != NULL)
+		kref_put(&prev->ref, dlm_release_callback);
+
+	if (to != NULL)
+		kref_get(&to->ref);
+
+	*from = to;
+}
+
+/*
+ * Queue a callback described by @flags/@mode/@status/@sbflags at the tail of
+ * lkb->lkb_callbacks.
+ *
+ * A blocking callback (DLM_CB_BAST) is not queued when its mode is compatible
+ * with the mode of the cast recorded in lkb->lkb_last_cast, or when the
+ * callback recorded in lkb->lkb_last_cb is already a bast that makes it
+ * redundant (see the comments below).
+ *
+ * Returns:
+ *  DLM_ENQUEUE_CALLBACK_NEED_SCHED - queued, and DLM_IFL_CB_PENDING_BIT was
+ *                                    not set before this call
+ *  DLM_ENQUEUE_CALLBACK_SUCCESS    - queued with the pending bit already set,
+ *                                    or skipped as redundant
+ *  DLM_ENQUEUE_CALLBACK_FAILURE    - dlm_allocate_cb() failed
+ */
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
+ struct dlm_callback *cb;
+ int prev_mode;
+
+ if (flags & DLM_CB_BAST) {
+ /* if cb is a bast, it should be skipped if the blocking mode is
+ * compatible with the last granted mode
+ */
+ if (lkb->lkb_last_cast) {
+ if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) {
+ log_debug(ls, "skip %x bast mode %d for cast mode %d",
+ lkb->lkb_id, mode,
+ lkb->lkb_last_cast->mode);
+ goto out;
+ }
+ }
+
+ /*
+ * Suppress some redundant basts here, do more on removal.
+ * Don't even add a bast if the callback just before it
+ * is a bast for the same mode or a more restrictive mode.
+ * (the additional > PR check is needed for PR/CW inversion)
+ */
+ if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) {
+ prev_mode = lkb->lkb_last_cb->mode;
+
+ if ((prev_mode == mode) ||
+ (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
+ log_debug(ls, "skip %x add bast mode %d for bast mode %d",
+ lkb->lkb_id, mode, prev_mode);
+ goto out;
+ }
+ }
+ }
+
+ cb = dlm_allocate_cb();
+ if (!cb) {
+ rv = DLM_ENQUEUE_CALLBACK_FAILURE;
+ goto out;
+ }
+
+ cb->flags = flags;
+ cb->mode = mode;
+ cb->sb_status = status;
+ /* only the low 8 bits of sbflags are stored in the callback */
+ cb->sb_flags = (sbflags & 0x000000FF);
+ kref_init(&cb->ref);
+ /* first callback since the pending bit was cleared: caller must schedule */
+ if (!test_and_set_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags))
+ rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
+
+ list_add_tail(&cb->list, &lkb->lkb_callbacks);
+
+ /* remember the newest cast and the newest callback for the checks above */
+ if (flags & DLM_CB_CAST)
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb);
+
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb);
+
+ out:
+ return rv;
+}
+
+/*
+ * Detach the oldest undelivered callback from lkb->lkb_callbacks and hand it
+ * back through @cb (*cb is set to NULL when the list is empty).
+ *
+ * Returns DLM_DEQUEUE_CALLBACK_EMPTY if nothing was queued,
+ * DLM_DEQUEUE_CALLBACK_LAST if the returned callback was the final entry,
+ * and DLM_DEQUEUE_CALLBACK_SUCCESS if more callbacks remain queued.
+ */
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb)
+{
+	struct dlm_callback *oldest;
+
+	/* the head of the list is the oldest undelivered callback */
+	oldest = list_first_entry_or_null(&lkb->lkb_callbacks,
+					  struct dlm_callback, list);
+	*cb = oldest;
+	if (oldest == NULL)
+		return DLM_DEQUEUE_CALLBACK_EMPTY;
+
+	/* unlink it; the caller now owns the entry */
+	list_del(&oldest->list);
+
+	if (!list_empty(&lkb->lkb_callbacks))
+		return DLM_DEQUEUE_CALLBACK_SUCCESS;
+
+	return DLM_DEQUEUE_CALLBACK_LAST;
+}
+
+/*
+ * Record a callback for @lkb.
+ *
+ * Locks with DLM_DFL_USER_BIT set are handed to dlm_user_add_ast(). For all
+ * other locks the callback is queued on the lkb under lkb_cb_lock; when the
+ * enqueue reports DLM_ENQUEUE_CALLBACK_NEED_SCHED the lkb's work item is
+ * queued on ls_callback_wq, or the lkb is put on ls_cb_delay while
+ * LSFL_CB_DELAY is set. An enqueue failure only triggers a one-time warning.
+ */
+void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+ uint32_t sbflags)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int rv;
+
+ if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
+ dlm_user_add_ast(lkb, flags, mode, status, sbflags);
+ return;
+ }
+
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
+ /* reference for the pending work; dropped in dlm_callback_work() */
+ kref_get(&lkb->lkb_ref);
+
+ spin_lock(&ls->ls_cb_lock);
+ /* while callbacks are delayed, park the lkb instead of queueing work */
+ if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
+ list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
+ } else {
+ queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
+ }
+ spin_unlock(&ls->ls_cb_lock);
+ break;
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
+ WARN_ON_ONCE(1);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ spin_unlock(&lkb->lkb_cb_lock);
+}
+
+/*
+ * Work handler for lkb_cb_work: deliver every callback queued on the lkb.
+ *
+ * Callbacks are dequeued one at a time under lkb_cb_lock and the lock's bast
+ * or cast function is invoked with the lock released. When the queue is found
+ * empty DLM_IFL_CB_PENDING_BIT is cleared. The lkb reference taken when the
+ * work was scheduled is dropped on exit.
+ */
+void dlm_callback_work(struct work_struct *work)
+{
+ struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work);
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ void (*castfn) (void *astparam);
+ void (*bastfn) (void *astparam, int mode);
+ struct dlm_callback *cb;
+ int rv;
+
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ /* work was scheduled, so an empty queue here is unexpected */
+ if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) {
+ clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags);
+ spin_unlock(&lkb->lkb_cb_lock);
+ goto out;
+ }
+ spin_unlock(&lkb->lkb_cb_lock);
+
+ for (;;) {
+ castfn = lkb->lkb_astfn;
+ bastfn = lkb->lkb_bastfn;
+
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(ls, lkb, cb->mode);
+ lkb->lkb_last_bast_time = ktime_get();
+ lkb->lkb_last_bast_mode = cb->mode;
+ bastfn(lkb->lkb_astparam, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ /* publish the result in the caller's lksb before the cast runs */
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
+ trace_dlm_ast(ls, lkb);
+ lkb->lkb_last_cast_time = ktime_get();
+ castfn(lkb->lkb_astparam);
+ }
+
+ /* drop the queue's reference on the delivered callback */
+ kref_put(&cb->ref, dlm_release_callback);
+
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) {
+ clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags);
+ spin_unlock(&lkb->lkb_cb_lock);
+ break;
+ }
+ spin_unlock(&lkb->lkb_cb_lock);
+ }
+
+out:
+ /* undo kref_get from dlm_add_cb, may cause lkb to be freed */
+ dlm_put_lkb(lkb);
+}
+
+/*
+ * Create the lockspace's "dlm_callback" workqueue.
+ *
+ * Returns 0 on success, or -ENOMEM if the workqueue could not be allocated.
+ */
+int dlm_callback_start(struct dlm_ls *ls)
+{
+	ls->ls_callback_wq = alloc_workqueue("dlm_callback",
+					     WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
+	if (ls->ls_callback_wq)
+		return 0;
+
+	log_print("can't start dlm_callback workqueue");
+	return -ENOMEM;
+}
+
+/* Destroy the lockspace's callback workqueue, if one was created. */
+void dlm_callback_stop(struct dlm_ls *ls)
+{
+	if (!ls->ls_callback_wq)
+		return;
+
+	destroy_workqueue(ls->ls_callback_wq);
+}
+
+/*
+ * Set LSFL_CB_DELAY under ls_cb_lock, then flush the callback workqueue.
+ * Does nothing when the lockspace has no callback workqueue.
+ */
+void dlm_callback_suspend(struct dlm_ls *ls)
+{
+	if (!ls->ls_callback_wq)
+		return;
+
+	spin_lock(&ls->ls_cb_lock);
+	set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+	spin_unlock(&ls->ls_cb_lock);
+
+	flush_workqueue(ls->ls_callback_wq);
+}
+
+/* max number of delayed lkbs queued per ls_cb_lock hold in dlm_callback_resume() */
+#define MAX_CB_QUEUE 25
+
+/*
+ * Queue the callback work of every lkb on ls_cb_delay.
+ *
+ * The list is drained in batches of at most MAX_CB_QUEUE entries, releasing
+ * ls_cb_lock and calling cond_resched() between batches. LSFL_CB_DELAY is
+ * cleared, under the lock, once the list is seen empty. The total number of
+ * lkbs queued is logged if non-zero.
+ */
+void dlm_callback_resume(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb, *safe;
+ int count = 0, sum = 0;
+ bool empty;
+
+ if (!ls->ls_callback_wq)
+ return;
+
+more:
+ spin_lock(&ls->ls_cb_lock);
+ list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
+ list_del_init(&lkb->lkb_cb_list);
+ queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
+ count++;
+ if (count == MAX_CB_QUEUE)
+ break;
+ }
+ empty = list_empty(&ls->ls_cb_delay);
+ if (empty)
+ clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock(&ls->ls_cb_lock);
+
+ sum += count;
+ if (!empty) {
+ /* more entries remain: start a fresh batch after yielding */
+ count = 0;
+ cond_resched();
+ goto more;
+ }
+
+ if (sum)
+ log_rinfo(ls, "%s %d", __func__, sum);
+}
+
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 0000000000..ce007892dc
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+/*
+ * Return values of dlm_enqueue_lkb_callback():
+ *  NEED_SCHED - callback queued and no delivery was pending before
+ *  SUCCESS    - callback queued (delivery already pending) or skipped
+ *  FAILURE    - callback could not be allocated
+ */
+#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1
+#define DLM_ENQUEUE_CALLBACK_SUCCESS 0
+#define DLM_ENQUEUE_CALLBACK_FAILURE -1
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags);
+/*
+ * Return values of dlm_dequeue_lkb_callback():
+ *  EMPTY   - no callback was queued
+ *  LAST    - a callback was returned and the queue is now empty
+ *  SUCCESS - a callback was returned and more remain queued
+ */
+#define DLM_DEQUEUE_CALLBACK_EMPTY 2
+#define DLM_DEQUEUE_CALLBACK_LAST 1
+#define DLM_DEQUEUE_CALLBACK_SUCCESS 0
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb);
+void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+ uint32_t sbflags);
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to);
+
+/* callback lifetime and per-lockspace callback workqueue management */
+void dlm_release_callback(struct kref *ref);
+void dlm_callback_work(struct work_struct *work);
+int dlm_callback_start(struct dlm_ls *ls);
+void dlm_callback_stop(struct dlm_ls *ls);
+void dlm_callback_suspend(struct dlm_ls *ls);
+void dlm_callback_resume(struct dlm_ls *ls);
+
+#endif
+
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 0000000000..e55e0a2cd2
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,966 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/configfs.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/dlmconstants.h>
+#include <net/ipv6.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "midcomms.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr (write only)
+ * /config/dlm/<cluster>/comms/<comm>/addr_list (read only)
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct dlm_comm *local_comm;
+static uint32_t dlm_comm_count;
+
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static struct configfs_attribute *comm_attrs[];
+static struct configfs_attribute *node_attrs[];
+
+/*
+ * One /config/dlm/<cluster> directory.
+ *
+ * The cl_* members hold the values reported by the cluster attribute files;
+ * the store handlers write each new value here and into the global
+ * dlm_config. sps and cms are the default "spaces" and "comms" child groups
+ * allocated in make_cluster() and freed in release_cluster().
+ */
+struct dlm_cluster {
+ struct config_group group;
+ unsigned int cl_tcp_port;
+ unsigned int cl_buffer_size;
+ unsigned int cl_rsbtbl_size;
+ unsigned int cl_recover_timer;
+ unsigned int cl_toss_secs;
+ unsigned int cl_scan_secs;
+ unsigned int cl_log_debug;
+ unsigned int cl_log_info;
+ unsigned int cl_protocol;
+ unsigned int cl_mark;
+ unsigned int cl_new_rsb_count;
+ unsigned int cl_recover_callbacks;
+ char cl_cluster_name[DLM_LOCKSPACE_LEN];
+
+ struct dlm_spaces *sps;
+ struct dlm_comms *cms;
+};
+
+/* Map a config_item to the dlm_cluster that embeds its group; NULL maps to NULL. */
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(to_config_group(i), struct dlm_cluster, group);
+}
+
+/* Indices of the entries in cluster_attrs[]. */
+enum {
+ CLUSTER_ATTR_TCP_PORT = 0,
+ CLUSTER_ATTR_BUFFER_SIZE,
+ CLUSTER_ATTR_RSBTBL_SIZE,
+ CLUSTER_ATTR_RECOVER_TIMER,
+ CLUSTER_ATTR_TOSS_SECS,
+ CLUSTER_ATTR_SCAN_SECS,
+ CLUSTER_ATTR_LOG_DEBUG,
+ CLUSTER_ATTR_LOG_INFO,
+ CLUSTER_ATTR_PROTOCOL,
+ CLUSTER_ATTR_MARK,
+ CLUSTER_ATTR_NEW_RSB_COUNT,
+ CLUSTER_ATTR_RECOVER_CALLBACKS,
+ CLUSTER_ATTR_CLUSTER_NAME,
+};
+
+/* Show handler for "cluster_name": prints cl_cluster_name followed by a newline. */
+static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
+{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
+ return sprintf(buf, "%s\n", cl->cl_cluster_name);
+}
+
+/*
+ * Store handler for "cluster_name": copies @buf (truncated to the destination
+ * size by strscpy()) into both dlm_config.ci_cluster_name and the cluster's
+ * own cl_cluster_name. Always returns @len.
+ *
+ * NOTE(review): unlike cluster_set() there is no CAP_SYS_ADMIN check here,
+ * and the input is copied as written, so a trailing newline from the writer
+ * would be kept - confirm both are intended.
+ */
+static ssize_t cluster_cluster_name_store(struct config_item *item,
+ const char *buf, size_t len)
+{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
+
+ strscpy(dlm_config.ci_cluster_name, buf,
+ sizeof(dlm_config.ci_cluster_name));
+ strscpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
+ return len;
+}
+
+CONFIGFS_ATTR(cluster_, cluster_name);
+
+/*
+ * Common store path for the numeric cluster attributes.
+ *
+ * @cl:         cluster being written (not used in this function)
+ * @cl_field:   per-cluster copy of the value
+ * @info_field: matching field in the global dlm_config
+ * @check_cb:   optional validator; a non-zero return rejects the value and is
+ *              returned to the writer
+ * @buf, @len:  text written by user space
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise). The text is parsed with
+ * kstrtouint() using base 0. On success the value is written to both fields
+ * and @len is returned; otherwise a negative errno is returned and neither
+ * field is modified.
+ *
+ * NOTE(review): *info_field is an int while the parsed value is unsigned, so
+ * values above INT_MAX are stored converted to int.
+ */
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
+ int *info_field, int (*check_cb)(unsigned int x),
+ const char *buf, size_t len)
+{
+ unsigned int x;
+ int rc;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ rc = kstrtouint(buf, 0, &x);
+ if (rc)
+ return rc;
+
+ if (check_cb) {
+ rc = check_cb(x);
+ if (rc)
+ return rc;
+ }
+
+ *cl_field = x;
+ *info_field = x;
+
+ return len;
+}
+
+/*
+ * CLUSTER_ATTR(name, check_cb) - define a numeric cluster attribute.
+ *
+ * Generates cluster_<name>_store(), which forwards to cluster_set() with the
+ * cl_<name> and dlm_config.ci_<name> fields and the given validator, and
+ * cluster_<name>_show(), which prints cl_<name> as "%u\n"; then declares the
+ * configfs attribute with CONFIGFS_ATTR().
+ */
+#define CLUSTER_ATTR(name, check_cb) \
+static ssize_t cluster_##name##_store(struct config_item *item, \
+ const char *buf, size_t len) \
+{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
+ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
+ check_cb, buf, len); \
+} \
+static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
+{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
+ return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
+} \
+CONFIGFS_ATTR(cluster_, name);
+
+/*
+ * Validator for the "protocol" attribute.
+ *
+ * Returns -EINVAL unless @x is 0 (TCP) or 1 (SCTP), -EBUSY if lowcomms is
+ * already running, and 0 otherwise.
+ */
+static int dlm_check_protocol_and_dlm_running(unsigned int x)
+{
+	/* 0 selects TCP, 1 selects SCTP; nothing else is accepted */
+	if (x != 0 && x != 1)
+		return -EINVAL;
+
+	if (dlm_lowcomms_is_running())
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Validator: rejects zero with -EINVAL, and any change while lowcomms is
+ * running with -EBUSY. Returns 0 when the value is acceptable.
+ */
+static int dlm_check_zero_and_dlm_running(unsigned int x)
+{
+	if (x == 0)
+		return -EINVAL;
+
+	return dlm_lowcomms_is_running() ? -EBUSY : 0;
+}
+
+/* Validator: returns -EINVAL for zero, 0 for any other value. */
+static int dlm_check_zero(unsigned int x)
+{
+ if (!x)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Validator for "buffer_size": values below DLM_MAX_SOCKET_BUFSIZE are
+ * rejected with -EINVAL; anything else is accepted (returns 0).
+ */
+static int dlm_check_buffer_size(unsigned int x)
+{
+	if (x >= DLM_MAX_SOCKET_BUFSIZE)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * Numeric cluster attributes. The second argument is the validator handed to
+ * cluster_set(); NULL means the parsed value is stored without validation.
+ */
+CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running);
+CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
+CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
+CLUSTER_ATTR(recover_timer, dlm_check_zero);
+CLUSTER_ATTR(toss_secs, dlm_check_zero);
+CLUSTER_ATTR(scan_secs, dlm_check_zero);
+CLUSTER_ATTR(log_debug, NULL);
+CLUSTER_ATTR(log_info, NULL);
+CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
+CLUSTER_ATTR(mark, NULL);
+CLUSTER_ATTR(new_rsb_count, NULL);
+CLUSTER_ATTR(recover_callbacks, NULL);
+
+/*
+ * NULL-terminated attribute table referenced by cluster_type.ct_attrs,
+ * indexed by the CLUSTER_ATTR_* enum.
+ */
+static struct configfs_attribute *cluster_attrs[] = {
+ [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
+ [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size,
+ [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size,
+ [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer,
+ [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
+ [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
+ [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
+ [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
+ [CLUSTER_ATTR_MARK] = &cluster_attr_mark,
+ [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
+ NULL,
+};
+
+/* Indices of the entries in comm_attrs[]. */
+enum {
+ COMM_ATTR_NODEID = 0,
+ COMM_ATTR_LOCAL,
+ COMM_ATTR_ADDR,
+ COMM_ATTR_ADDR_LIST,
+ COMM_ATTR_MARK,
+};
+
+/* Indices of the entries in node_attrs[]. */
+enum {
+ NODE_ATTR_NODEID = 0,
+ NODE_ATTR_WEIGHT,
+};
+
+/* Root of the "dlm" configfs subsystem (see clusters_root). */
+struct dlm_clusters {
+ struct configfs_subsystem subsys;
+};
+
+/* The default "spaces" group created under each cluster. */
+struct dlm_spaces {
+ struct config_group ss_group;
+};
+
+/*
+ * One lockspace directory under "spaces". members is the list of dlm_node
+ * entries created under its "nodes" group and members_count their number;
+ * both are updated under members_lock. nds is the default "nodes" group
+ * allocated in make_space().
+ */
+struct dlm_space {
+ struct config_group group;
+ struct list_head members;
+ struct mutex members_lock;
+ int members_count;
+ struct dlm_nodes *nds;
+};
+
+/* The default "comms" group created under each cluster. */
+struct dlm_comms {
+ struct config_group cs_group;
+};
+
+/* One entry under "comms". */
+struct dlm_comm {
+ struct config_item item;
+ int seq; /* taken from dlm_comm_count in make_comm(), never 0 */
+ int nodeid; /* -1 until the "nodeid" attribute is written */
+ int local; /* value written to the "local" attribute */
+ int addr_count; /* number of valid entries in addr[] */
+ unsigned int mark; /* value accepted by the "mark" attribute */
+ struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+/* The default "nodes" group created under each lockspace. */
+struct dlm_nodes {
+ struct config_group ns_group;
+};
+
+/* One entry under a lockspace's "nodes" group. */
+struct dlm_node {
+ struct config_item item;
+ struct list_head list; /* space->members */
+ int nodeid; /* -1 until the "nodeid" attribute is written */
+ int weight; /* defaults to 1 in make_node() */
+ int new; /* 1 until reported once by dlm_config_nodes() */
+ int comm_seq; /* copy of cm->seq when nd->nodeid is set */
+};
+
+/*
+ * configfs operation tables. For each level the group operations create and
+ * drop children (make_*/drop_*), and the item operations free the object when
+ * it is released (release_*).
+ */
+static struct configfs_group_operations clusters_ops = {
+ .make_group = make_cluster,
+ .drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+ .release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+ .make_group = make_space,
+ .drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+ .release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+ .make_item = make_comm,
+ .drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+ .release = release_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+ .make_item = make_node,
+ .drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+ .release = release_node,
+};
+
+/*
+ * configfs item types tying each level of the tree to its operation tables
+ * and, for cluster, comm and node items, to its attribute table.
+ */
+static const struct config_item_type clusters_type = {
+ .ct_group_ops = &clusters_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type cluster_type = {
+ .ct_item_ops = &cluster_ops,
+ .ct_attrs = cluster_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type spaces_type = {
+ .ct_group_ops = &spaces_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type space_type = {
+ .ct_item_ops = &space_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type comms_type = {
+ .ct_group_ops = &comms_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type comm_type = {
+ .ct_item_ops = &comm_ops,
+ .ct_attrs = comm_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type nodes_type = {
+ .ct_group_ops = &nodes_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static const struct config_item_type node_type = {
+ .ct_item_ops = &node_ops,
+ .ct_attrs = node_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Map a config_item to the dlm_space that embeds its group; NULL maps to NULL. */
+static struct dlm_space *config_item_to_space(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(to_config_group(i), struct dlm_space, group);
+}
+
+/* Map a config_item to the dlm_comm that embeds it; NULL maps to NULL. */
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(i, struct dlm_comm, item);
+}
+
+/* Map a config_item to the dlm_node that embeds it; NULL maps to NULL. */
+static struct dlm_node *config_item_to_node(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(i, struct dlm_node, item);
+}
+
+/*
+ * Create a cluster group named @name.
+ *
+ * Allocates the cluster together with its default "spaces" and "comms" child
+ * groups, seeds every per-cluster attribute value from the global dlm_config,
+ * and publishes the two child groups through space_list and comm_list.
+ *
+ * Returns the new group, or ERR_PTR(-ENOMEM) if any allocation fails.
+ */
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct dlm_cluster *cl = NULL;
+	struct dlm_spaces *sps = NULL;
+	struct dlm_comms *cms = NULL;
+
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
+
+	if (!cl || !sps || !cms)
+		goto fail;
+
+	cl->sps = sps;
+	cl->cms = cms;
+
+	config_group_init_type_name(&cl->group, name, &cluster_type);
+	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+	configfs_add_default_group(&sps->ss_group, &cl->group);
+	configfs_add_default_group(&cms->cs_group, &cl->group);
+
+	cl->cl_tcp_port = dlm_config.ci_tcp_port;
+	cl->cl_buffer_size = dlm_config.ci_buffer_size;
+	cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
+	cl->cl_recover_timer = dlm_config.ci_recover_timer;
+	cl->cl_toss_secs = dlm_config.ci_toss_secs;
+	cl->cl_scan_secs = dlm_config.ci_scan_secs;
+	cl->cl_log_debug = dlm_config.ci_log_debug;
+	cl->cl_log_info = dlm_config.ci_log_info;
+	cl->cl_protocol = dlm_config.ci_protocol;
+	/*
+	 * Seed "mark" like every other attribute, so that the value shown
+	 * for a newly created cluster matches what dlm_config holds.
+	 */
+	cl->cl_mark = dlm_config.ci_mark;
+	cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
+	cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
+	memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
+	       DLM_LOCKSPACE_LEN);
+
+	space_list = &sps->ss_group;
+	comm_list = &cms->cs_group;
+	return &cl->group;
+
+ fail:
+	/* kfree(NULL) is a no-op, so a partial allocation is handled too */
+	kfree(cl);
+	kfree(sps);
+	kfree(cms);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Drop a cluster group: remove its default child groups, clear the global
+ * space_list/comm_list pointers, and drop the item reference.
+ */
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+ struct dlm_cluster *cl = config_item_to_cluster(i);
+
+ configfs_remove_default_groups(&cl->group);
+
+ space_list = NULL;
+ comm_list = NULL;
+
+ config_item_put(i);
+}
+
+/* Release hook: free the cluster and the two child groups from make_cluster(). */
+static void release_cluster(struct config_item *i)
+{
+ struct dlm_cluster *cl = config_item_to_cluster(i);
+
+ kfree(cl->sps);
+ kfree(cl->cms);
+ kfree(cl);
+}
+
+/*
+ * Create a lockspace group named @name with its default "nodes" child group
+ * and an empty member list.
+ *
+ * Returns the new group, or ERR_PTR(-ENOMEM) if either allocation fails.
+ */
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+ struct dlm_space *sp = NULL;
+ struct dlm_nodes *nds = NULL;
+
+ sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
+ nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
+
+ if (!sp || !nds)
+ goto fail;
+
+ config_group_init_type_name(&sp->group, name, &space_type);
+
+ config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+ configfs_add_default_group(&nds->ns_group, &sp->group);
+
+ INIT_LIST_HEAD(&sp->members);
+ mutex_init(&sp->members_lock);
+ sp->members_count = 0;
+ sp->nds = nds;
+ return &sp->group;
+
+ fail:
+ /* kfree(NULL) is a no-op, so a partial allocation is handled too */
+ kfree(sp);
+ kfree(nds);
+ return ERR_PTR(-ENOMEM);
+}
+
+/* Drop a lockspace group: remove its default "nodes" group and drop the item reference. */
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+ struct dlm_space *sp = config_item_to_space(i);
+
+ /* assert list_empty(&sp->members) */
+
+ configfs_remove_default_groups(&sp->group);
+ config_item_put(i);
+}
+
+/* Release hook: free the lockspace and the "nodes" group from make_space(). */
+static void release_space(struct config_item *i)
+{
+ struct dlm_space *sp = config_item_to_space(i);
+ kfree(sp->nds);
+ kfree(sp);
+}
+
+/*
+ * Create a comm item named @name with no nodeid (-1), no addresses and a
+ * sequence number taken from the global dlm_comm_count.
+ *
+ * Returns the new item, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+ struct dlm_comm *cm;
+
+ cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
+ if (!cm)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&cm->item, name, &comm_type);
+
+ /* never hand out seq 0: take the next counter value if this one is 0 */
+ cm->seq = dlm_comm_count++;
+ if (!cm->seq)
+ cm->seq = dlm_comm_count++;
+
+ cm->nodeid = -1;
+ cm->local = 0;
+ cm->addr_count = 0;
+ cm->mark = 0;
+ return &cm->item;
+}
+
+/*
+ * Drop a comm item: forget it as local_comm if it is the local one, close
+ * its midcomms connection, free its stored addresses and drop the item
+ * reference.
+ */
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+ struct dlm_comm *cm = config_item_to_comm(i);
+ if (local_comm == cm)
+ local_comm = NULL;
+ dlm_midcomms_close(cm->nodeid);
+ /* frees addr[addr_count - 1] .. addr[0]; addr_count is left at -1 */
+ while (cm->addr_count--)
+ kfree(cm->addr[cm->addr_count]);
+ config_item_put(i);
+}
+
+/* Release hook: free the dlm_comm that embeds @i. */
+static void release_comm(struct config_item *i)
+{
+	kfree(config_item_to_comm(i));
+}
+
+/*
+ * Create a node item named @name and add it to the member list of the
+ * lockspace that owns the "nodes" group @g.
+ *
+ * Returns the new item, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+ struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+ struct dlm_node *nd;
+
+ nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
+ if (!nd)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&nd->item, name, &node_type);
+ nd->nodeid = -1;
+ nd->weight = 1; /* default weight of 1 if none is set */
+ nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */
+
+ mutex_lock(&sp->members_lock);
+ list_add(&nd->list, &sp->members);
+ sp->members_count++;
+ mutex_unlock(&sp->members_lock);
+
+ return &nd->item;
+}
+
+/*
+ * Drop a node item: unlink it from its lockspace's member list under
+ * members_lock and drop the item reference.
+ */
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+ struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+ struct dlm_node *nd = config_item_to_node(i);
+
+ mutex_lock(&sp->members_lock);
+ list_del(&nd->list);
+ sp->members_count--;
+ mutex_unlock(&sp->members_lock);
+
+ config_item_put(i);
+}
+
+/* Release hook: free the dlm_node that embeds @i. */
+static void release_node(struct config_item *i)
+{
+	kfree(config_item_to_node(i));
+}
+
+/* The "dlm" configfs subsystem, registered by dlm_config_init(). */
+static struct dlm_clusters clusters_root = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "dlm",
+ .ci_type = &clusters_type,
+ },
+ },
+ },
+};
+
+/*
+ * Initialise and register the "dlm" configfs subsystem.
+ * Returns the result of configfs_register_subsystem().
+ */
+int __init dlm_config_init(void)
+{
+ config_group_init(&clusters_root.subsys.su_group);
+ mutex_init(&clusters_root.subsys.su_mutex);
+ return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+/* Unregister the "dlm" configfs subsystem. */
+void dlm_config_exit(void)
+{
+ configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+/* Show handler for comm "nodeid": prints the nodeid as a decimal line. */
+static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
+{
+	struct dlm_comm *cm = config_item_to_comm(item);
+
+	return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+/*
+ * Store handler for comm "nodeid": parses @buf with kstrtoint() (base 0)
+ * straight into the comm's nodeid. Returns @len, or the parse error.
+ */
+static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
+				 size_t len)
+{
+	struct dlm_comm *cm = config_item_to_comm(item);
+	int rc;
+
+	rc = kstrtoint(buf, 0, &cm->nodeid);
+	if (rc != 0)
+		return rc;
+
+	return len;
+}
+
+/* Show handler for comm "local": prints the local flag as a decimal line. */
+static ssize_t comm_local_show(struct config_item *item, char *buf)
+{
+	struct dlm_comm *cm = config_item_to_comm(item);
+
+	return sprintf(buf, "%d\n", cm->local);
+}
+
+/*
+ * Store handler for comm "local": parses @buf into cm->local and, when the
+ * value is non-zero and no local comm is recorded yet, makes this comm the
+ * global local_comm. Returns @len, or the parse error.
+ */
+static ssize_t comm_local_store(struct config_item *item, const char *buf,
+ size_t len)
+{
+ struct dlm_comm *cm = config_item_to_comm(item);
+ int rc = kstrtoint(buf, 0, &cm->local);
+
+ if (rc)
+ return rc;
+ /* the first comm marked local wins; later ones do not replace it */
+ if (cm->local && !local_comm)
+ local_comm = cm;
+ return len;
+}
+
+/*
+ * Store handler for comm "addr": appends one address to the comm.
+ *
+ * @buf must hold exactly one struct sockaddr_storage (-EINVAL otherwise), and
+ * at most DLM_MAX_ADDR_COUNT addresses are kept (-ENOSPC beyond that). The
+ * address is copied and passed to dlm_midcomms_addr(); if that fails the copy
+ * is freed and the error returned. Returns @len on success, -ENOMEM if the
+ * copy cannot be allocated.
+ */
+static ssize_t comm_addr_store(struct config_item *item, const char *buf,
+ size_t len)
+{
+ struct dlm_comm *cm = config_item_to_comm(item);
+ struct sockaddr_storage *addr;
+ int rv;
+
+ if (len != sizeof(struct sockaddr_storage))
+ return -EINVAL;
+
+ if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+ return -ENOSPC;
+
+ addr = kzalloc(sizeof(*addr), GFP_NOFS);
+ if (!addr)
+ return -ENOMEM;
+
+ memcpy(addr, buf, len);
+
+ rv = dlm_midcomms_addr(cm->nodeid, addr, len);
+ if (rv) {
+ kfree(addr);
+ return rv;
+ }
+
+ /* the comm now owns the copy; it is freed in drop_comm() */
+ cm->addr[cm->addr_count++] = addr;
+ return len;
+}
+
+/*
+ * Show handler for comm "addr_list": writes one line per stored address,
+ * "AF_INET <addr>", "AF_INET6 <addr>" or "<UNKNOWN>" depending on the address
+ * family, and returns the number of bytes written.
+ *
+ * Each line is first formatted into the scratch buffer buf0 and appended to
+ * @buf only if it still fits in the 4096-byte budget; output stops at the
+ * first line that does not fit.
+ *
+ * NOTE(review): assumes @buf can hold at least 4096 bytes plus the
+ * terminating NUL written by strcat() - confirm against configfs.
+ */
+static ssize_t comm_addr_list_show(struct config_item *item, char *buf)
+{
+ struct dlm_comm *cm = config_item_to_comm(item);
+ ssize_t s;
+ ssize_t allowance;
+ int i;
+ struct sockaddr_storage *addr;
+ struct sockaddr_in *addr_in;
+ struct sockaddr_in6 *addr_in6;
+
+ /* Taken from ip6_addr_string() defined in lib/vsprintf.c */
+ char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")];
+
+
+ /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */
+ allowance = 4096;
+ buf[0] = '\0';
+
+ for (i = 0; i < cm->addr_count; i++) {
+ addr = cm->addr[i];
+
+ switch(addr->ss_family) {
+ case AF_INET:
+ addr_in = (struct sockaddr_in *)addr;
+ s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ addr_in6 = (struct sockaddr_in6 *)addr;
+ s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr);
+ break;
+ default:
+ s = sprintf(buf0, "%s\n", "<UNKNOWN>");
+ break;
+ }
+ /* charge the line against the budget; undo and stop if it overflows */
+ allowance -= s;
+ if (allowance >= 0)
+ strcat(buf, buf0);
+ else {
+ allowance += s;
+ break;
+ }
+ }
+ /* bytes consumed from the budget == bytes appended to buf */
+ return 4096 - allowance;
+}
+
+/* Show handler for comm "mark": prints the mark as an unsigned decimal line. */
+static ssize_t comm_mark_show(struct config_item *item, char *buf)
+{
+	struct dlm_comm *cm = config_item_to_comm(item);
+
+	return sprintf(buf, "%u\n", cm->mark);
+}
+
+/*
+ * Store handler for comm "mark".
+ *
+ * Parses @buf with kstrtouint() (base 0); a value of 0 is replaced by the
+ * global dlm_config.ci_mark. The mark is applied through
+ * dlm_lowcomms_nodes_set_mark() and recorded in the comm only if that
+ * succeeds. Returns @len on success or a negative errno.
+ */
+static ssize_t comm_mark_store(struct config_item *item, const char *buf,
+			       size_t len)
+{
+	struct dlm_comm *comm = config_item_to_comm(item);
+	unsigned int mark;
+	int rc;
+
+	rc = kstrtouint(buf, 0, &mark);
+	if (rc != 0)
+		return rc;
+
+	/* 0 means "use the cluster-wide default mark" */
+	if (!mark)
+		mark = dlm_config.ci_mark;
+
+	rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark);
+	if (rc != 0)
+		return rc;
+
+	comm->mark = mark;
+	return len;
+}
+
+/* comm attributes: "addr" is write-only and "addr_list" is read-only */
+CONFIGFS_ATTR(comm_, nodeid);
+CONFIGFS_ATTR(comm_, local);
+CONFIGFS_ATTR(comm_, mark);
+CONFIGFS_ATTR_WO(comm_, addr);
+CONFIGFS_ATTR_RO(comm_, addr_list);
+
+/* NULL-terminated table referenced by comm_type.ct_attrs */
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid,
+ [COMM_ATTR_LOCAL] = &comm_attr_local,
+ [COMM_ATTR_ADDR] = &comm_attr_addr,
+ [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list,
+ [COMM_ATTR_MARK] = &comm_attr_mark,
+ NULL,
+};
+
+/* Show handler for node "nodeid": prints the nodeid as a decimal line. */
+static ssize_t node_nodeid_show(struct config_item *item, char *buf)
+{
+	struct dlm_node *nd = config_item_to_node(item);
+
+	return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+/*
+ * Store handler for node "nodeid": parses @buf into nd->nodeid and records
+ * the sequence number of the comm with that nodeid in nd->comm_seq.
+ * Returns @len, or the parse error.
+ */
+static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
+ size_t len)
+{
+ struct dlm_node *nd = config_item_to_node(item);
+ uint32_t seq = 0;
+ int rc = kstrtoint(buf, 0, &nd->nodeid);
+
+ if (rc)
+ return rc;
+ /* result ignored: if no comm matches, seq stays 0 */
+ dlm_comm_seq(nd->nodeid, &seq);
+ nd->comm_seq = seq;
+ return len;
+}
+
+/* Show handler for node "weight": prints the weight as a decimal line. */
+static ssize_t node_weight_show(struct config_item *item, char *buf)
+{
+	struct dlm_node *nd = config_item_to_node(item);
+
+	return sprintf(buf, "%d\n", nd->weight);
+}
+
+/*
+ * Store handler for node "weight": parses @buf with kstrtoint() (base 0)
+ * straight into the node's weight. Returns @len, or the parse error.
+ */
+static ssize_t node_weight_store(struct config_item *item, const char *buf,
+				 size_t len)
+{
+	struct dlm_node *nd = config_item_to_node(item);
+	int rc;
+
+	rc = kstrtoint(buf, 0, &nd->weight);
+	if (rc != 0)
+		return rc;
+
+	return len;
+}
+
+/* node attributes, both readable and writable */
+CONFIGFS_ATTR(node_, nodeid);
+CONFIGFS_ATTR(node_, weight);
+
+/* NULL-terminated table referenced by node_type.ct_attrs */
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight,
+ NULL,
+};
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+/*
+ * Look up the lockspace named @name under space_list, holding the subsystem
+ * mutex for the lookup. Returns NULL if there is no space_list or no such
+ * item. Callers in this file release the result with put_space().
+ */
+static struct dlm_space *get_space(char *name)
+{
+ struct config_item *i;
+
+ if (!space_list)
+ return NULL;
+
+ mutex_lock(&space_list->cg_subsys->su_mutex);
+ i = config_group_find_item(space_list, name);
+ mutex_unlock(&space_list->cg_subsys->su_mutex);
+
+ return config_item_to_space(i);
+}
+
+/* Drop a reference on the lockspace's config item. */
+static void put_space(struct dlm_space *sp)
+{
+ config_item_put(&sp->group.cg_item);
+}
+
+/*
+ * Find the comm whose nodeid equals @nodeid among the children of comm_list,
+ * searching under the subsystem mutex. On a match a reference is taken on
+ * the item; release it with put_comm().
+ *
+ * Returns NULL if there is no comm_list or no comm matches.
+ */
+static struct dlm_comm *get_comm(int nodeid)
+{
+	struct dlm_comm *match = NULL;
+	struct config_item *i;
+
+	if (!comm_list)
+		return NULL;
+
+	mutex_lock(&clusters_root.subsys.su_mutex);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		struct dlm_comm *cm = config_item_to_comm(i);
+
+		if (cm->nodeid == nodeid) {
+			config_item_get(i);
+			match = cm;
+			break;
+		}
+	}
+
+	mutex_unlock(&clusters_root.subsys.su_mutex);
+
+	return match;
+}
+
+/* Drop the reference taken by get_comm(). */
+static void put_comm(struct dlm_comm *cm)
+{
+ config_item_put(&cm->item);
+}
+
+/*
+ * Snapshot the configured members of lockspace @lsname.
+ *
+ * On success *nodes_out points to a newly allocated array of *count_out
+ * entries (caller must free mem), one per member, and each member's "new"
+ * flag is reset to 0 after it has been copied.
+ *
+ * Returns 0 on success, -EEXIST if the lockspace is not found, -EINVAL if it
+ * has no members, or -ENOMEM if the array cannot be allocated. The output
+ * parameters are written only on success.
+ */
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out)
+{
+ struct dlm_space *sp;
+ struct dlm_node *nd;
+ struct dlm_config_node *nodes, *node;
+ int rv, count;
+
+ sp = get_space(lsname);
+ if (!sp)
+ return -EEXIST;
+
+ mutex_lock(&sp->members_lock);
+ if (!sp->members_count) {
+ rv = -EINVAL;
+ printk(KERN_ERR "dlm: zero members_count\n");
+ goto out;
+ }
+
+ count = sp->members_count;
+
+ nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
+ if (!nodes) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ node = nodes;
+ list_for_each_entry(nd, &sp->members, list) {
+ node->nodeid = nd->nodeid;
+ node->weight = nd->weight;
+ node->new = nd->new;
+ node->comm_seq = nd->comm_seq;
+ node++;
+
+ /* reported once; later snapshots see this member as not new */
+ nd->new = 0;
+ }
+
+ *count_out = count;
+ *nodes_out = nodes;
+ rv = 0;
+ out:
+ mutex_unlock(&sp->members_lock);
+ put_space(sp);
+ return rv;
+}
+
+/*
+ * Store the sequence number of the comm with @nodeid in *seq.
+ * Returns 0 on success, or -EEXIST (leaving *seq untouched) if no comm has
+ * that nodeid.
+ */
+int dlm_comm_seq(int nodeid, uint32_t *seq)
+{
+ struct dlm_comm *cm = get_comm(nodeid);
+ if (!cm)
+ return -EEXIST;
+ *seq = cm->seq;
+ put_comm(cm);
+ return 0;
+}
+
+/* Return the nodeid of the local comm, or 0 if no local comm is configured. */
+int dlm_our_nodeid(void)
+{
+	if (!local_comm)
+		return 0;
+
+	return local_comm->nodeid;
+}
+
+/*
+ * Copy address number @num of the local comm into @addr
+ * (num 0 is first addr, num 1 is second addr).
+ *
+ * Returns 0 on success, or -1 if no local comm is configured or @num is not
+ * a valid index into its address list.
+ */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	/*
+	 * Reject negative indices as well as too-large ones, and compare
+	 * without computing num + 1, which would overflow for INT_MAX.
+	 */
+	if (num < 0 || num >= local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT 21064
+#define DEFAULT_RSBTBL_SIZE 1024
+#define DEFAULT_RECOVER_TIMER 5
+#define DEFAULT_TOSS_SECS 10
+#define DEFAULT_SCAN_SECS 5
+#define DEFAULT_LOG_DEBUG 0
+#define DEFAULT_LOG_INFO 1
+#define DEFAULT_PROTOCOL DLM_PROTO_TCP
+#define DEFAULT_MARK 0
+#define DEFAULT_NEW_RSB_COUNT 128
+#define DEFAULT_RECOVER_CALLBACKS 0
+#define DEFAULT_CLUSTER_NAME ""
+
+/*
+ * Global configuration, initialised to the defaults above. The cluster
+ * attribute store handlers overwrite individual fields, and make_cluster()
+ * copies the current values into each new cluster.
+ */
+struct dlm_config_info dlm_config = {
+ .ci_tcp_port = DEFAULT_TCP_PORT,
+ .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE,
+ .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+ .ci_recover_timer = DEFAULT_RECOVER_TIMER,
+ .ci_toss_secs = DEFAULT_TOSS_SECS,
+ .ci_scan_secs = DEFAULT_SCAN_SECS,
+ .ci_log_debug = DEFAULT_LOG_DEBUG,
+ .ci_log_info = DEFAULT_LOG_INFO,
+ .ci_protocol = DEFAULT_PROTOCOL,
+ .ci_mark = DEFAULT_MARK,
+ .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
+ .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
+ .ci_cluster_name = DEFAULT_CLUSTER_NAME
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 0000000000..4c91fcca0f
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+/* Socket buffer size; also the initial value of dlm_config.ci_buffer_size. */
+#define DLM_MAX_SOCKET_BUFSIZE 4096
+
+/*
+ * One lockspace member as reported by dlm_config_nodes(), which fills an
+ * array of these by copying the same-named fields from each configured
+ * member.
+ */
+struct dlm_config_node {
+ int nodeid;
+ int weight;
+ int new; /* dlm_config_nodes() clears the member's flag after copying it */
+ uint32_t comm_seq;
+};
+
+/* NOTE(review): presumably the most addresses one comm may have -- confirm */
+#define DLM_MAX_ADDR_COUNT 3
+
+/* Values for dlm_config.ci_protocol; config.c defaults to DLM_PROTO_TCP. */
+#define DLM_PROTO_TCP 0
+#define DLM_PROTO_SCTP 1
+
+/*
+ * Cluster-wide DLM tunables. A single instance, dlm_config, is defined in
+ * config.c, where each field is given its default value.
+ */
+struct dlm_config_info {
+ int ci_tcp_port;
+ int ci_buffer_size;
+ int ci_rsbtbl_size;
+ int ci_recover_timer;
+ int ci_toss_secs;
+ int ci_scan_secs;
+ int ci_log_debug;
+ int ci_log_info;
+ int ci_protocol; /* DLM_PROTO_TCP or DLM_PROTO_SCTP */
+ int ci_mark;
+ int ci_new_rsb_count;
+ int ci_recover_callbacks;
+ char ci_cluster_name[DLM_LOCKSPACE_LEN];
+};
+
+/* The one global configuration instance, defined in config.c. */
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+/* Returns an array of members in *nodes_out and its length in *count_out. */
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out);
+/* Stores the comm's sequence number in *seq; -EEXIST if nodeid is unknown. */
+int dlm_comm_seq(int nodeid, uint32_t *seq);
+/* Returns 0 while no local comm has been configured. */
+int dlm_our_nodeid(void);
+/* num selects which local address to copy; returns -1 if unavailable. */
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif /* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 0000000000..d2c0353875
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,1052 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include "dlm_internal.h"
+#include "midcomms.h"
+#include "lock.h"
+#include "ast.h"
+
+/*
+ * Scratch buffer that waiters_read() formats its output into; all use of it
+ * happens with debug_buf_lock held.
+ */
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+/* The "dlm" debugfs directory and the "comms" directory inside it. */
+static struct dentry *dlm_root;
+static struct dentry *dlm_comms;
+
+/* Map a DLM lock mode to its two-character name; "??" for anything else. */
+static char *print_lockmode(int mode)
+{
+	char *name = "??";
+
+	switch (mode) {
+	case DLM_LOCK_IV:
+		name = "--";
+		break;
+	case DLM_LOCK_NL:
+		name = "NL";
+		break;
+	case DLM_LOCK_CR:
+		name = "CR";
+		break;
+	case DLM_LOCK_CW:
+		name = "CW";
+		break;
+	case DLM_LOCK_PR:
+		name = "PR";
+		break;
+	case DLM_LOCK_PW:
+		name = "PW";
+		break;
+	case DLM_LOCK_EX:
+		name = "EX";
+		break;
+	default:
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Format 1: print one line describing @lkb -- id, granted mode, requested
+ * mode when converting/waiting, remote or master lock id, and wait type.
+ */
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *res)
+{
+	int queued = (lkb->lkb_status == DLM_LKSTS_CONVERT ||
+		      lkb->lkb_status == DLM_LKSTS_WAITING);
+
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	/* a converting or waiting lock also shows the mode it asked for */
+	if (queued)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_nodeid && lkb->lkb_nodeid != res->res_nodeid)
+		seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+			   lkb->lkb_remid);
+	else if (lkb->lkb_nodeid)
+		seq_printf(s, " Master: %08x", lkb->lkb_remid);
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_putc(s, '\n');
+}
+
+/*
+ * Format 1 (human readable): dump one rsb -- name, master state, LVB,
+ * recovery state and the locks on each of its queues -- with the rsb locked.
+ * Printing stops early once the seq_file buffer has overflowed.
+ */
+static void print_format1(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct list_head *queues[] = {
+		&res->res_grantqueue,
+		&res->res_convertqueue,
+		&res->res_waitqueue,
+	};
+	char *titles[] = {
+		"Granted Queue\n",
+		"Conversion Queue\n",
+		"Waiting Queue\n",
+	};
+	int lvblen = res->res_ls->ls_lvblen;
+	int on_root_list, on_recover_list;
+	struct dlm_lkb *lkb;
+	int i, q;
+
+	lock_rsb(res);
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+
+	/* unprintable name bytes are shown as '.' */
+	for (i = 0; i < res->res_length; i++)
+		seq_printf(s, "%c", isprint(res->res_name[i]) ?
+			   res->res_name[i] : '.');
+
+	if (res->res_nodeid == 0)
+		seq_puts(s, "\"\nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\"\nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else if (res->res_nodeid > 0)
+		seq_printf(s, "\"\nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else
+		seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid);
+	if (seq_has_overflowed(s))
+		goto out;
+
+	/* Print the LVB, split over two lines at its midpoint */
+	if (res->res_lvbptr) {
+		seq_puts(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_puts(s, "\n ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_puts(s, " (INVALID)");
+		seq_putc(s, '\n');
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	on_root_list = !list_empty(&res->res_root_list);
+	on_recover_list = !list_empty(&res->res_recover_list);
+
+	if (on_root_list || on_recover_list) {
+		seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n",
+			   on_root_list, on_recover_list,
+			   res->res_flags, res->res_recover_locks_count);
+	}
+
+	/* Print the locks attached to this resource, one queue at a time */
+	for (q = 0; q < 3; q++) {
+		seq_puts(s, titles[q]);
+		list_for_each_entry(lkb, queues[q], lkb_statequeue) {
+			print_format1_lock(s, lkb, res);
+			if (seq_has_overflowed(s))
+				goto out;
+		}
+	}
+
+	/* the lookup queue is only mentioned when it has entries */
+	if (!list_empty(&res->res_lookup))
+		seq_puts(s, "Lookup Queue\n");
+
+	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+		seq_printf(s, "%08x %s",
+			   lkb->lkb_id, print_lockmode(lkb->lkb_rqmode));
+		if (lkb->lkb_wait_type)
+			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+		seq_putc(s, '\n');
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+ out:
+	unlock_rsb(res);
+}
+
+/*
+ * Format 2: print one lkb as a single line of fields, followed by the
+ * nodeid, name length and name of the rsb it belongs to.
+ */
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *r)
+{
+	u64 queued_us;
+	u64 xid = 0;
+
+	/* only user locks that have user args carry an xid */
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags) && lkb->lkb_ua)
+		xid = lkb->lkb_ua->xid;
+
+	/* microseconds since lkb was added to current queue */
+	queued_us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
+
+	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
+	   r_nodeid r_len r_name */
+
+	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   dlm_iflags_val(lkb),
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   (unsigned long long)queued_us,
+		   r->res_nodeid,
+		   r->res_length,
+		   r->res_name);
+}
+
+/*
+ * Format 2: print every lock on the grant, convert and wait queues of @r,
+ * in that order, with the rsb locked. Stops once the buffer has overflowed.
+ */
+static void print_format2(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct list_head *queues[] = {
+		&r->res_grantqueue,
+		&r->res_convertqueue,
+		&r->res_waitqueue,
+	};
+	struct dlm_lkb *lkb;
+	int q;
+
+	lock_rsb(r);
+
+	for (q = 0; q < 3; q++) {
+		list_for_each_entry(lkb, queues[q], lkb_statequeue) {
+			print_format2_lock(s, lkb, r);
+			if (seq_has_overflowed(s))
+				goto out;
+		}
+	}
+ out:
+	unlock_rsb(r);
+}
+
+/*
+ * Format 3: print one "lkb" record. @rsb_lookup is printed as a field and
+ * is 1 when the caller found the lkb on the rsb's lookup list, 0 otherwise.
+ */
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       int rsb_lookup)
+{
+	u64 xid = 0;
+
+	/* only user locks that have user args carry an xid */
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags) && lkb->lkb_ua)
+		xid = lkb->lkb_ua->xid;
+
+	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   dlm_iflags_val(lkb),
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   lkb->lkb_last_bast_mode,
+		   rsb_lookup,
+		   lkb->lkb_wait_type,
+		   lkb->lkb_lvbseq,
+		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+		   (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
+}
+
+/*
+ * Format 3: print an "rsb" record (name as text when every byte is printable
+ * ASCII, as hex otherwise), an optional "lvb" record, then an "lkb" record
+ * for every lock on the grant, convert, wait and lookup lists. Runs with the
+ * rsb locked and stops once the seq_file buffer has overflowed.
+ */
+static void print_format3(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct list_head *queues[] = {
+		&r->res_grantqueue,
+		&r->res_convertqueue,
+		&r->res_waitqueue,
+	};
+	int lvblen = r->res_ls->ls_lvblen;
+	struct dlm_lkb *lkb;
+	int name_is_text = 1;
+	int i, q;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_first_lkid,
+		   r->res_flags,
+		   !list_empty(&r->res_root_list),
+		   !list_empty(&r->res_recover_list),
+		   r->res_recover_locks_count,
+		   r->res_length);
+	if (seq_has_overflowed(s))
+		goto out;
+
+	/* a single non-printable byte switches the whole name to hex */
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) {
+			name_is_text = 0;
+			break;
+		}
+	}
+
+	seq_puts(s, name_is_text ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (name_is_text)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_putc(s, '\n');
+	if (seq_has_overflowed(s))
+		goto out;
+
+	if (r->res_lvbptr) {
+		seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
+
+		for (i = 0; i < lvblen; i++)
+			seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+		seq_putc(s, '\n');
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+
+	for (q = 0; q < 3; q++) {
+		list_for_each_entry(lkb, queues[q], lkb_statequeue) {
+			print_format3_lock(s, lkb, 0);
+			if (seq_has_overflowed(s))
+				goto out;
+		}
+	}
+
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
+		print_format3_lock(s, lkb, 1);
+		if (seq_has_overflowed(s))
+			goto out;
+	}
+ out:
+	unlock_rsb(r);
+}
+
+/*
+ * Format 4: print one "rsb" record with the rsb's nodeid, master and
+ * directory nodeids, our own nodeid, toss time, flags and name (as text when
+ * every byte is printable ASCII, as hex otherwise). Runs with the rsb locked.
+ */
+static void print_format4(struct dlm_rsb *r, struct seq_file *s)
+{
+	int local_nodeid = dlm_our_nodeid();
+	int name_is_text = 1;
+	int i;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_master_nodeid,
+		   r->res_dir_nodeid,
+		   local_nodeid,
+		   r->res_toss_time,
+		   r->res_flags,
+		   r->res_length);
+
+	/* a single non-printable byte switches the whole name to hex */
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) {
+			name_is_text = 0;
+			break;
+		}
+	}
+
+	seq_puts(s, name_is_text ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (name_is_text)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_putc(s, '\n');
+	unlock_rsb(r);
+}
+
+/*
+ * Format 5: print one line per callback queued on @lkb, walking
+ * lkb_callbacks with lkb_cb_lock held.
+ */
+static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb)
+{
+	struct dlm_callback *queued;
+
+	spin_lock(&lkb->lkb_cb_lock);
+
+	/* lkb_id lkb_flags mode flags sb_status sb_flags */
+	list_for_each_entry(queued, &lkb->lkb_callbacks, list)
+		seq_printf(s, "%x %x %d %x %d %x\n",
+			   lkb->lkb_id,
+			   dlm_iflags_val(lkb),
+			   queued->mode,
+			   queued->flags,
+			   queued->sb_status,
+			   queued->sb_flags);
+
+	spin_unlock(&lkb->lkb_cb_lock);
+}
+
+/*
+ * Format 5: print the queued callbacks of every lock on the grant, convert
+ * and wait queues of @r, in that order, with the rsb locked. Stops once the
+ * buffer has overflowed.
+ */
+static void print_format5(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct list_head *queues[] = {
+		&r->res_grantqueue,
+		&r->res_convertqueue,
+		&r->res_waitqueue,
+	};
+	struct dlm_lkb *lkb;
+	int q;
+
+	lock_rsb(r);
+
+	for (q = 0; q < 3; q++) {
+		list_for_each_entry(lkb, queues[q], lkb_statequeue) {
+			print_format5_lock(s, lkb);
+			if (seq_has_overflowed(s))
+				goto out;
+		}
+	}
+ out:
+	unlock_rsb(r);
+}
+
+/*
+ * Iterator state handed between the seq_file start/next/show/stop callbacks.
+ * Allocated in table_seq_start(); freed in table_seq_stop(), or by
+ * start/next themselves when they run off the end of the table.
+ */
+struct rsbtbl_iter {
+ struct dlm_rsb *rsb; /* current rsb, held with dlm_hold_rsb() */
+ unsigned bucket; /* hash bucket the current rsb was found in */
+ int format; /* 1-5, selects the print_formatN routine */
+ int header; /* nonzero until the format's header line is printed */
+};
+
+/*
+ * If the buffer is full, seq_printf can be called again, but it
+ * does nothing. So, these printing routines periodically check
+ * seq_has_overflowed to avoid wasting too much time trying to print to
+ * a full buffer.
+ */
+
+/*
+ * seq_file ->show: print the iterator's current rsb in the iterator's
+ * format. Formats 2-5 emit a one-line header first while ri->header is set;
+ * format 1 has no header.
+ */
+static int table_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+	struct rsbtbl_iter *ri = iter_ptr;
+	const char *header = NULL;
+
+	switch (ri->format) {
+	case 2:
+		header = "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n";
+		break;
+	case 3:
+		header = "version rsb 1.1 lvb 1.1 lkb 1.1\n";
+		break;
+	case 4:
+		header = "version 4 rsb 2\n";
+		break;
+	case 5:
+		header = "lkb_id lkb_flags mode flags sb_status sb_flags\n";
+		break;
+	}
+
+	if (header && ri->header) {
+		seq_puts(seq, header);
+		ri->header = 0;
+	}
+
+	switch (ri->format) {
+	case 1:
+		print_format1(ri->rsb, seq);
+		break;
+	case 2:
+		print_format2(ri->rsb, seq);
+		break;
+	case 3:
+		print_format3(ri->rsb, seq);
+		break;
+	case 4:
+		print_format4(ri->rsb, seq);
+		break;
+	case 5:
+		print_format5(ri->rsb, seq);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Forward declarations: table_seq_start() and table_seq_next() compare
+ * seq->op against the addresses of these tables, which are defined after
+ * the callbacks themselves.
+ */
+static const struct seq_operations format1_seq_ops;
+static const struct seq_operations format2_seq_ops;
+static const struct seq_operations format3_seq_ops;
+static const struct seq_operations format4_seq_ops;
+static const struct seq_operations format5_seq_ops;
+
+/*
+ * seq_file ->start: build an iterator positioned at *pos.
+ *
+ * *pos packs the hash bucket number in its upper 32 bits and the index of
+ * the rsb within that bucket's tree in its lower 32 bits. The "toss" tree of
+ * each bucket is walked when the file uses format4_seq_ops, the "keep" tree
+ * otherwise.
+ *
+ * Returns a newly allocated rsbtbl_iter whose rsb has been held with
+ * dlm_hold_rsb(), or NULL when the bucket is past the end of the table, the
+ * allocation fails, or no rsb is left at or after *pos (the iterator is
+ * freed in that last case).
+ */
+static void *table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct rb_root *tree;
+ struct rb_node *node;
+ struct dlm_ls *ls = seq->private;
+ struct rsbtbl_iter *ri;
+ struct dlm_rsb *r;
+ loff_t n = *pos;
+ unsigned bucket, entry;
+ int toss = (seq->op == &format4_seq_ops);
+
+ bucket = n >> 32;
+ entry = n & ((1LL << 32) - 1);
+
+ if (bucket >= ls->ls_rsbtbl_size)
+ return NULL;
+
+ ri = kzalloc(sizeof(*ri), GFP_NOFS);
+ if (!ri)
+ return NULL;
+ /* table_seq_show() prints the header only while this flag is set */
+ if (n == 0)
+ ri->header = 1;
+ /* the format is identified by which seq_operations table is in use */
+ if (seq->op == &format1_seq_ops)
+ ri->format = 1;
+ if (seq->op == &format2_seq_ops)
+ ri->format = 2;
+ if (seq->op == &format3_seq_ops)
+ ri->format = 3;
+ if (seq->op == &format4_seq_ops)
+ ri->format = 4;
+ if (seq->op == &format5_seq_ops)
+ ri->format = 5;
+
+ tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+ /* count down to the entry'th node of this bucket under the bucket lock */
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ if (!RB_EMPTY_ROOT(tree)) {
+ for (node = rb_first(tree); node; node = rb_next(node)) {
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
+ if (!entry--) {
+ /* take the hold before the bucket lock is dropped */
+ dlm_hold_rsb(r);
+ ri->rsb = r;
+ ri->bucket = bucket;
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return ri;
+ }
+ }
+ }
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+
+ /*
+ * move to the first rsb in the next non-empty bucket
+ */
+
+ /* zero the entry */
+ n &= ~((1LL << 32) - 1);
+
+ while (1) {
+ bucket++;
+ n += 1LL << 32;
+
+ /* ran off the end of the table: nothing left to show */
+ if (bucket >= ls->ls_rsbtbl_size) {
+ kfree(ri);
+ return NULL;
+ }
+ tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ if (!RB_EMPTY_ROOT(tree)) {
+ node = rb_first(tree);
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
+ dlm_hold_rsb(r);
+ ri->rsb = r;
+ ri->bucket = bucket;
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ /* report the position actually used back to seq_file */
+ *pos = n;
+ return ri;
+ }
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ }
+}
+
+/*
+ * seq_file ->next: advance the iterator to the rsb after the current one.
+ *
+ * First tries the tree successor of the current rsb within the same bucket
+ * (bucket taken from the upper 32 bits of *pos); failing that, takes the
+ * first rsb of the next non-empty bucket. The new rsb is held with
+ * dlm_hold_rsb() and the previous one released with dlm_put_rsb(). *pos is
+ * updated in every case.
+ *
+ * Returns the iterator, or NULL after freeing it when no rsb is left.
+ */
+static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
+{
+ struct dlm_ls *ls = seq->private;
+ struct rsbtbl_iter *ri = iter_ptr;
+ struct rb_root *tree;
+ struct rb_node *next;
+ struct dlm_rsb *r, *rp;
+ loff_t n = *pos;
+ unsigned bucket;
+ int toss = (seq->op == &format4_seq_ops);
+
+ bucket = n >> 32;
+
+ /*
+ * move to the next rsb in the same bucket
+ */
+
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ rp = ri->rsb;
+ next = rb_next(&rp->res_hashnode);
+
+ if (next) {
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
+ dlm_hold_rsb(r);
+ ri->rsb = r;
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ /* the old rsb is released only after the bucket lock is dropped */
+ dlm_put_rsb(rp);
+ ++*pos;
+ return ri;
+ }
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ dlm_put_rsb(rp);
+
+ /*
+ * move to the first rsb in the next non-empty bucket
+ */
+
+ /* zero the entry */
+ n &= ~((1LL << 32) - 1);
+
+ while (1) {
+ bucket++;
+ n += 1LL << 32;
+
+ /* ran off the end of the table: free the iterator here */
+ if (bucket >= ls->ls_rsbtbl_size) {
+ kfree(ri);
+ ++*pos;
+ return NULL;
+ }
+ tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
+
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ if (!RB_EMPTY_ROOT(tree)) {
+ next = rb_first(tree);
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
+ dlm_hold_rsb(r);
+ ri->rsb = r;
+ ri->bucket = bucket;
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ *pos = n;
+ return ri;
+ }
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ }
+}
+
+/*
+ * seq_file ->stop: release the rsb held by the iterator and free the
+ * iterator. Does nothing when called with a NULL iterator.
+ */
+static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+	struct rsbtbl_iter *ri = iter_ptr;
+
+	if (!ri)
+		return;
+
+	dlm_put_rsb(ri->rsb);
+	kfree(ri);
+}
+
+/*
+ * One seq_operations table per output format. All five point at the same
+ * callbacks; they are kept as separate objects because table_seq_start()
+ * and table_seq_next() compare seq->op against their addresses to tell
+ * which format (and which rsb tree) is wanted.
+ */
+static const struct seq_operations format1_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
+static const struct seq_operations format2_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
+static const struct seq_operations format3_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
+static const struct seq_operations format4_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
+static const struct seq_operations format5_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
+/* Forward declarations; the tables are defined after the open handlers. */
+static const struct file_operations format1_fops;
+static const struct file_operations format2_fops;
+static const struct file_operations format3_fops;
+static const struct file_operations format4_fops;
+static const struct file_operations format5_fops;
+
+/*
+ * Open handler for the format 1 file: start a seq_file using
+ * format1_seq_ops and hand it the lockspace stored in the inode.
+ */
+static int table_open1(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &format1_seq_ops);
+
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = inode->i_private; /* the dlm_ls */
+	}
+
+	return ret;
+}
+
+/*
+ * Open handler for the format 2 file: start a seq_file using
+ * format2_seq_ops and hand it the lockspace stored in the inode.
+ */
+static int table_open2(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &format2_seq_ops);
+
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = inode->i_private; /* the dlm_ls */
+	}
+
+	return ret;
+}
+
+/*
+ * Write handler for the format 2 file. Parses
+ * "<lkb_id hex> <resource name> <lkb_flags hex> <nodeid> <status>" from the
+ * user's buffer (at most sizeof(buf) - 1 bytes are looked at) and passes the
+ * fields to dlm_debug_add_lkb().
+ *
+ * Returns @count on success, -EFAULT if the copy from user space fails,
+ * -EINVAL unless all five fields parse, or the dlm_debug_add_lkb() error.
+ */
+static ssize_t table_write2(struct file *file, const char __user *user_buf,
+			    size_t count, loff_t *ppos)
+{
+	struct seq_file *seq = file->private_data;
+	struct dlm_ls *ls = seq->private;
+	char name[DLM_RESNAME_MAXLEN + 1] = {};
+	int lkb_nodeid, lkb_status;
+	unsigned int lkb_flags;
+	char buf[256] = {};
+	uint32_t lkb_id;
+	int fields, len, rv;
+	size_t copy_len;
+
+	/* the last byte of buf is never written, so buf stays terminated */
+	copy_len = min_t(size_t, sizeof(buf) - 1, count);
+	if (copy_from_user(buf, user_buf, copy_len))
+		return -EFAULT;
+
+	fields = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d",
+			&lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status);
+	if (fields != 5)
+		return -EINVAL;
+
+	len = strnlen(name, DLM_RESNAME_MAXLEN);
+	rv = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags,
+			       lkb_nodeid, lkb_status);
+	if (rv)
+		return rv;
+
+	return count;
+}
+
+/*
+ * Open handler for the format 3 file: start a seq_file using
+ * format3_seq_ops and hand it the lockspace stored in the inode.
+ */
+static int table_open3(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &format3_seq_ops);
+
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = inode->i_private; /* the dlm_ls */
+	}
+
+	return ret;
+}
+
+/*
+ * Open handler for the format 4 file: start a seq_file using
+ * format4_seq_ops and hand it the lockspace stored in the inode.
+ */
+static int table_open4(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &format4_seq_ops);
+
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = inode->i_private; /* the dlm_ls */
+	}
+
+	return ret;
+}
+
+/*
+ * Open handler for the format 5 file: start a seq_file using
+ * format5_seq_ops and hand it the lockspace stored in the inode.
+ */
+static int table_open5(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &format5_seq_ops);
+
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = inode->i_private; /* the dlm_ls */
+	}
+
+	return ret;
+}
+
+/*
+ * File operations for the five per-lockspace table files. All of them read
+ * through seq_file; only format 2 also accepts writes (table_write2).
+ */
+static const struct file_operations format1_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open1,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static const struct file_operations format2_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open2,
+ .read = seq_read,
+ .write = table_write2,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static const struct file_operations format3_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open3,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static const struct file_operations format4_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open4,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static const struct file_operations format5_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open5,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ *
+ * Each read reformats the whole list into the shared debug_buf (one
+ * "id wait_type nodeid resource-name" line per lkb, stopping at the first
+ * line that does not fit) and then copies the part selected by *ppos to
+ * user space. debug_buf_lock is held across both steps, ls_waiters_mutex
+ * only while the list is walked.
+ */
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	size_t used = 0, room, written;
+	struct dlm_lkb *lkb;
+	ssize_t rv;
+
+	mutex_lock(&debug_buf_lock);
+	mutex_lock(&ls->ls_waiters_mutex);
+	memset(debug_buf, 0, sizeof(debug_buf));
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		room = DLM_DEBUG_BUF_LEN - used;
+		written = snprintf(debug_buf + used, room, "%x %d %d %s\n",
+				   lkb->lkb_id, lkb->lkb_wait_type,
+				   lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+		/* a truncated line is not counted, so it is never returned */
+		if (written >= room)
+			break;
+		used += written;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, used);
+	mutex_unlock(&debug_buf_lock);
+	return rv;
+}
+
+/*
+ * Write handler for the waiters file. Parses "<lkb_id hex> <mstype>
+ * <to_nodeid>" from the user's buffer (at most sizeof(buf) - 1 bytes are
+ * looked at) and passes the fields to dlm_debug_add_lkb_to_waiters().
+ *
+ * Returns @count on success, -EFAULT if the copy from user space fails,
+ * -EINVAL unless all three fields parse, or the error from
+ * dlm_debug_add_lkb_to_waiters().
+ */
+static ssize_t waiters_write(struct file *file, const char __user *user_buf,
+			     size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	int mstype, to_nodeid;
+	char buf[128] = {};
+	uint32_t lkb_id;
+	size_t copy_len;
+	int fields, rv;
+
+	/* the last byte of buf is never written, so buf stays terminated */
+	copy_len = min_t(size_t, sizeof(buf) - 1, count);
+	if (copy_from_user(buf, user_buf, copy_len))
+		return -EFAULT;
+
+	fields = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid);
+	if (fields != 3)
+		return -EINVAL;
+
+	rv = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid);
+	if (rv)
+		return rv;
+
+	return count;
+}
+
+/*
+ * File operations for the "<lockspace>_waiters" file. simple_open is used
+ * instead of seq_file; waiters_read/waiters_write take the lockspace from
+ * file->private_data.
+ */
+static const struct file_operations waiters_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = waiters_read,
+ .write = waiters_write,
+ .llseek = default_llseek,
+};
+
+/* Remove the six per-lockspace debugfs files recorded in @ls. */
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	struct dentry *files[] = {
+		ls->ls_debug_rsb_dentry,
+		ls->ls_debug_waiters_dentry,
+		ls->ls_debug_locks_dentry,
+		ls->ls_debug_all_dentry,
+		ls->ls_debug_toss_dentry,
+		ls->ls_debug_queued_asts_dentry,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++)
+		debugfs_remove(files[i]);
+}
+
+/* Show the string dlm_midcomms_state() reports for this comms node. */
+static int dlm_state_show(struct seq_file *file, void *offset)
+{
+	void *node = file->private;
+
+	seq_printf(file, "%s\n", dlm_midcomms_state(node));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_state);
+
+/* Show the value of dlm_midcomms_flags() for this comms node, in decimal. */
+static int dlm_flags_show(struct seq_file *file, void *offset)
+{
+	void *node = file->private;
+
+	seq_printf(file, "%lu\n", dlm_midcomms_flags(node));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_flags);
+
+/* Show the value of dlm_midcomms_send_queue_cnt() for this comms node. */
+static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset)
+{
+	void *node = file->private;
+
+	seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(node));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt);
+
+/* Show the value of dlm_midcomms_version() for this comms node, in hex. */
+static int dlm_version_show(struct seq_file *file, void *offset)
+{
+	void *node = file->private;
+
+	seq_printf(file, "0x%08x\n", dlm_midcomms_version(node));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_version);
+
+/*
+ * Write handler for the per-node "rawmsg" file: copy the user's buffer into
+ * a temporary kernel buffer and pass it to dlm_midcomms_rawmsg_send().
+ *
+ * Returns @count on success, -EINVAL when @count is smaller than a
+ * struct dlm_header or larger than PAGE_SIZE, -ENOMEM, -EFAULT, or the
+ * error from dlm_midcomms_rawmsg_send().
+ */
+static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	void *msg;
+	int ret;
+
+	if (count < sizeof(struct dlm_header) || count > PAGE_SIZE)
+		return -EINVAL;
+
+	msg = kmalloc(PAGE_SIZE, GFP_NOFS);
+	if (!msg)
+		return -ENOMEM;
+
+	if (copy_from_user(msg, user_buf, count))
+		ret = -EFAULT;
+	else
+		ret = dlm_midcomms_rawmsg_send(fp->private_data, msg, count);
+
+	/* the temporary buffer is released on success and failure alike */
+	kfree(msg);
+
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/* File operations for the write-only, non-seekable per-node "rawmsg" file. */
+static const struct file_operations dlm_rawmsg_fops = {
+ .open = simple_open,
+ .write = dlm_rawmsg_write,
+ .llseek = no_llseek,
+};
+
+/*
+ * Create the debugfs directory "<nodeid>" under the comms directory and
+ * populate it with the state, flags, send_queue_count, version and rawmsg
+ * files, each of which gets @data as its private data.
+ *
+ * Returns the directory's dentry as an opaque pointer, to be passed to
+ * dlm_delete_debug_comms_file() later.
+ */
+void *dlm_create_debug_comms_file(int nodeid, void *data)
+{
+	char name[256] = {};
+	struct dentry *dir;
+
+	snprintf(name, sizeof(name), "%d", nodeid);
+	dir = debugfs_create_dir(name, dlm_comms);
+
+	debugfs_create_file("state", 0444, dir, data, &dlm_state_fops);
+	debugfs_create_file("flags", 0444, dir, data, &dlm_flags_fops);
+	debugfs_create_file("send_queue_count", 0444, dir, data,
+			    &dlm_send_queue_cnt_fops);
+	debugfs_create_file("version", 0444, dir, data, &dlm_version_fops);
+	debugfs_create_file("rawmsg", 0200, dir, data, &dlm_rawmsg_fops);
+
+	return dir;
+}
+
+/*
+ * Remove a per-node comms directory; @ctx is the pointer returned by
+ * dlm_create_debug_comms_file().
+ */
+void dlm_delete_debug_comms_file(void *ctx)
+{
+	struct dentry *dir = ctx;
+
+	debugfs_remove(dir);
+}
+
+/*
+ * Create the per-lockspace debugfs files under the "dlm" directory and
+ * record their dentries in @ls:
+ *
+ *   <name>              format 1, read-only
+ *   <name>_locks        format 2, read/write (table_write2)
+ *   <name>_all          format 3, read-only
+ *   <name>_toss         format 4, read-only
+ *   <name>_waiters      read/write (waiters_fops)
+ *   <name>_queued_asts  format 5, read-only
+ *
+ * The queued_asts file is created read-only because format5_fops has no
+ * write handler; a writable mode would advertise something it cannot do.
+ */
+void dlm_create_debug_file(struct dlm_ls *ls)
+{
+	/* Reserve enough space for the longest file name */
+	char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")];
+
+	/* format 1 */
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &format1_fops);
+
+	/* format 2 */
+
+	/* snprintf always terminates name, so it needs no clearing first */
+	snprintf(name, sizeof(name), "%s_locks", ls->ls_name);
+
+	ls->ls_debug_locks_dentry = debugfs_create_file(name,
+							0644,
+							dlm_root,
+							ls,
+							&format2_fops);
+
+	/* format 3 */
+
+	snprintf(name, sizeof(name), "%s_all", ls->ls_name);
+
+	ls->ls_debug_all_dentry = debugfs_create_file(name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &format3_fops);
+
+	/* format 4 */
+
+	snprintf(name, sizeof(name), "%s_toss", ls->ls_name);
+
+	ls->ls_debug_toss_dentry = debugfs_create_file(name,
+						       S_IFREG | S_IRUGO,
+						       dlm_root,
+						       ls,
+						       &format4_fops);
+
+	/* waiters */
+
+	snprintf(name, sizeof(name), "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  0644,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+
+	/* format 5 */
+
+	snprintf(name, sizeof(name), "%s_queued_asts", ls->ls_name);
+
+	ls->ls_debug_queued_asts_dentry = debugfs_create_file(name,
+							      S_IFREG | S_IRUGO,
+							      dlm_root,
+							      ls,
+							      &format5_fops);
+}
+
+/*
+ * Initialise debug_buf_lock and create the "dlm" debugfs directory (with a
+ * NULL parent) plus the "comms" directory inside it.
+ */
+void __init dlm_register_debugfs(void)
+{
+ mutex_init(&debug_buf_lock);
+ dlm_root = debugfs_create_dir("dlm", NULL);
+ dlm_comms = debugfs_create_dir("comms", dlm_root);
+}
+
+/* Remove the "dlm" debugfs directory created by dlm_register_debugfs(). */
+void dlm_unregister_debugfs(void)
+{
+ debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 0000000000..f6acba4310
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to ls_total_weight-1), we apply a modulus
+ * of ls_total_weight to the hash value. This value in the desired range is
+ * used as an offset into ls_node_array, the sorted list of nodeid's, to give
+ * the particular nodeid.
+ */
+
+/*
+ * Return the directory nodeid for a resource name hash. A lockspace with a
+ * single node always answers with our own nodeid.
+ */
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+	uint32_t idx;
+
+	if (ls->ls_num_nodes == 1)
+		return dlm_our_nodeid();
+
+	idx = (hash >> 16) % ls->ls_total_weight;
+
+	return ls->ls_node_array[idx];
+}
+
+/* Return the directory nodeid recorded in the rsb. */
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+	int dir_nodeid = r->res_dir_nodeid;
+
+	return dir_nodeid;
+}
+
+/*
+ * Recompute res_dir_nodeid from res_hash for every rsb on ls_root_list,
+ * holding ls_root_sem for read while the list is walked.
+ */
+void dlm_recover_dir_nodeid(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb;
+
+	down_read(&ls->ls_root_sem);
+
+	list_for_each_entry(rsb, &ls->ls_root_list, res_root_list)
+		rsb->res_dir_nodeid = dlm_hash2nodeid(ls, rsb->res_hash);
+
+	up_read(&ls->ls_root_sem);
+}
+
+/*
+ * Rebuild the resource directory during recovery.
+ *
+ * Unless the lockspace runs without a directory, every other member is asked
+ * (dlm_rcom_names) for resource names, one reply buffer at a time, resuming
+ * each request after the last name received. Every name in a reply is passed
+ * to dlm_master_lookup() with DLM_LU_RECOVER_DIR and the replying member's
+ * nodeid.
+ *
+ * A reply buffer is a sequence of big-endian 16-bit name lengths each
+ * followed by that many name bytes; length 0 ends the buffer and length
+ * 0xFFFF ends the names for that member.
+ *
+ * Returns 0 on success (DLM_RS_DIR is then set in the recover status),
+ * -ENOMEM if the name buffer cannot be allocated, -EINTR if recovery was
+ * stopped, -EINVAL on a malformed reply, or the error from dlm_rcom_names()
+ * or dlm_master_lookup().
+ */
+int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq)
+{
+	struct dlm_member *memb;
+	char *b, *last_name = NULL;
+	int error = -ENOMEM, last_len, nodeid, result;
+	uint16_t namelen;
+	unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
+
+	log_rinfo(ls, "dlm_recover_directory");
+
+	if (dlm_no_directory(ls))
+		goto out_status;
+
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
+	if (!last_name)
+		goto out;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->nodeid == dlm_our_nodeid())
+			continue;
+
+		memset(last_name, 0, DLM_RESNAME_MAXLEN);
+		last_len = 0;
+
+		for (;;) {
+			int left;
+			if (dlm_recovery_stopped(ls)) {
+				error = -EINTR;
+				goto out_free;
+			}
+
+			error = dlm_rcom_names(ls, memb->nodeid,
+					       last_name, last_len, seq);
+			if (error)
+				goto out_free;
+
+			cond_resched();
+
+			/*
+			 * pick namelen/name pairs out of received buffer
+			 */
+
+			b = ls->ls_recover_buf->rc_buf;
+			left = le16_to_cpu(ls->ls_recover_buf->rc_header.h_length);
+			left -= sizeof(struct dlm_rcom);
+
+			for (;;) {
+				__be16 v;
+
+				error = -EINVAL;
+				/*
+				 * Compare as signed: left goes negative when
+				 * h_length is shorter than the rcom header,
+				 * and an unsigned comparison against sizeof
+				 * would let that through.
+				 */
+				if (left < (int)sizeof(__be16))
+					goto out_free;
+
+				memcpy(&v, b, sizeof(__be16));
+				namelen = be16_to_cpu(v);
+				b += sizeof(__be16);
+				left -= sizeof(__be16);
+
+				/* namelen of 0xFFFF marks end of names for
+				   this node; namelen of 0 marks end of the
+				   buffer */
+
+				if (namelen == 0xFFFF)
+					goto done;
+				if (!namelen)
+					break;
+
+				if (namelen > left)
+					goto out_free;
+
+				if (namelen > DLM_RESNAME_MAXLEN)
+					goto out_free;
+
+				error = dlm_master_lookup(ls, memb->nodeid,
+							  b, namelen,
+							  DLM_LU_RECOVER_DIR,
+							  &nodeid, &result);
+				if (error) {
+					log_error(ls, "recover_dir lookup %d",
+						  error);
+					goto out_free;
+				}
+
+				/* The name was found in rsbtbl, but the
+				 * master nodeid is different from
+				 * memb->nodeid which says it is the master.
+				 * This should not happen. */
+
+				if (result == DLM_LU_MATCH &&
+				    nodeid != memb->nodeid) {
+					count_bad++;
+					log_error(ls, "recover_dir lookup %d "
+						  "nodeid %d memb %d bad %u",
+						  result, nodeid, memb->nodeid,
+						  count_bad);
+					print_hex_dump_bytes("dlm_recover_dir ",
+							     DUMP_PREFIX_NONE,
+							     b, namelen);
+				}
+
+				/* The name was found in rsbtbl, and the
+				 * master nodeid matches memb->nodeid. */
+
+				if (result == DLM_LU_MATCH &&
+				    nodeid == memb->nodeid) {
+					count_match++;
+				}
+
+				/* The name was not found in rsbtbl and was
+				 * added with memb->nodeid as the master. */
+
+				if (result == DLM_LU_ADD) {
+					count_add++;
+				}
+
+				/* remember where to resume the next request */
+				last_len = namelen;
+				memcpy(last_name, b, namelen);
+				b += namelen;
+				left -= namelen;
+				count++;
+			}
+		}
+	 done:
+		;
+	}
+
+ out_status:
+	error = 0;
+	dlm_set_recover_status(ls, DLM_RS_DIR);
+
+	log_rinfo(ls, "dlm_recover_directory %u in %u new",
+		  count, count_add);
+ out_free:
+	kfree(last_name);
+ out:
+	return error;
+}
+
+/*
+ * Find the rsb called @name (@len bytes): first in the keep and toss trees
+ * of its hash bucket, then, failing that, by a linear search of
+ * ls_root_list.
+ *
+ * The caller must hold ls->ls_root_sem, as the only caller,
+ * dlm_copy_master_names(), does. The semaphore is deliberately not taken
+ * again here: a nested down_read() can deadlock if a writer queues up
+ * between the two read acquisitions.
+ *
+ * Returns the rsb, or NULL if the name is not found in either place.
+ */
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name,
+				     int len)
+{
+	struct dlm_rsb *r;
+	uint32_t hash, bucket;
+	int rv;
+
+	hash = jhash(name, len, 0);
+	bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
+	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r);
+	if (rv)
+		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss,
+					 name, len, &r);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+
+	if (!rv)
+		return r;
+
+	/* not in the hash table: fall back to the root list */
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (len == r->res_length && !memcmp(name, r->res_name, len)) {
+			log_debug(ls, "find_rsb_root revert to root_list %s",
+				  r->res_name);
+			return r;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Find the rsb where we left off (or start again), then send rsb names
+ * for rsb's we're master of and whose directory node matches the requesting
+ * node. inbuf is the rsb name last sent, inlen is the name's length.
+ *
+ * Names are written to outbuf as a big-endian 16-bit length followed by the
+ * name bytes. A length of 0 is written when the next name would not fit,
+ * and a length of 0xFFFF when the end of the root list has been reached.
+ * ls_root_sem is held for read throughout.
+ */
+void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen,
+			   char *outbuf, int outlen, int nodeid)
+{
+	struct list_head *pos;
+	struct dlm_rsb *r;
+	__be16 be_namelen;
+	int offset = 0;
+
+	down_read(&ls->ls_root_sem);
+
+	if (inlen > 1) {
+		/* resume after the rsb whose name was sent last */
+		r = find_rsb_root(ls, inbuf, inlen);
+		if (!r) {
+			log_error(ls, "copy_master_names from %d start %d %.*s",
+				  nodeid, inlen, inlen, inbuf);
+			goto out;
+		}
+		pos = r->res_root_list.next;
+	} else {
+		pos = ls->ls_root_list.next;
+	}
+
+	for (; pos != &ls->ls_root_list; pos = pos->next) {
+		r = list_entry(pos, struct dlm_rsb, res_root_list);
+
+		/* skip rsbs with a nonzero res_nodeid or another dir node */
+		if (r->res_nodeid || dlm_dir_nodeid(r) != nodeid)
+			continue;
+
+		/*
+		 * The block ends when we can't fit the following in the
+		 * remaining buffer space:
+		 * namelen (uint16_t) +
+		 * name (r->res_length) +
+		 * end-of-block record 0x0000 (uint16_t)
+		 */
+
+		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+			/* Write end-of-block record */
+			be_namelen = cpu_to_be16(0);
+			memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+			offset += sizeof(__be16);
+			ls->ls_recover_dir_sent_msg++;
+			goto out;
+		}
+
+		be_namelen = cpu_to_be16(r->res_length);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
+
+		memcpy(outbuf + offset, r->res_name, r->res_length);
+		offset += r->res_length;
+
+		ls->ls_recover_dir_sent_res++;
+	}
+
+	/*
+	 * If we've reached the end of the list (and there's room) write a
+	 * terminating record.
+	 */
+
+	if ((pos == &ls->ls_root_list) &&
+	    (offset + sizeof(uint16_t) <= outlen)) {
+		be_namelen = cpu_to_be16(0xFFFF);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
+		ls->ls_recover_dir_sent_msg++;
+	}
+ out:
+	up_read(&ls->ls_root_sem);
+}
+
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 0000000000..39ecb69d7e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_recover_dir_nodeid(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq);
+void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid);
+
+#endif /* __DIR_DOT_H__ */
+
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 0000000000..dfc444dad3
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,829 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+#include <linux/ratelimit.h>
+#include <linux/uaccess.h>
+
+#include <linux/dlm.h>
+#include "config.h"
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_rsbtable;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+struct dlm_msg;
+
+/*
+ * Logging helpers. Every message is prefixed with "dlm: " and terminated
+ * with a newline; the variants taking ls also print the lockspace name.
+ *
+ * log_print / log_print_ratelimited / log_error: always printed, KERN_ERR.
+ * log_rinfo: KERN_INFO when dlm_config.ci_log_info is set, otherwise
+ * KERN_DEBUG when dlm_config.ci_log_debug is set, otherwise silent.
+ * log_debug / log_limit: KERN_DEBUG, only when dlm_config.ci_log_debug is
+ * set; log_limit goes through printk_ratelimited().
+ */
+#define log_print(fmt, args...) \
+ printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_print_ratelimited(fmt, args...) \
+ printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args)
+#define log_error(ls, fmt, args...) \
+ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
+#define log_rinfo(ls, fmt, args...) \
+do { \
+ if (dlm_config.ci_log_info) \
+ printk(KERN_INFO "dlm: %s: " fmt "\n", \
+ (ls)->ls_name, ##args); \
+ else if (dlm_config.ci_log_debug) \
+ printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
+
+#define log_debug(ls, fmt, args...) \
+do { \
+ if (dlm_config.ci_log_debug) \
+ printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
+
+#define log_limit(ls, fmt, args...) \
+do { \
+ if (dlm_config.ci_log_debug) \
+ printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
+
+#define DLM_ASSERT(x, do) \
+{ \
+ if (!(x)) \
+ { \
+ printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
+ "DLM: assertion: \"%s\"\n" \
+ "DLM: time = %lu\n", \
+ __LINE__, __FILE__, #x, jiffies); \
+ {do} \
+ printk("\n"); \
+ panic("DLM: Record message above and reboot.\n"); \
+ } \
+}
+
+
+#define DLM_RTF_SHRINK_BIT 0
+
+/*
+ * One bucket of the lockspace rsb table (see ls_rsbtbl / ls_rsbtbl_size in
+ * struct dlm_ls). It holds two red-black trees and a spinlock.
+ * NOTE(review): lock presumably guards both trees; confirm against lock.c.
+ */
+struct dlm_rsbtable {
+ struct rb_root keep;
+ struct rb_root toss;
+ spinlock_t lock;
+ unsigned long flags; /* presumably DLM_RTF_ bit numbers - confirm */
+};
+
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+/* One instance describes a single node that is a member of a lockspace. */
+struct dlm_member {
+ struct list_head list; /* presumably on ls_nodes or ls_nodes_gone - confirm */
+ int nodeid;
+ int weight;
+ int slot;
+ int slot_prev;
+ int comm_seq;
+ uint32_t generation;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+/* Arguments for one recovery run; struct dlm_ls points at one via ls_recover_args. */
+struct dlm_recover {
+ struct list_head list;
+ struct dlm_config_node *nodes; /* array of nodes_count entries - TODO confirm */
+ int nodes_count;
+ uint64_t seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+/*
+ * Bundle of caller-supplied lock parameters. The callback signatures match
+ * lkb_astfn / lkb_bastfn in struct dlm_lkb.
+ */
+struct dlm_args {
+ uint32_t flags;
+ void (*astfn) (void *astparam);
+ void *astparam;
+ void (*bastfn) (void *astparam, int mode);
+ int mode;
+ struct dlm_lksb *lksb;
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy: lock is mastered locally
+ * (lkb_nodeid is zero and DLM_IFL_MSTCPY is not set)
+ * process copy: lock is mastered on a remote node
+ * (lkb_nodeid is non-zero and DLM_IFL_MSTCPY is not set)
+ * master copy: master node's copy of a lock owned by a remote node
+ * (lkb_nodeid is non-zero and DLM_IFL_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock. The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_iflags, lkb_dflags: internal dlm flags from dlm_internal.h, stored
+ * as bit numbers. Flags shared between the master and process nodes
+ * (DLM_DFL_ prefix) are kept in lkb_dflags and use the low bit numbers.
+ * One of these flags set on the master copy will be propagated to the
+ * process copy and v.v. Other internal flags are private to the master or
+ * process node (DLM_IFL_ prefix, e.g. DLM_IFL_MSTCPY). These are kept in
+ * lkb_iflags and use bit numbers 16 and above.
+ *
+ * lkb_sbflags: status block flags. These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast. All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed. Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING 1
+#define DLM_LKSTS_GRANTED 2
+#define DLM_LKSTS_CONVERT 3
+
+/* lkb_iflags */
+
+#define DLM_IFL_MSTCPY_BIT 16
+#define __DLM_IFL_MIN_BIT DLM_IFL_MSTCPY_BIT
+#define DLM_IFL_RESEND_BIT 17
+#define DLM_IFL_DEAD_BIT 18
+#define DLM_IFL_OVERLAP_UNLOCK_BIT 19
+#define DLM_IFL_OVERLAP_CANCEL_BIT 20
+#define DLM_IFL_ENDOFLIFE_BIT 21
+#define DLM_IFL_DEADLOCK_CANCEL_BIT 24
+#define DLM_IFL_CB_PENDING_BIT 25
+#define __DLM_IFL_MAX_BIT DLM_IFL_CB_PENDING_BIT
+
+/* lkb_dflags */
+
+#define DLM_DFL_USER_BIT 0
+#define __DLM_DFL_MIN_BIT DLM_DFL_USER_BIT
+#define DLM_DFL_ORPHAN_BIT 1
+#define __DLM_DFL_MAX_BIT DLM_DFL_ORPHAN_BIT
+
+#define DLM_CB_CAST 0x00000001
+#define DLM_CB_BAST 0x00000002
+
+/* One queued callback (completion or blocking) for a lock; reference counted. */
+struct dlm_callback {
+ uint32_t flags; /* DLM_CB_CAST or DLM_CB_BAST */
+ int sb_status; /* copy to lksb status */
+ uint8_t sb_flags; /* copy to lksb flags */
+ int8_t mode; /* rq mode of bast, gr mode of cast */
+
+ struct list_head list; /* presumably linked on lkb_callbacks - confirm */
+ struct kref ref;
+};
+
+/*
+ * Lock block: the dlm's record of one lock. The three flag words
+ * lkb_sbflags, lkb_dflags and lkb_iflags are unsigned long because they are
+ * accessed by bit number with test_bit()/set_bit()/clear_bit(), using the
+ * DLM_SBF_*_BIT, DLM_DFL_*_BIT and DLM_IFL_*_BIT constants respectively.
+ */
+struct dlm_lkb {
+ struct dlm_rsb *lkb_resource; /* the rsb */
+ struct kref lkb_ref;
+ int lkb_nodeid; /* copied from rsb */
+ int lkb_ownpid; /* pid of lock owner */
+ uint32_t lkb_id; /* our lock ID */
+ uint32_t lkb_remid; /* lock ID on remote partner */
+ uint32_t lkb_exflags; /* external flags from caller */
+ unsigned long lkb_sbflags; /* lksb flags */
+ unsigned long lkb_dflags; /* distributed flags */
+ unsigned long lkb_iflags; /* internal flags */
+ uint32_t lkb_lvbseq; /* lvb sequence number */
+
+ int8_t lkb_status; /* granted, waiting, convert */
+ int8_t lkb_rqmode; /* requested lock mode */
+ int8_t lkb_grmode; /* granted lock mode */
+ int8_t lkb_highbast; /* highest mode bast sent for */
+
+ int8_t lkb_wait_type; /* type of reply waiting for */
+ atomic_t lkb_wait_count;
+ int lkb_wait_nodeid; /* for debugging */
+
+ struct list_head lkb_statequeue; /* rsb g/c/w list */
+ struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
+ struct list_head lkb_wait_reply; /* waiting for remote reply */
+ struct list_head lkb_ownqueue; /* list of locks for a process */
+ ktime_t lkb_timestamp;
+
+ spinlock_t lkb_cb_lock;
+ struct work_struct lkb_cb_work;
+ struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */
+ struct list_head lkb_callbacks;
+ struct dlm_callback *lkb_last_cast;
+ struct dlm_callback *lkb_last_cb;
+ int lkb_last_bast_mode;
+ ktime_t lkb_last_cast_time; /* for debugging */
+ ktime_t lkb_last_bast_time; /* for debugging */
+
+ uint64_t lkb_recover_seq; /* from ls_recover_seq */
+
+ char *lkb_lvbptr;
+ struct dlm_lksb *lkb_lksb; /* caller's status block */
+ void (*lkb_astfn) (void *astparam);
+ void (*lkb_bastfn) (void *astparam, int mode);
+ /* the two members below share storage */
+ union {
+ void *lkb_astparam; /* caller's ast arg */
+ struct dlm_user_args *lkb_ua;
+ };
+};
+
+/*
+ * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real
+ * nodeid, even when nodeid is our_nodeid.
+ *
+ * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid,
+ * greater than zero when another nodeid.
+ *
+ * (TODO: remove res_nodeid and only use res_master_nodeid)
+ */
+
+/*
+ * Resource block: one named lock resource, with its grant, convert and wait
+ * queues of lkbs. res_flags holds enum rsb_flags bit numbers and is accessed
+ * through rsb_set_flag()/rsb_clear_flag()/rsb_flag().
+ */
+struct dlm_rsb {
+ struct dlm_ls *res_ls; /* the lockspace */
+ struct kref res_ref;
+ struct mutex res_mutex;
+ unsigned long res_flags;
+ int res_length; /* length of rsb name */
+ int res_nodeid;
+ int res_master_nodeid;
+ int res_dir_nodeid;
+ int res_id; /* for ls_recover_idr */
+ uint32_t res_lvbseq;
+ uint32_t res_hash;
+ uint32_t res_bucket; /* rsbtbl */
+ unsigned long res_toss_time;
+ uint32_t res_first_lkid;
+ struct list_head res_lookup; /* lkbs waiting on first */
+ /* the two members below share storage */
+ union {
+ struct list_head res_hashchain;
+ struct rb_node res_hashnode; /* rsbtbl */
+ };
+ struct list_head res_grantqueue;
+ struct list_head res_convertqueue;
+ struct list_head res_waitqueue;
+
+ struct list_head res_root_list; /* used for recovery */
+ struct list_head res_recover_list; /* used for recovery */
+ int res_recover_locks_count;
+
+ char *res_lvbptr;
+ /* one byte larger than DLM_RESNAME_MAXLEN */
+ char res_name[DLM_RESNAME_MAXLEN+1];
+};
+
+/* dlm_master_lookup() flags */
+
+#define DLM_LU_RECOVER_DIR 1
+#define DLM_LU_RECOVER_MASTER 2
+
+/* dlm_master_lookup() results */
+
+#define DLM_LU_MATCH 1
+#define DLM_LU_ADD 2
+
+/* find_rsb() flags */
+
+#define R_REQUEST 0x00000001
+#define R_RECEIVE_REQUEST 0x00000002
+#define R_RECEIVE_RECOVER 0x00000004
+
+/* rsb_flags */
+
+/*
+ * Bit numbers (not masks) within dlm_rsb.res_flags; use them only through
+ * rsb_set_flag(), rsb_clear_flag() and rsb_flag().
+ */
+enum rsb_flags {
+ RSB_MASTER_UNCERTAIN,
+ RSB_VALNOTVALID,
+ RSB_VALNOTVALID_PREV,
+ RSB_NEW_MASTER,
+ RSB_NEW_MASTER2,
+ RSB_RECOVER_CONVERT,
+ RSB_RECOVER_GRANT,
+ RSB_RECOVER_LVB_INVAL,
+};
+
+/* Turn on bit @flag in the rsb's res_flags word using __set_bit(). */
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	unsigned long *word = &r->res_flags;
+
+	__set_bit(flag, word);
+}
+
+/* Turn off bit @flag in the rsb's res_flags word using __clear_bit(). */
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	unsigned long *word = &r->res_flags;
+
+	__clear_bit(flag, word);
+}
+
+/* Report whether bit @flag is set in the rsb's res_flags word. */
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	const unsigned long *word = &r->res_flags;
+
+	return test_bit(flag, word);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR 0x00030000
+#define DLM_HEADER_MINOR 0x00000002
+
+#define DLM_VERSION_3_1 0x00030001
+#define DLM_VERSION_3_2 0x00030002
+
+#define DLM_HEADER_SLOTS 0x00000001
+
+#define DLM_MSG 1
+#define DLM_RCOM 2
+#define DLM_OPTS 3
+#define DLM_ACK 4
+#define DLM_FIN 5
+
+/*
+ * Common header at the start of every struct sent between nodes. The
+ * multi-byte fields are little-endian (__le16/__le32); which member of u is
+ * meaningful depends on h_cmd.
+ */
+struct dlm_header {
+ __le32 h_version;
+ union {
+ /* for DLM_MSG and DLM_RCOM */
+ __le32 h_lockspace;
+ /* for DLM_ACK and DLM_OPTS */
+ __le32 h_seq;
+ } u;
+ __le32 h_nodeid; /* nodeid of sender */
+ __le16 h_length;
+ uint8_t h_cmd; /* DLM_MSG, DLM_RCOM, DLM_OPTS, DLM_ACK, DLM_FIN */
+ uint8_t h_pad;
+};
+
+#define DLM_MSG_REQUEST 1
+#define DLM_MSG_CONVERT 2
+#define DLM_MSG_UNLOCK 3
+#define DLM_MSG_CANCEL 4
+#define DLM_MSG_REQUEST_REPLY 5
+#define DLM_MSG_CONVERT_REPLY 6
+#define DLM_MSG_UNLOCK_REPLY 7
+#define DLM_MSG_CANCEL_REPLY 8
+#define DLM_MSG_GRANT 9
+#define DLM_MSG_BAST 10
+#define DLM_MSG_LOOKUP 11
+#define DLM_MSG_REMOVE 12
+#define DLM_MSG_LOOKUP_REPLY 13
+#define DLM_MSG_PURGE 14
+
+/*
+ * Wire format of a locking message (h_cmd DLM_MSG). All fixed fields are
+ * little-endian 32-bit values; m_extra is a flexible array that follows them.
+ */
+struct dlm_message {
+ struct dlm_header m_header;
+ __le32 m_type; /* DLM_MSG_ */
+ __le32 m_nodeid;
+ __le32 m_pid;
+ __le32 m_lkid; /* lkid on sender */
+ __le32 m_remid; /* lkid on receiver */
+ __le32 m_parent_lkid;
+ __le32 m_parent_remid;
+ __le32 m_exflags;
+ __le32 m_sbflags;
+ __le32 m_flags;
+ __le32 m_lvbseq;
+ __le32 m_hash;
+ __le32 m_status;
+ __le32 m_grmode;
+ __le32 m_rqmode;
+ __le32 m_bastmode;
+ __le32 m_asts;
+ __le32 m_result; /* 0 or -EXXX */
+ char m_extra[]; /* name or lvb */
+};
+
+
+#define DLM_RS_NODES 0x00000001
+#define DLM_RS_NODES_ALL 0x00000002
+#define DLM_RS_DIR 0x00000004
+#define DLM_RS_DIR_ALL 0x00000008
+#define DLM_RS_LOCKS 0x00000010
+#define DLM_RS_LOCKS_ALL 0x00000020
+#define DLM_RS_DONE 0x00000040
+#define DLM_RS_DONE_ALL 0x00000080
+
+#define DLM_RCOM_STATUS 1
+#define DLM_RCOM_NAMES 2
+#define DLM_RCOM_LOOKUP 3
+#define DLM_RCOM_LOCK 4
+#define DLM_RCOM_STATUS_REPLY 5
+#define DLM_RCOM_NAMES_REPLY 6
+#define DLM_RCOM_LOOKUP_REPLY 7
+#define DLM_RCOM_LOCK_REPLY 8
+
+/*
+ * Wire format of a recovery message (h_cmd DLM_RCOM). Fixed fields are
+ * little-endian; rc_buf is a flexible array whose layout depends on rc_type
+ * (see the rcom_* structs in this header).
+ */
+struct dlm_rcom {
+ struct dlm_header rc_header;
+ __le32 rc_type; /* DLM_RCOM_ */
+ __le32 rc_result; /* multi-purpose */
+ __le64 rc_id; /* match reply with request */
+ __le64 rc_seq; /* sender's ls_recover_seq */
+ __le64 rc_seq_reply; /* remote ls_recover_seq */
+ char rc_buf[];
+};
+
+/*
+ * Header of one type/length/value option. The three fixed fields total eight
+ * bytes, so t_value starts on an 8-byte offset within the struct.
+ */
+struct dlm_opt_header {
+ __le16 t_type;
+ __le16 t_length;
+ __le32 t_pad;
+ /* need to be 8 byte aligned */
+ char t_value[];
+};
+
+/* encapsulation header */
+/*
+ * Packet layout for h_cmd DLM_OPTS: the common header, a few fixed fields,
+ * then the flexible o_opts area.
+ * NOTE(review): o_nextcmd presumably names the encapsulated command and
+ * o_optlen the length of the options in o_opts - confirm in midcomms.c.
+ */
+struct dlm_opts {
+ struct dlm_header o_header;
+ uint8_t o_nextcmd;
+ uint8_t o_pad;
+ __le16 o_optlen;
+ __le32 o_pad2;
+ char o_opts[];
+};
+
+/*
+ * Overlay of every packet layout. Each alternative begins with a struct
+ * dlm_header, so the header member is valid whichever layout is in use.
+ */
+union dlm_packet {
+ struct dlm_header header; /* common to all other members */
+ struct dlm_message message;
+ struct dlm_rcom rcom;
+ struct dlm_opts opts;
+};
+
+#define DLM_RSF_NEED_SLOTS 0x00000001
+
+/* RCOM_STATUS data */
+/* Little-endian payload of a DLM_RCOM_STATUS message. */
+struct rcom_status {
+ __le32 rs_flags; /* presumably DLM_RSF_ flags - confirm */
+ __le32 rs_unused1;
+ __le64 rs_unused2;
+};
+
+/* RCOM_STATUS_REPLY data */
+/*
+ * Little-endian payload of a DLM_RCOM_STATUS_REPLY message. The fields after
+ * rf_lsflags are the ones added with DLM_HEADER_SLOTS.
+ */
+struct rcom_config {
+ __le32 rf_lvblen;
+ __le32 rf_lsflags;
+
+ /* DLM_HEADER_SLOTS adds: */
+ __le32 rf_flags;
+ __le16 rf_our_slot;
+ __le16 rf_num_slots;
+ __le32 rf_generation;
+ __le32 rf_unused1;
+ __le64 rf_unused2;
+};
+
+/* Little-endian wire record pairing a nodeid with a slot number. */
+struct rcom_slot {
+ __le32 ro_nodeid;
+ __le16 ro_slot;
+ __le16 ro_unused1;
+ __le64 ro_unused2;
+};
+
+/*
+ * Wire description of one lock. Multi-byte fields are little-endian.
+ * rl_name is exactly DLM_RESNAME_MAXLEN bytes, with no extra byte for a
+ * terminator (unlike res_name in struct dlm_rsb).
+ * NOTE(review): rl_namelen presumably gives the used length of rl_name and
+ * rl_lvb presumably carries the lock value block - confirm in rcom.c.
+ */
+struct rcom_lock {
+ __le32 rl_ownpid;
+ __le32 rl_lkid;
+ __le32 rl_remid;
+ __le32 rl_parent_lkid;
+ __le32 rl_parent_remid;
+ __le32 rl_exflags;
+ __le32 rl_flags;
+ __le32 rl_lvbseq;
+ __le32 rl_result;
+ int8_t rl_rqmode;
+ int8_t rl_grmode;
+ int8_t rl_status;
+ int8_t rl_asts;
+ __le16 rl_wait_type;
+ __le16 rl_namelen;
+ char rl_name[DLM_RESNAME_MAXLEN];
+ char rl_lvb[];
+};
+
+/*
+ * The max number of resources per rsbtbl bucket that shrink will attempt
+ * to remove in each iteration.
+ */
+
+#define DLM_REMOVE_NAMES_MAX 8
+
+/*
+ * Lockspace: all per-lockspace state. The members are grouped roughly as
+ * identity and refcounts, lkb id allocation, the rsb table, waiters and
+ * orphans, membership and slots, debugfs entries, the user-space device,
+ * callback delivery, recovery state and the lockspace name.
+ * ls_flags holds LSFL_ bit numbers and is read with test_bit().
+ */
+struct dlm_ls {
+ struct list_head ls_list; /* list of lockspaces */
+ dlm_lockspace_t *ls_local_handle;
+ uint32_t ls_global_id; /* global unique lockspace ID */
+ uint32_t ls_generation;
+ uint32_t ls_exflags;
+ int ls_lvblen;
+ atomic_t ls_count; /* refcount of processes in
+ the dlm using this ls */
+ wait_queue_head_t ls_count_wait;
+ int ls_create_count; /* create/release refcount */
+ unsigned long ls_flags; /* LSFL_ */
+ unsigned long ls_scan_time;
+ struct kobject ls_kobj;
+
+ struct idr ls_lkbidr;
+ spinlock_t ls_lkbidr_spin;
+
+ struct dlm_rsbtable *ls_rsbtbl;
+ uint32_t ls_rsbtbl_size;
+
+ struct mutex ls_waiters_mutex;
+ struct list_head ls_waiters; /* lkbs needing a reply */
+
+ struct mutex ls_orphans_mutex;
+ struct list_head ls_orphans;
+
+ spinlock_t ls_new_rsb_spin;
+ int ls_new_rsb_count;
+ struct list_head ls_new_rsb; /* new rsb structs */
+
+ /* both arrays have DLM_REMOVE_NAMES_MAX entries */
+ char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
+ int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
+
+ struct list_head ls_nodes; /* current nodes in ls */
+ struct list_head ls_nodes_gone; /* dead node list, recovery */
+ int ls_num_nodes; /* number of nodes in ls */
+ int ls_low_nodeid;
+ int ls_total_weight;
+ int *ls_node_array;
+
+ int ls_slot;
+ int ls_num_slots;
+ int ls_slots_size;
+ struct dlm_slot *ls_slots;
+
+ struct dlm_rsb ls_local_rsb; /* for returning errors */
+ struct dlm_lkb ls_local_lkb; /* for returning errors */
+ struct dlm_message ls_local_ms; /* for faking a reply */
+
+ struct dentry *ls_debug_rsb_dentry; /* debugfs */
+ struct dentry *ls_debug_waiters_dentry; /* debugfs */
+ struct dentry *ls_debug_locks_dentry; /* debugfs */
+ struct dentry *ls_debug_all_dentry; /* debugfs */
+ struct dentry *ls_debug_toss_dentry; /* debugfs */
+ struct dentry *ls_debug_queued_asts_dentry; /* debugfs */
+
+ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
+ int ls_uevent_result;
+ struct completion ls_recovery_done;
+ int ls_recovery_result;
+
+ struct miscdevice ls_device;
+
+ struct workqueue_struct *ls_callback_wq;
+
+ /* recovery related */
+
+ spinlock_t ls_cb_lock;
+ struct list_head ls_cb_delay; /* save for queue_work later */
+ struct timer_list ls_timer;
+ struct task_struct *ls_recoverd_task;
+ struct mutex ls_recoverd_active;
+ spinlock_t ls_recover_lock;
+ unsigned long ls_recover_begin; /* jiffies timestamp */
+ uint32_t ls_recover_status; /* DLM_RS_ */
+ uint64_t ls_recover_seq;
+ struct dlm_recover *ls_recover_args;
+ struct rw_semaphore ls_in_recovery; /* block local requests */
+ struct rw_semaphore ls_recv_active; /* block dlm_recv */
+ struct list_head ls_requestqueue;/* queue remote requests */
+ atomic_t ls_requestqueue_cnt;
+ wait_queue_head_t ls_requestqueue_wait;
+ struct mutex ls_requestqueue_mutex;
+ struct dlm_rcom *ls_recover_buf;
+ int ls_recover_nodeid; /* for debugging */
+ unsigned int ls_recover_dir_sent_res; /* for log info */
+ unsigned int ls_recover_dir_sent_msg; /* for log info */
+ unsigned int ls_recover_locks_in; /* for log info */
+ uint64_t ls_rcom_seq;
+ spinlock_t ls_rcom_spin;
+ struct list_head ls_recover_list;
+ spinlock_t ls_recover_list_lock;
+ int ls_recover_list_count;
+ struct idr ls_recover_idr;
+ spinlock_t ls_recover_idr_lock;
+ wait_queue_head_t ls_wait_general;
+ wait_queue_head_t ls_recover_lock_wait;
+ spinlock_t ls_clear_proc_locks;
+
+ struct list_head ls_root_list; /* root resources */
+ struct rw_semaphore ls_root_sem; /* protect root_list */
+
+ const struct dlm_lockspace_ops *ls_ops;
+ void *ls_ops_arg;
+
+ int ls_namelen;
+ /* one byte larger than DLM_LOCKSPACE_LEN; printed with %s by log_error() */
+ char ls_name[DLM_LOCKSPACE_LEN + 1];
+};
+
+/*
+ * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines
+ * that they should abort what they're doing so new recovery can be started.
+ *
+ * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it
+ * should do down_write() on the in_recovery rw_semaphore. (doing down_write
+ * within dlm_ls_stop causes complaints about the lock acquired/released
+ * in different contexts.)
+ *
+ * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore.
+ * It sets this after it is done with down_write() on the in_recovery
+ * rw_semaphore and clears it after it has released the rw_semaphore.
+ *
+ * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it
+ * should begin recovery of the lockspace.
+ *
+ * LSFL_RUNNING - set when normal locking activity is enabled.
+ * dlm_ls_stop() clears this to tell dlm locking routines that they should
+ * quit what they are doing so recovery can run. dlm_recoverd sets
+ * this after recovery is finished.
+ */
+
+#define LSFL_RECOVER_STOP 0
+#define LSFL_RECOVER_DOWN 1
+#define LSFL_RECOVER_LOCK 2
+#define LSFL_RECOVER_WORK 3
+#define LSFL_RUNNING 4
+
+#define LSFL_RCOM_READY 5
+#define LSFL_RCOM_WAIT 6
+#define LSFL_UEVENT_WAIT 7
+#define LSFL_CB_DELAY 9
+#define LSFL_NODIR 10
+
+/* much of this is just saving user space pointers associated with the
+ lock that we pass back to the user lib with an ast */
+
+/*
+ * Per-lock state for locks taken from user space; struct dlm_lkb refers to
+ * one through lkb_ua. The __user members are user-space addresses and must
+ * not be dereferenced directly in the kernel.
+ */
+struct dlm_user_args {
+ struct dlm_user_proc *proc; /* each process that opens the lockspace
+ device has private data
+ (dlm_user_proc) on the struct file,
+ the process's locks point back to it*/
+ struct dlm_lksb lksb;
+ struct dlm_lksb __user *user_lksb;
+ void __user *castparam;
+ void __user *castaddr;
+ void __user *bastparam;
+ void __user *bastaddr;
+ uint64_t xid;
+};
+
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT 2
+
+/* locks list is kept so we can remove all a process's locks when it
+ exits (or orphan those that are persistent) */
+
+/*
+ * Private data for one process that has opened a lockspace device.
+ * NOTE(review): asts_spin and locks_spin presumably guard the asts and
+ * locks lists respectively - confirm in user.c.
+ */
+struct dlm_user_proc {
+ dlm_lockspace_t *lockspace;
+ unsigned long flags; /* DLM_PROC_FLAGS */
+ struct list_head asts;
+ spinlock_t asts_spin;
+ struct list_head locks;
+ spinlock_t locks_spin;
+ struct list_head unlocking;
+ wait_queue_head_t wait;
+};
+
+/* Returns 1 when LSFL_RUNNING is clear in ls_flags, 0 when it is set. */
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+	if (test_bit(LSFL_RUNNING, &ls->ls_flags))
+		return 0;
+
+	return 1;
+}
+
+/* Returns the state of the LSFL_RECOVER_STOP bit in ls_flags. */
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+	const unsigned long *flags = &ls->ls_flags;
+
+	return test_bit(LSFL_RECOVER_STOP, flags);
+}
+
+/* Returns the state of the LSFL_NODIR bit in ls_flags. */
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+	const unsigned long *flags = &ls->ls_flags;
+
+	return test_bit(LSFL_NODIR, flags);
+}
+
+/*
+ * Build a 32-bit snapshot of a dlm flag word: every bit that is set in *addr
+ * within the inclusive bit range [min, max] is set in the result, all other
+ * result bits are zero.
+ */
+static inline uint32_t dlm_flags_val(const unsigned long *addr,
+				     uint32_t min, uint32_t max)
+{
+	uint32_t snapshot = 0;
+	uint32_t pos = min;
+
+	for_each_set_bit_from(pos, addr, max + 1)
+		snapshot |= BIT(pos);
+
+	return snapshot;
+}
+
+/* Snapshot of lkb_iflags restricted to the DLM_IFL_ bit range. */
+static inline uint32_t dlm_iflags_val(const struct dlm_lkb *lkb)
+{
+	const unsigned long *word = &lkb->lkb_iflags;
+
+	return dlm_flags_val(word, __DLM_IFL_MIN_BIT, __DLM_IFL_MAX_BIT);
+}
+
+/* Snapshot of lkb_dflags restricted to the DLM_DFL_ bit range. */
+static inline uint32_t dlm_dflags_val(const struct dlm_lkb *lkb)
+{
+	const unsigned long *word = &lkb->lkb_dflags;
+
+	return dlm_flags_val(word, __DLM_DFL_MIN_BIT, __DLM_DFL_MAX_BIT);
+}
+
+/* coming from UAPI header
+ *
+ * TODO:
+ * Move this to UAPI header and let other values point to them and use BIT()
+ */
+#define DLM_SBF_DEMOTED_BIT 0
+#define __DLM_SBF_MIN_BIT DLM_SBF_DEMOTED_BIT
+#define DLM_SBF_VALNOTVALID_BIT 1
+#define DLM_SBF_ALTMODE_BIT 2
+#define __DLM_SBF_MAX_BIT DLM_SBF_ALTMODE_BIT
+
+/*
+ * Snapshot of lkb_sbflags restricted to the DLM_SBF_ bit range. The build
+ * fails if the highest bit number here stops matching DLM_SBF_ALTMODE.
+ */
+static inline uint32_t dlm_sbflags_val(const struct dlm_lkb *lkb)
+{
+	const unsigned long *word = &lkb->lkb_sbflags;
+
+	/* be sure the next person updates this */
+	BUILD_BUG_ON(BIT(__DLM_SBF_MAX_BIT) != DLM_SBF_ALTMODE);
+
+	return dlm_flags_val(word, __DLM_SBF_MIN_BIT, __DLM_SBF_MAX_BIT);
+}
+
+/*
+ * Copy bits [min, max] of val into the flag word at addr, one bit at a time:
+ * set_bit() where val has the bit set, clear_bit() where it does not. Bits
+ * outside the range are left alone.
+ */
+static inline void dlm_set_flags_val(unsigned long *addr, uint32_t val,
+				     uint32_t min, uint32_t max)
+{
+	uint32_t pos = min;
+
+	while (pos < (max + 1)) {
+		if (!(val & BIT(pos)))
+			clear_bit(pos, addr);
+		else
+			set_bit(pos, addr);
+		pos++;
+	}
+}
+
+/* Load the DLM_DFL_ bit range of lkb_dflags from val. */
+static inline void dlm_set_dflags_val(struct dlm_lkb *lkb, uint32_t val)
+{
+	unsigned long *word = &lkb->lkb_dflags;
+
+	dlm_set_flags_val(word, val, __DLM_DFL_MIN_BIT, __DLM_DFL_MAX_BIT);
+}
+
+/* Load the DLM_SBF_ bit range of lkb_sbflags from val. */
+static inline void dlm_set_sbflags_val(struct dlm_lkb *lkb, uint32_t val)
+{
+	unsigned long *word = &lkb->lkb_sbflags;
+
+	dlm_set_flags_val(word, val, __DLM_SBF_MIN_BIT, __DLM_SBF_MAX_BIT);
+}
+
+int dlm_plock_init(void);
+void dlm_plock_exit(void);
+
+/*
+ * debugfs hooks. With CONFIG_DLM_DEBUG they are real functions defined
+ * elsewhere; without it they are empty inline stubs, and
+ * dlm_create_debug_comms_file() returns NULL.
+ */
+#ifdef CONFIG_DLM_DEBUG
+void dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+void dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+void *dlm_create_debug_comms_file(int nodeid, void *data);
+void dlm_delete_debug_comms_file(void *ctx);
+#else
+static inline void dlm_register_debugfs(void) { }
+static inline void dlm_unregister_debugfs(void) { }
+static inline void dlm_create_debug_file(struct dlm_ls *ls) { }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; }
+static inline void dlm_delete_debug_comms_file(void *ctx) { }
+#endif
+
+#endif /* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 0000000000..652c51fbbf
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,6153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+ dlm_lock()
+ dlm_unlock()
+
+ request_lock(ls, lkb)
+ convert_lock(ls, lkb)
+ unlock_lock(ls, lkb)
+ cancel_lock(ls, lkb)
+
+ _request_lock(r, lkb)
+ _convert_lock(r, lkb)
+ _unlock_lock(r, lkb)
+ _cancel_lock(r, lkb)
+
+ do_request(r, lkb)
+ do_convert(r, lkb)
+ do_unlock(r, lkb)
+ do_cancel(r, lkb)
+
+ Stage 1 (lock, unlock) is mainly about checking input args and
+ splitting into one of the four main operations:
+
+ dlm_lock = request_lock
+ dlm_lock+CONVERT = convert_lock
+ dlm_unlock = unlock_lock
+ dlm_unlock+CANCEL = cancel_lock
+
+ Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+ provided to the next stage.
+
+ Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+ When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+ Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
+ given rsb and lkb and queues callbacks.
+
+ For remote operations, send_xxxx() results in the corresponding do_xxxx()
+ function being executed on the remote node. The connecting send/receive
+ calls on local (L) and remote (R) nodes:
+
+ L: send_xxxx() -> R: receive_xxxx()
+ R: do_xxxx()
+ L: receive_xxxx_reply() <- R: send_xxxx_reply()
+*/
+#include <trace/events/dlm.h>
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "midcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ const struct dlm_message *ms, bool local);
+static int receive_extralen(const struct dlm_message *ms);
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
+static void toss_rsb(struct kref *kref);
+
+/*
+ * Lock compatibility matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
+ *
+ * The +1 offset leaves row and column 0 for UN. The table is symmetric,
+ * which is why the two index orders above give the same answer. Entries are
+ * 1 or 0; going by the name, 1 marks a compatible pair of modes.
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+/*
+ * This defines the direction of transfer of LVB (lock value block) data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ *
+ * Unlike the compatibility matrix this table is not symmetric, so the
+ * index order matters. It is not static: other files may reference it.
+ */
+
+const int dlm_lvb_operations[8][8] = {
+ /* UN NL CR CW PR PW EX PD*/
+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
+};
+
+/*
+ * Look up __dlm_compat_matrix for two lkbs: the granted mode of gr against
+ * the requested mode of rq. Both arguments are struct dlm_lkb pointers.
+ */
+#define modes_compat(gr, rq) \
+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+/*
+ * Look up __dlm_compat_matrix for two raw lock modes. Each mode is offset by
+ * one to form its index; no range checking is done on either argument.
+ */
+int dlm_modes_compat(int mode1, int mode2)
+{
+	const int *row = __dlm_compat_matrix[mode1 + 1];
+
+	return row[mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ *
+ * This table is not symmetric, so the index order matters.
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+/* Print a one-line KERN_ERR summary of the main fields of an lkb. */
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+	const uint32_t iflags = dlm_iflags_val(lkb);
+	const unsigned long long seq = (unsigned long long)lkb->lkb_recover_seq;
+
+	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x "
+	       "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n",
+	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+	       iflags, lkb->lkb_status, lkb->lkb_rqmode,
+	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid,
+	       seq);
+}
+
+/*
+ * Print a one-line KERN_ERR summary of an rsb. res_name is printed with %s,
+ * so it has to be NUL terminated.
+ */
+static void dlm_print_rsb(struct dlm_rsb *r)
+{
+	const char *name = r->res_name;
+	const int rlc = r->res_recover_locks_count;
+
+	printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x "
+	       "rlc %d name %s\n",
+	       r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
+	       r->res_flags, r->res_first_lkid, rlc,
+	       name);
+}
+
+/*
+ * Dump an rsb at KERN_ERR: its summary line, whether its root and recover
+ * list links are empty, and then every lkb on its lookup list and on its
+ * grant, convert and wait queues, in that order.
+ */
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *cur;
+	int root_empty, recover_empty;
+
+	dlm_print_rsb(r);
+
+	root_empty = list_empty(&r->res_root_list);
+	recover_empty = list_empty(&r->res_recover_list);
+	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+	       root_empty, recover_empty);
+
+	printk(KERN_ERR "rsb lookup list\n");
+	list_for_each_entry(cur, &r->res_lookup, lkb_rsb_lookup) {
+		dlm_print_lkb(cur);
+	}
+
+	printk(KERN_ERR "rsb grant queue:\n");
+	list_for_each_entry(cur, &r->res_grantqueue, lkb_statequeue) {
+		dlm_print_lkb(cur);
+	}
+
+	printk(KERN_ERR "rsb convert queue:\n");
+	list_for_each_entry(cur, &r->res_convertqueue, lkb_statequeue) {
+		dlm_print_lkb(cur);
+	}
+
+	printk(KERN_ERR "rsb wait queue:\n");
+	list_for_each_entry(cur, &r->res_waitqueue, lkb_statequeue) {
+		dlm_print_lkb(cur);
+	}
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+/* Take ls_in_recovery for reading; may block in down_read(). */
+static inline void dlm_lock_recovery(struct dlm_ls *ls)
+{
+	struct rw_semaphore *sem = &ls->ls_in_recovery;
+
+	down_read(sem);
+}
+
+/* Drop a read hold on ls_in_recovery. */
+void dlm_unlock_recovery(struct dlm_ls *ls)
+{
+	struct rw_semaphore *sem = &ls->ls_in_recovery;
+
+	up_read(sem);
+}
+
+/*
+ * Try to take ls_in_recovery for reading without blocking. Returns whatever
+ * down_read_trylock() returns.
+ */
+int dlm_lock_recovery_try(struct dlm_ls *ls)
+{
+	struct rw_semaphore *sem = &ls->ls_in_recovery;
+
+	return down_read_trylock(sem);
+}
+
+/* Returns 1 unless the caller passed DLM_LKF_NOQUEUE in lkb_exflags. */
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_exflags & DLM_LKF_NOQUEUE) == 0;
+}
+
+/*
+ * Non-zero when DLM_LKF_NOQUEUEBAST is set in lkb_exflags. The value is the
+ * masked flag itself, not normalised to 1.
+ */
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+	const uint32_t masked = lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST;
+
+	return masked;
+}
+
+/* State of the DLM_SBF_DEMOTED bit in lkb_sbflags. */
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+	const unsigned long *sbflags = &lkb->lkb_sbflags;
+
+	return test_bit(DLM_SBF_DEMOTED_BIT, sbflags);
+}
+
+/* State of the DLM_SBF_ALTMODE bit in lkb_sbflags. */
+static inline int is_altmode(struct dlm_lkb *lkb)
+{
+	const unsigned long *sbflags = &lkb->lkb_sbflags;
+
+	return test_bit(DLM_SBF_ALTMODE_BIT, sbflags);
+}
+
+/* Returns 1 when lkb_status is DLM_LKSTS_GRANTED, otherwise 0. */
+static inline int is_granted(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_status == DLM_LKSTS_GRANTED)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Returns 1 when the rsb's res_nodeid is non-zero, 0 when it is zero.
+ * A negative res_nodeid trips DLM_ASSERT, which dumps the rsb and panics.
+ */
+static inline int is_remote(struct dlm_rsb *r)
+{
+	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+
+	return r->res_nodeid != 0;
+}
+
+/* Returns 1 when lkb_nodeid is non-zero and DLM_IFL_MSTCPY is clear, else 0. */
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+	if (!lkb->lkb_nodeid)
+		return 0;
+
+	return !test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags);
+}
+
+/* State of the DLM_IFL_MSTCPY bit in lkb_iflags. */
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+	const unsigned long *iflags = &lkb->lkb_iflags;
+
+	return test_bit(DLM_IFL_MSTCPY_BIT, iflags);
+}
+
+/*
+ * Returns 1 for a conversion between PR and CW in either direction
+ * (granted PR -> requested CW, or granted CW -> requested PR), else 0.
+ */
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+	int gr = lkb->lkb_grmode;
+	int rq = lkb->lkb_rqmode;
+
+	return (gr == DLM_LOCK_PR && rq == DLM_LOCK_CW) ||
+	       (rq == DLM_LOCK_PR && gr == DLM_LOCK_CW);
+}
+
+/*
+ * True when the requested mode is numerically lower than the granted mode
+ * and the conversion is not a PR<->CW "middle" conversion.
+ */
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+	if (middle_conversion(lkb))
+		return 0;
+	return lkb->lkb_rqmode < lkb->lkb_grmode;
+}
+
+/* Tests DLM_IFL_OVERLAP_UNLOCK_BIT in the lkb's iflags. */
+static inline int is_overlap_unlock(struct dlm_lkb *lkb)
+{
+	return test_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+}
+
+/* Tests DLM_IFL_OVERLAP_CANCEL_BIT in the lkb's iflags. */
+static inline int is_overlap_cancel(struct dlm_lkb *lkb)
+{
+	return test_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+}
+
+/*
+ * True when either overlap flag (unlock or cancel) is set.  The unlock
+ * flag is tested first; the cancel flag only if unlock is clear.
+ */
+static inline int is_overlap(struct dlm_lkb *lkb)
+{
+	return is_overlap_unlock(lkb) || is_overlap_cancel(lkb);
+}
+
+/*
+ * Queue a completion callback (DLM_CB_CAST) for @lkb carrying status @rv.
+ *
+ * Nothing is queued for master-copy lkbs.  A -DLM_ECANCEL status is
+ * reported as -EDEADLK when DLM_IFL_DEADLOCK_CANCEL_BIT was set; the flag
+ * is cleared as part of that check.  @r is not used here.
+ */
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	int status = rv;
+
+	if (is_master_copy(lkb))
+		return;
+
+	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+	if (status == -DLM_ECANCEL) {
+		/* only consume the deadlock flag for a cancel result */
+		if (test_and_clear_bit(DLM_IFL_DEADLOCK_CANCEL_BIT,
+				       &lkb->lkb_iflags))
+			status = -EDEADLK;
+	}
+
+	dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, status,
+		   dlm_sbflags_val(lkb));
+}
+
+/*
+ * Queue a cast whose status reflects the pending overlap operation:
+ * -DLM_EUNLOCK if an overlapping unlock is flagged, otherwise -DLM_ECANCEL.
+ */
+static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv = -DLM_ECANCEL;
+
+	if (is_overlap_unlock(lkb))
+		rv = -DLM_EUNLOCK;
+
+	queue_cast(r, lkb, rv);
+}
+
+/*
+ * Deliver a blocking notification for @rqmode: master-copy lkbs have it
+ * sent via send_bast(), all others get a DLM_CB_BAST callback queued.
+ */
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+	if (!is_master_copy(lkb)) {
+		dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0);
+		return;
+	}
+
+	send_bast(r, lkb, rqmode);
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+/* Add a reference to an rsb.  Only used by code that already holds a
+   valid reference to it, which is why no locking is done here. */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+	kref_get(&r->res_ref);
+}
+
+/* Non-static wrapper around hold_rsb(). */
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+	hold_rsb(r);
+}
+
+/* Drop a reference to an rsb.  Once the last reference is gone the rsb
+   is transferred to the tossed list (by toss_rsb) for later disposal.
+   kref_put_lock() returns with the bucket lock held in that case, so it
+   is released here. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	/* resolve the bucket lock before the reference is dropped */
+	spinlock_t *bucket_lock = &ls->ls_rsbtbl[r->res_bucket].lock;
+
+	if (kref_put_lock(&r->res_ref, toss_rsb, bucket_lock))
+		spin_unlock(bucket_lock);
+}
+
+/* Non-static wrapper around put_rsb(). */
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+	put_rsb(r);
+}
+
+/*
+ * Top up the lockspace's list of preallocated rsbs (ls->ls_new_rsb).
+ *
+ * If the list already holds more than half of ci_new_rsb_count entries
+ * nothing is done.  Otherwise two allocations are attempted with no
+ * spinlock held, and whichever succeeded are added to the list.
+ *
+ * Returns 0 if the list is non-empty afterwards (or was full enough to
+ * begin with), -ENOMEM if it is still empty.
+ */
+static int pre_rsb_struct(struct dlm_ls *ls)
+{
+	struct dlm_rsb *fresh[2];
+	int enough, total, i;
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	enough = ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2;
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	if (enough)
+		return 0;
+
+	fresh[0] = dlm_allocate_rsb(ls);
+	fresh[1] = dlm_allocate_rsb(ls);
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	for (i = 0; i < 2; i++) {
+		if (!fresh[i])
+			continue;
+		list_add(&fresh[i]->res_hashchain, &ls->ls_new_rsb);
+		ls->ls_new_rsb_count++;
+	}
+	total = ls->ls_new_rsb_count;
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	return total ? 0 : -ENOMEM;
+}
+
+/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can
+   unlock any spinlocks, go back and call pre_rsb_struct again.
+   Otherwise, take an rsb off the list, initialise it for @name and
+   return it through *r_ret (return value 0).  *r_ret is not written
+   on -EAGAIN.
+
+   @name is a buffer of @len bytes and is not required to be NUL
+   terminated (rsb_cmp() pads names itself), so the retry message
+   prints it with a bounded "%.*s" rather than "%s". */
+
+static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
+			  struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r;
+	int count;
+
+	spin_lock(&ls->ls_new_rsb_spin);
+	if (list_empty(&ls->ls_new_rsb)) {
+		count = ls->ls_new_rsb_count;
+		spin_unlock(&ls->ls_new_rsb_spin);
+		log_debug(ls, "find_rsb retry %d %d %.*s",
+			  count, dlm_config.ci_new_rsb_count,
+			  len, (const char *)name);
+		return -EAGAIN;
+	}
+
+	r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
+	list_del(&r->res_hashchain);
+	/* Convert the empty list_head to a NULL rb_node for tree usage: */
+	memset(&r->res_hashnode, 0, sizeof(struct rb_node));
+	ls->ls_new_rsb_count--;
+	spin_unlock(&ls->ls_new_rsb_spin);
+
+	/* the rsb is private to this caller from here on */
+	r->res_ls = ls;
+	r->res_length = len;
+	memcpy(r->res_name, name, len);
+	mutex_init(&r->res_mutex);
+
+	INIT_LIST_HEAD(&r->res_lookup);
+	INIT_LIST_HEAD(&r->res_grantqueue);
+	INIT_LIST_HEAD(&r->res_convertqueue);
+	INIT_LIST_HEAD(&r->res_waitqueue);
+	INIT_LIST_HEAD(&r->res_root_list);
+	INIT_LIST_HEAD(&r->res_recover_list);
+
+	*r_ret = r;
+	return 0;
+}
+
+/*
+ * Compare an rsb's name against @name (@nlen bytes).  @name is first
+ * copied into a zero-filled DLM_RESNAME_MAXLEN buffer, and the memcmp()
+ * always covers the full DLM_RESNAME_MAXLEN bytes.  Returns the memcmp()
+ * result of (r->res_name, padded name).
+ */
+static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
+{
+	char padded[DLM_RESNAME_MAXLEN];
+
+	memset(padded, 0, sizeof(padded));
+	memcpy(padded, name, nlen);
+
+	return memcmp(r->res_name, padded, sizeof(padded));
+}
+
+/*
+ * Look up an rsb by name in one rb-tree.
+ *
+ * Returns 0 and sets *r_ret to the matching rsb, or returns -EBADR and
+ * sets *r_ret to NULL when no entry matches.  Takes no locks and no
+ * references itself.
+ */
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
+			struct dlm_rsb **r_ret)
+{
+	struct rb_node *node = tree->rb_node;
+
+	while (node) {
+		struct dlm_rsb *cand = rb_entry(node, struct dlm_rsb,
+						res_hashnode);
+		int rc = rsb_cmp(cand, name, len);
+
+		if (!rc) {
+			*r_ret = cand;
+			return 0;
+		}
+
+		/* same descent direction as rsb_insert() */
+		node = (rc < 0) ? node->rb_left : node->rb_right;
+	}
+
+	*r_ret = NULL;
+	return -EBADR;
+}
+
+/*
+ * Link @rsb into @tree, keyed by its name.
+ *
+ * Returns 0 on success.  If an entry with an equal name already exists,
+ * both rsbs are dumped and -EEXIST is returned without modifying the tree.
+ */
+static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct dlm_rsb *cur;
+		int rc;
+
+		parent = *link;
+		cur = rb_entry(parent, struct dlm_rsb, res_hashnode);
+		rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
+
+		if (rc == 0) {
+			log_print("rsb_insert match");
+			dlm_dump_rsb(rsb);
+			dlm_dump_rsb(cur);
+			return -EEXIST;
+		}
+
+		link = (rc < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&rsb->res_hashnode, parent, link);
+	rb_insert_color(&rsb->res_hashnode, tree);
+	return 0;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused. Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list. When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ *
+ * rsb's on the keep list are being used locally and refcounted.
+ * rsb's on the toss list are not being used locally, and are not refcounted.
+ *
+ * The toss list rsb's were either
+ * - previously used locally but not any more (were on keep list, then
+ * moved to toss list when last refcount dropped)
+ * - created and put on toss list as a directory record for a lookup
+ * (we are the dir node for the res, but are not using the res right now,
+ * but some other node is)
+ *
+ * The purpose of find_rsb() is to return a refcounted rsb for local use.
+ * So, if the given rsb is on the toss list, it is moved to the keep list
+ * before being returned.
+ *
+ * toss_rsb() happens when all local usage of the rsb is done, i.e. no
+ * more refcounts exist, so the rsb is moved from the keep list to the
+ * toss list.
+ *
+ * rsb's on both keep and toss lists are used for doing name to master
+ * lookups. rsb's that are in use locally (and being refcounted) are on
+ * the keep list, rsb's that are not in use locally (not refcounted) and
+ * only exist for name/master lookups are on the toss list.
+ *
+ * rsb's on the toss list whose dir_nodeid is not local can have stale
+ * name/master mappings. So, remote requests on such rsb's can potentially
+ * return with an error, which means the mapping is stale and needs to
+ * be updated with a new lookup. (The idea behind MASTER UNCERTAIN and
+ * first_lkid is to keep only a single outstanding request on an rsb
+ * while that rsb has a potentially stale master.)
+ */
+/*
+ * find_rsb_dir() - find_rsb() variant selected when dlm_no_directory()
+ * is false.
+ *
+ * @name/@len:   resource name and its length
+ * @hash/@b:     hash of the name and the rsbtbl bucket index derived from it
+ * @dir_nodeid:  directory node for this name
+ * @from_nodeid: node that sent the request (compared against @dir_nodeid)
+ * @flags:       R_RECEIVE_REQUEST marks a remote request, R_REQUEST a local
+ *               one; with neither set the rsb is only looked up, never
+ *               created
+ * @r_ret:       written on every exit path
+ *
+ * Search order is keep tree, then toss tree, then (if allowed) a new rsb.
+ * Returns 0 on success, -ENOTBLK when this node refuses the request,
+ * -EBADR when nothing was found and creation was not allowed, or the
+ * error from pre_rsb_struct() / rsb_insert().
+ *
+ * Note: on the toss-list -ENOTBLK path *r_ret still points at the toss
+ * rsb, on which no reference has been taken.
+ */
+static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
+ uint32_t hash, uint32_t b,
+ int dir_nodeid, int from_nodeid,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r = NULL;
+ int our_nodeid = dlm_our_nodeid();
+ int from_local = 0;
+ int from_other = 0;
+ int from_dir = 0;
+ int create = 0;
+ int error;
+ /* classify the caller: remote from dir node, remote from other, or local */
+ if (flags & R_RECEIVE_REQUEST) {
+ if (from_nodeid == dir_nodeid)
+ from_dir = 1;
+ else
+ from_other = 1;
+ } else if (flags & R_REQUEST) {
+ from_local = 1;
+ }
+
+ /*
+ * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so
+ * from_nodeid has sent us a lock in dlm_recover_locks, believing
+ * we're the new master. Our local recovery may not have set
+ * res_master_nodeid to our_nodeid yet, so allow either. Don't
+ * create the rsb; dlm_recover_process_copy() will handle EBADR
+ * by resending.
+ *
+ * If someone sends us a request, we are the dir node, and we do
+ * not find the rsb anywhere, then recreate it. This happens if
+ * someone sends us a request after we have removed/freed an rsb
+ * from our toss list. (They sent a request instead of lookup
+ * because they are using an rsb from their toss list.)
+ */
+
+ if (from_local || from_dir ||
+ (from_other && (dir_nodeid == our_nodeid))) {
+ create = 1;
+ }
+
+ retry:
+ if (create) {
+ error = pre_rsb_struct(ls);
+ if (error < 0)
+ goto out;
+ }
+ /* preallocation is done above, before the bucket spinlock is taken */
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+ if (error)
+ goto do_toss;
+
+ /*
+ * rsb is active, so we can't check master_nodeid without lock_rsb.
+ * Just take a reference and return it.
+ */
+
+ kref_get(&r->res_ref);
+ goto out_unlock;
+
+
+ do_toss:
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+ if (error)
+ goto do_new;
+
+ /*
+ * rsb found inactive (master_nodeid may be out of date unless
+ * we are the dir_nodeid or were the master) No other thread
+ * is using this rsb because it's on the toss list, so we can
+ * look at or update res_master_nodeid without lock_rsb.
+ */
+
+ if ((r->res_master_nodeid != our_nodeid) && from_other) {
+ /* our rsb was not master, and another node (not the dir node)
+ has sent us a request; the rsb stays on the toss tree */
+ log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
+ from_nodeid, r->res_master_nodeid, dir_nodeid,
+ r->res_name);
+ error = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ if ((r->res_master_nodeid != our_nodeid) && from_dir) {
+ /* don't think this should ever happen */
+ log_error(ls, "find_rsb toss from_dir %d master %d",
+ from_nodeid, r->res_master_nodeid);
+ dlm_print_rsb(r);
+ /* fix it and go on */
+ r->res_master_nodeid = our_nodeid;
+ r->res_nodeid = 0;
+ rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = 0;
+ }
+
+ if (from_local && (r->res_master_nodeid != our_nodeid)) {
+ /* Because we have held no locks on this rsb,
+ res_master_nodeid could have become stale. */
+ rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = 0;
+ }
+ /* move the rsb from the toss tree to the keep tree */
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ goto out_unlock;
+
+
+ do_new:
+ /*
+ * rsb not found
+ */
+
+ if (error == -EBADR && !create)
+ goto out_unlock;
+
+ error = get_rsb_struct(ls, name, len, &r);
+ if (error == -EAGAIN) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ goto retry;
+ }
+ if (error)
+ goto out_unlock;
+
+ r->res_hash = hash;
+ r->res_bucket = b;
+ r->res_dir_nodeid = dir_nodeid;
+ kref_init(&r->res_ref);
+
+ if (from_dir) {
+ /* want to see how often this happens */
+ log_debug(ls, "find_rsb new from_dir %d recreate %s",
+ from_nodeid, r->res_name);
+ r->res_master_nodeid = our_nodeid;
+ r->res_nodeid = 0;
+ goto out_add;
+ }
+
+ if (from_other && (dir_nodeid != our_nodeid)) {
+ /* should never happen (create is 0 for this combination) */
+ log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
+ from_nodeid, dir_nodeid, our_nodeid, r->res_name);
+ dlm_free_rsb(r);
+ r = NULL;
+ error = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ if (from_other) {
+ log_debug(ls, "find_rsb new from_other %d dir %d %s",
+ from_nodeid, dir_nodeid, r->res_name);
+ }
+
+ if (dir_nodeid == our_nodeid) {
+ /* When we are the dir nodeid, we can set the master
+ node immediately */
+ r->res_master_nodeid = our_nodeid;
+ r->res_nodeid = 0;
+ } else {
+ /* set_master will send_lookup to dir_nodeid */
+ r->res_master_nodeid = 0;
+ r->res_nodeid = -1;
+ }
+
+ out_add:
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ out_unlock:
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ out:
+ *r_ret = r;
+ return error;
+}
+
+/* During recovery, other nodes can send us new MSTCPY locks (from
+ dlm_recover_locks) before we've made ourself master (in
+ dlm_recover_masters).
+
+ find_rsb_nodir() is the find_rsb() variant selected when
+ dlm_no_directory() is true. It always preallocates and will always
+ create the rsb if it is found on neither the keep nor the toss tree.
+ The master-nodeid sanity checks on a toss rsb are skipped when
+ R_RECEIVE_RECOVER is set in @flags.
+
+ Returns 0 on success, -ENOTBLK when a toss rsb is not mastered here and
+ the request came from another node, or the error from pre_rsb_struct()
+ or rsb_insert(). *r_ret is written on every exit path; on the -ENOTBLK
+ path it points at the toss rsb, on which no reference has been taken. */
+
+static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
+ uint32_t hash, uint32_t b,
+ int dir_nodeid, int from_nodeid,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r = NULL;
+ int our_nodeid = dlm_our_nodeid();
+ int recover = (flags & R_RECEIVE_RECOVER);
+ int error;
+
+ retry:
+ error = pre_rsb_struct(ls);
+ if (error < 0)
+ goto out;
+
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+ if (error)
+ goto do_toss;
+
+ /*
+ * rsb is active, so we can't check master_nodeid without lock_rsb.
+ * Just take a reference and return it.
+ */
+
+ kref_get(&r->res_ref);
+ goto out_unlock;
+
+
+ do_toss:
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+ if (error)
+ goto do_new;
+
+ /*
+ * rsb found inactive. No other thread is using this rsb because
+ * it's on the toss list, so we can look at or update
+ * res_master_nodeid without lock_rsb.
+ */
+
+ if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
+ /* our rsb is not master, and another node has sent us a
+ request; this should never happen */
+ log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
+ from_nodeid, r->res_master_nodeid, dir_nodeid);
+ dlm_print_rsb(r);
+ error = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ if (!recover && (r->res_master_nodeid != our_nodeid) &&
+ (dir_nodeid == our_nodeid)) {
+ /* our rsb is not master, and we are dir; may as well fix it;
+ this should never happen */
+ log_error(ls, "find_rsb toss our %d master %d dir %d",
+ our_nodeid, r->res_master_nodeid, dir_nodeid);
+ dlm_print_rsb(r);
+ r->res_master_nodeid = our_nodeid;
+ r->res_nodeid = 0;
+ }
+ /* move the rsb from the toss tree to the keep tree */
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ goto out_unlock;
+
+
+ do_new:
+ /*
+ * rsb not found
+ */
+
+ error = get_rsb_struct(ls, name, len, &r);
+ if (error == -EAGAIN) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ goto retry;
+ }
+ if (error)
+ goto out_unlock;
+ /* the new rsb's master is set to dir_nodeid; res_nodeid is 0 if that is us */
+ r->res_hash = hash;
+ r->res_bucket = b;
+ r->res_dir_nodeid = dir_nodeid;
+ r->res_master_nodeid = dir_nodeid;
+ r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
+ kref_init(&r->res_ref);
+
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ out_unlock:
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ out:
+ *r_ret = r;
+ return error;
+}
+
+/*
+ * Hash @name, derive the rsbtbl bucket and the directory node from the
+ * hash, then dispatch to find_rsb_nodir() or find_rsb_dir() depending on
+ * dlm_no_directory().
+ *
+ * Returns -EINVAL if @len exceeds DLM_RESNAME_MAXLEN, otherwise whatever
+ * the selected helper returns.
+ */
+static int find_rsb(struct dlm_ls *ls, const void *name, int len,
+		    int from_nodeid, unsigned int flags,
+		    struct dlm_rsb **r_ret)
+{
+	uint32_t hash, b;
+	int dir_nodeid;
+
+	if (len > DLM_RESNAME_MAXLEN)
+		return -EINVAL;
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+	dir_nodeid = dlm_hash2nodeid(ls, hash);
+
+	if (!dlm_no_directory(ls))
+		return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
+				    from_nodeid, flags, r_ret);
+
+	return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
+			      from_nodeid, flags, r_ret);
+}
+
+/* We have received a request and found that res_master_nodeid is not
+   our_nodeid, so either return an error or make ourself the master.
+
+   Returns -ENOTBLK when there is no directory, or when the sender is not
+   the rsb's dir node.  When the sender is the dir node, the rsb is
+   updated to make this node master and 0 is returned. */
+
+static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
+				  int from_nodeid)
+{
+	if (dlm_no_directory(ls)) {
+		log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
+			  from_nodeid, r->res_master_nodeid,
+			  r->res_dir_nodeid);
+		dlm_print_rsb(r);
+		return -ENOTBLK;
+	}
+
+	if (from_nodeid == r->res_dir_nodeid) {
+		/* our rsb is not master, but the dir nodeid has sent us a
+		   request; this could happen with master 0 / res_nodeid -1 */
+
+		if (r->res_master_nodeid) {
+			log_error(ls, "validate master from_dir %d master %d "
+				  "first %x %s",
+				  from_nodeid, r->res_master_nodeid,
+				  r->res_first_lkid, r->res_name);
+		}
+
+		r->res_master_nodeid = dlm_our_nodeid();
+		r->res_nodeid = 0;
+		return 0;
+	}
+
+	/* our rsb is not master, and another node (not the dir node)
+	   has sent us a request. this is much more common when our
+	   master_nodeid is zero, so limit debug to non-zero. */
+
+	if (r->res_master_nodeid) {
+		log_debug(ls, "validate master from_other %d master %d "
+			  "dir %d first %x %s", from_nodeid,
+			  r->res_master_nodeid, r->res_dir_nodeid,
+			  r->res_first_lkid, r->res_name);
+	}
+
+	return -ENOTBLK;
+}
+/*
+ * __dlm_master_lookup() - shared body of dlm_master_lookup() once an
+ * existing rsb has been found.
+ *
+ * @toss_list is true when @r was found on the toss tree; it is only used
+ * to report the unexpected "fix_master on toss" case.
+ * @flags: DLM_LU_RECOVER_MASTER selects fix_master, DLM_LU_RECOVER_DIR
+ * selects from_master.
+ *
+ * May rewrite res_dir_nodeid, res_master_nodeid and res_nodeid and set
+ * RSB_NEW_MASTER on @r. Always stores the resulting res_master_nodeid in
+ * *r_nodeid, and DLM_LU_MATCH in *result when @result is non-NULL.
+ * No locks are taken here; dlm_master_lookup() calls this under lock_rsb
+ * for keep rsbs and under the rsbtbl bucket lock for toss rsbs.
+ */
+static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
+ int from_nodeid, bool toss_list, unsigned int flags,
+ int *r_nodeid, int *result)
+{
+ int fix_master = (flags & DLM_LU_RECOVER_MASTER);
+ int from_master = (flags & DLM_LU_RECOVER_DIR);
+
+ if (r->res_dir_nodeid != our_nodeid) {
+ /* should not happen, but may as well fix it and carry on */
+ log_error(ls, "%s res_dir %d our %d %s", __func__,
+ r->res_dir_nodeid, our_nodeid, r->res_name);
+ r->res_dir_nodeid = our_nodeid;
+ }
+
+ if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
+ /* Recovery uses this function to set a new master when
+ * the previous master failed. Setting NEW_MASTER will
+ * force dlm_recover_masters to call recover_master on this
+ * rsb even though the res_nodeid is no longer removed.
+ */
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+
+ if (toss_list) {
+ /* I don't think we should ever find it on toss list. */
+ log_error(ls, "%s fix_master on toss", __func__);
+ dlm_dump_rsb(r);
+ }
+ }
+
+ if (from_master && (r->res_master_nodeid != from_nodeid)) {
+ /* this will happen if from_nodeid became master during
+ * a previous recovery cycle, and we aborted the previous
+ * cycle before recovering this master value
+ */
+
+ log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s",
+ __func__, from_nodeid, r->res_master_nodeid,
+ r->res_nodeid, r->res_first_lkid, r->res_name);
+
+ if (r->res_master_nodeid == our_nodeid) {
+ log_error(ls, "from_master %d our_master", from_nodeid);
+ dlm_dump_rsb(r);
+ goto ret_assign;
+ }
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ }
+
+ if (!r->res_master_nodeid) {
+ /* this will happen if recovery happens while we're looking
+ * up the master for this rsb
+ */
+
+ log_debug(ls, "%s master 0 to %d first %x %s", __func__,
+ from_nodeid, r->res_first_lkid, r->res_name);
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ }
+
+ if (!from_master && !fix_master &&
+ (r->res_master_nodeid == from_nodeid)) {
+ /* this can happen when the master sends remove, the dir node
+ * finds the rsb on the keep list and ignores the remove,
+ * and the former master sends a lookup
+ *
+ * (logged only; no state is changed in this branch)
+ */
+
+ log_limit(ls, "%s from master %d flags %x first %x %s",
+ __func__, from_nodeid, flags, r->res_first_lkid,
+ r->res_name);
+ }
+
+ ret_assign:
+ *r_nodeid = r->res_master_nodeid;
+ if (result)
+ *result = DLM_LU_MATCH;
+}
+
+/*
+ * We're the dir node for this res and another node wants to know the
+ * master nodeid. During normal operation (non recovery) this is only
+ * called from receive_lookup(); master lookups when the local node is
+ * the dir node are done by find_rsb().
+ *
+ * normal operation, we are the dir node for a resource
+ * . _request_lock
+ * . set_master
+ * . send_lookup
+ * . receive_lookup
+ * . dlm_master_lookup flags 0
+ *
+ * recover directory, we are rebuilding dir for all resources
+ * . dlm_recover_directory
+ * . dlm_rcom_names
+ * remote node sends back the rsb names it is master of and we are dir of
+ * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
+ * we either create new rsb setting remote node as master, or find existing
+ * rsb and set master to be the remote node.
+ *
+ * recover masters, we are finding the new master for resources
+ * . dlm_recover_masters
+ * . recover_master
+ * . dlm_send_rcom_lookup
+ * . receive_rcom_lookup
+ * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
+ *
+ * Return values:
+ * . -EINVAL if len exceeds DLM_RESNAME_MAXLEN, if from_nodeid is our own
+ *   nodeid, or if we are not the dir node for the name (in that last case
+ *   *r_nodeid is set to -1)
+ * . the error from pre_rsb_struct()
+ * . 0 otherwise, with *r_nodeid set; *result (when non-NULL) is
+ *   DLM_LU_MATCH for an existing rsb and DLM_LU_ADD for a newly created
+ *   one, which is inserted on the toss tree with from_nodeid as master
+ */
+
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result)
+{
+ struct dlm_rsb *r = NULL;
+ uint32_t hash, b;
+ int our_nodeid = dlm_our_nodeid();
+ int dir_nodeid, error;
+
+ if (len > DLM_RESNAME_MAXLEN)
+ return -EINVAL;
+
+ if (from_nodeid == our_nodeid) {
+ log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
+ our_nodeid, flags);
+ return -EINVAL;
+ }
+
+ hash = jhash(name, len, 0);
+ b = hash & (ls->ls_rsbtbl_size - 1);
+
+ dir_nodeid = dlm_hash2nodeid(ls, hash);
+ if (dir_nodeid != our_nodeid) {
+ log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
+ from_nodeid, dir_nodeid, our_nodeid, hash,
+ ls->ls_num_nodes);
+ *r_nodeid = -1;
+ return -EINVAL;
+ }
+
+ retry:
+ error = pre_rsb_struct(ls);
+ if (error < 0)
+ return error;
+
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+ if (!error) {
+ /* because the rsb is active, we need to lock_rsb before
+ * checking/changing res_master_nodeid; a reference is held
+ * across the window where the bucket lock is dropped
+ */
+
+ hold_rsb(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ lock_rsb(r);
+
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
+ flags, r_nodeid, result);
+
+ /* the rsb was active */
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
+ }
+
+ error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+ if (error)
+ goto not_found;
+
+ /* because the rsb is inactive (on toss list), it's not refcounted
+ * and lock_rsb is not used, but is protected by the rsbtbl lock
+ */
+
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
+ r_nodeid, result);
+
+ r->res_toss_time = jiffies;
+ /* the rsb was inactive (on toss list) */
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+ return 0;
+
+ not_found:
+ error = get_rsb_struct(ls, name, len, &r);
+ if (error == -EAGAIN) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ goto retry;
+ }
+ if (error)
+ goto out_unlock;
+ /* new directory record: we are dir, the requesting node becomes master */
+ r->res_hash = hash;
+ r->res_bucket = b;
+ r->res_dir_nodeid = our_nodeid;
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ kref_init(&r->res_ref);
+ r->res_toss_time = jiffies;
+
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].toss);
+ if (error) {
+ /* should never happen */
+ dlm_free_rsb(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ goto retry;
+ }
+
+ if (result)
+ *result = DLM_LU_ADD;
+ *r_nodeid = from_nodeid;
+ out_unlock:
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return error;
+}
+
+/*
+ * Walk the keep tree of every rsbtbl bucket and dump each rsb whose
+ * res_hash equals @hash.  Each bucket's lock is held while it is walked.
+ */
+static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
+{
+	struct rb_node *n;
+	struct dlm_rsb *r;
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		spin_lock(&ls->ls_rsbtbl[i].lock);
+
+		n = rb_first(&ls->ls_rsbtbl[i].keep);
+		while (n) {
+			r = rb_entry(n, struct dlm_rsb, res_hashnode);
+			if (r->res_hash == hash)
+				dlm_dump_rsb(r);
+			n = rb_next(n);
+		}
+
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+}
+
+/*
+ * Dump the rsb named @name, looking on the keep tree first and the toss
+ * tree second.  Nothing is printed if neither tree holds it.  The bucket
+ * lock is held across both the search and the dump.
+ */
+void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len)
+{
+	struct dlm_rsb *r = NULL;
+	uint32_t hash, b;
+	int error;
+
+	hash = jhash(name, len, 0);
+	b = hash & (ls->ls_rsbtbl_size - 1);
+
+	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+	if (error)
+		error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name,
+					    len, &r);
+	if (!error)
+		dlm_dump_rsb(r);
+
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
+}
+
+/*
+ * kref release callback for an rsb: instead of freeing it, move it from
+ * its bucket's keep tree to the toss tree.
+ *
+ * The refcount is re-initialised, the toss time is stamped, the bucket's
+ * DLM_RTF_SHRINK_BIT is set, and any lvb attached to the rsb is freed.
+ * The result of rsb_insert() is not checked.
+ */
+static void toss_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+	struct dlm_ls *ls = r->res_ls;
+	uint32_t b = r->res_bucket;
+
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+
+	kref_init(&r->res_ref);
+	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].keep);
+	rsb_insert(r, &ls->ls_rsbtbl[b].toss);
+	r->res_toss_time = jiffies;
+	set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags);
+
+	if (!r->res_lvbptr)
+		return;
+
+	dlm_free_lvb(r->res_lvbptr);
+	r->res_lvbptr = NULL;
+}
+
+/* See comment for unhold_lkb.  Drops a reference that must not be the
+   last one: the assertion fires if kref_put() reports a release. */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+	int released = kref_put(&r->res_ref, toss_rsb);
+
+	DLM_ASSERT(!released, dlm_dump_rsb(r););
+}
+
+/*
+ * kref release callback that only sanity-checks the rsb: every list
+ * hanging off it must already be empty.
+ */
+static void kill_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the remove and free. */
+
+	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+   The rsb must exist as long as any lkb's for it do.
+
+   attach_lkb() takes an rsb reference on behalf of the lkb and points
+   the lkb at the rsb. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	hold_rsb(r);
+	lkb->lkb_resource = r;
+}
+
+/*
+ * Undo attach_lkb(): drop the lkb's rsb reference and clear its
+ * lkb_resource pointer.  Does nothing if no rsb is attached.
+ */
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+
+	if (!r)
+		return;
+
+	put_rsb(r);
+	lkb->lkb_resource = NULL;
+}
+
+/*
+ * Allocate and initialise an lkb and register it in the lockspace's idr.
+ *
+ * @start/@end are passed straight to idr_alloc() as the id range.  The
+ * allocated id is stored in lkb_id while ls_lkbidr_spin is held.
+ *
+ * Returns 0 with the lkb in *lkb_ret, -ENOMEM if the lkb could not be
+ * allocated, or the negative idr_alloc() error (the lkb is freed and
+ * *lkb_ret is left untouched in both failure cases).
+ */
+static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
+		       int start, int end)
+{
+	struct dlm_lkb *lkb = dlm_allocate_lkb(ls);
+	int id;
+
+	if (!lkb)
+		return -ENOMEM;
+
+	lkb->lkb_last_bast_mode = -1;
+	lkb->lkb_nodeid = -1;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	kref_init(&lkb->lkb_ref);
+	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
+	INIT_LIST_HEAD(&lkb->lkb_cb_list);
+	INIT_LIST_HEAD(&lkb->lkb_callbacks);
+	spin_lock_init(&lkb->lkb_cb_lock);
+	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
+
+	/* preload outside the spinlock; the allocation inside cannot sleep */
+	idr_preload(GFP_NOFS);
+	spin_lock(&ls->ls_lkbidr_spin);
+	id = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
+	if (id >= 0)
+		lkb->lkb_id = id;
+	spin_unlock(&ls->ls_lkbidr_spin);
+	idr_preload_end();
+
+	if (id < 0) {
+		log_error(ls, "create_lkb idr error %d", id);
+		dlm_free_lkb(lkb);
+		return id;
+	}
+
+	*lkb_ret = lkb;
+	return 0;
+}
+
+/* _create_lkb() with an id range of start 1, end 0. */
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+	return _create_lkb(ls, lkb_ret, 1, 0);
+}
+
+/*
+ * Look up an lkb by id in the lockspace's idr and take a reference on it.
+ *
+ * Returns 0 with the referenced lkb in *lkb_ret, or -ENOENT with
+ * *lkb_ret set to NULL when the id is unknown.
+ */
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *found;
+
+	spin_lock(&ls->ls_lkbidr_spin);
+	found = idr_find(&ls->ls_lkbidr, lkid);
+	if (found)
+		kref_get(&found->lkb_ref);
+	spin_unlock(&ls->ls_lkbidr_spin);
+
+	*lkb_ret = found;
+	if (!found)
+		return -ENOENT;
+	return 0;
+}
+
+/*
+ * kref release callback for an lkb.  It only asserts that the lkb is no
+ * longer on a status queue (lkb_status == 0).
+ */
+static void kill_lkb(struct kref *kref)
+{
+	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the detach_lkb */
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+/* __put_lkb() is used when an lkb may not have an rsb attached to
+   it so we need to provide the lockspace explicitly.
+
+   Drops one reference.  If it was the last, the lkb is removed from the
+   idr, detached from its rsb and freed.  Returns the kref_put_lock()
+   result: non-zero when the lkb was released, 0 otherwise. */
+
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	/* read the id up front, before the reference is dropped */
+	uint32_t lkid = lkb->lkb_id;
+	int released;
+
+	released = kref_put_lock(&lkb->lkb_ref, kill_lkb,
+				 &ls->ls_lkbidr_spin);
+	if (!released)
+		return released;
+
+	/* kref_put_lock() returned with ls_lkbidr_spin held */
+	idr_remove(&ls->ls_lkbidr, lkid);
+	spin_unlock(&ls->ls_lkbidr_spin);
+
+	detach_lkb(lkb);
+
+	/* for local/process lkbs, lvbptr points to caller's lksb */
+	if (lkb->lkb_lvbptr && is_master_copy(lkb))
+		dlm_free_lvb(lkb->lkb_lvbptr);
+	dlm_free_lkb(lkb);
+
+	return released;
+}
+
+/*
+ * Drop a reference on an lkb that has an rsb attached; the lockspace is
+ * taken from that rsb.  Returns what __put_lkb() returns.
+ */
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+
+	return __put_lkb(lkb->lkb_resource->res_ls, lkb);
+}
+
+/* Add a reference to an lkb.  Only used by code that already holds a
+   valid reference to it, which is why no locking is done here. */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+	kref_get(&lkb->lkb_ref);
+}
+
+/*
+ * Release callback handed to kref_put() by unhold_lkb().  It must never
+ * run, so it asserts unconditionally after printing the lkb.
+ */
+static void unhold_lkb_assert(struct kref *kref)
+{
+	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+	DLM_ASSERT(false, dlm_print_lkb(lkb););
+}
+
+/* Remove a reference when we are certain it is not the last one, e.g.
+   del_lkb is always called between a find_lkb/put_lkb and is always the
+   inverse of a previous add_lkb.  put_lkb would work fine, but would
+   involve unnecessary locking.  If this does turn out to be the last
+   reference, unhold_lkb_assert() fires. */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+	kref_put(&lkb->lkb_ref, unhold_lkb_assert);
+}
+
+/*
+ * Insert @new into the list at @head immediately before the first entry
+ * whose lkb_rqmode is lower than @mode; if there is no such entry, append
+ * @new at the tail.
+ */
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+			    int mode)
+{
+	struct dlm_lkb *iter;
+
+	list_for_each_entry(iter, head, lkb_statequeue) {
+		if (iter->lkb_rqmode < mode) {
+			/* list_add_tail() on an entry inserts before it */
+			list_add_tail(new, &iter->lkb_statequeue);
+			return;
+		}
+	}
+
+	list_add_tail(new, head);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+/*
+ * Put @lkb on the rsb queue that matches @status and record the status
+ * and a timestamp on the lkb.  A reference is taken for the queue
+ * membership (dropped again by del_lkb).
+ *
+ * Waiting and converting lkbs go to the head of their queue when
+ * DLM_LKF_HEADQUE is set, otherwise to the tail.  Granted lkbs are
+ * inserted with lkb_add_ordered().  Any other status asserts.
+ */
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+	struct list_head *queue;
+
+	kref_get(&lkb->lkb_ref);
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+	lkb->lkb_timestamp = ktime_get();
+	lkb->lkb_status = status;
+
+	switch (status) {
+	case DLM_LKSTS_WAITING:
+		queue = &r->res_waitqueue;
+		break;
+	case DLM_LKSTS_CONVERT:
+		queue = &r->res_convertqueue;
+		break;
+	case DLM_LKSTS_GRANTED:
+		/* convention says granted locks kept in order of grmode */
+		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+				lkb->lkb_grmode);
+		return;
+	default:
+		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+		list_add(&lkb->lkb_statequeue, queue);
+	else
+		list_add_tail(&lkb->lkb_statequeue, queue);
+}
+
+/*
+ * Take @lkb off whichever status queue it is on: clear lkb_status, unlink
+ * it, and drop the reference add_lkb() took.  @r is not used here.
+ */
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	lkb->lkb_status = 0;
+	list_del(&lkb->lkb_statequeue);
+	unhold_lkb(lkb);
+}
+
+/*
+ * Move @lkb to the queue for status @sts.  An extra reference is held
+ * across the del_lkb()/add_lkb() pair, so the unhold inside del_lkb()
+ * is never the last put.
+ */
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+	hold_lkb(lkb);
+
+	del_lkb(r, lkb);
+	add_lkb(r, lkb, sts);
+
+	unhold_lkb(lkb);
+}
+
+/*
+ * Map a request message type to the matching reply message type.
+ * Returns -1 for any type that has no reply listed here.
+ */
+static int msg_reply_type(int mstype)
+{
+	int reply;
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+		reply = DLM_MSG_REQUEST_REPLY;
+		break;
+	case DLM_MSG_CONVERT:
+		reply = DLM_MSG_CONVERT_REPLY;
+		break;
+	case DLM_MSG_UNLOCK:
+		reply = DLM_MSG_UNLOCK_REPLY;
+		break;
+	case DLM_MSG_CANCEL:
+		reply = DLM_MSG_CANCEL_REPLY;
+		break;
+	case DLM_MSG_LOOKUP:
+		reply = DLM_MSG_LOOKUP_REPLY;
+		break;
+	default:
+		reply = -1;
+		break;
+	}
+
+	return reply;
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+ a reply from a remote node
+
+ add_to_waiters() records that a message of type mstype is being sent
+ to to_nodeid for this lkb. All work is done under ls_waiters_mutex.
+
+ Returns 0 on success. Returns -EINVAL if an overlapping unlock is
+ already flagged, or if an overlapping cancel is flagged and mstype is
+ itself a cancel. Returns -EBUSY if the lkb is already waiting (or has
+ an overlapping cancel flagged) and mstype is neither unlock nor cancel.
+ Errors are logged before returning.
+
+ On both success paths lkb_wait_count is incremented and an lkb
+ reference is taken; only the non-overlap path sets lkb_wait_type and
+ puts the lkb on ls_waiters. */
+
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int error = 0;
+ int wc;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ if (is_overlap_unlock(lkb) ||
+ (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+ error = -EINVAL;
+ goto out;
+ }
+ /* already waiting for a reply: only unlock/cancel may overlap it */
+ if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+ switch (mstype) {
+ case DLM_MSG_UNLOCK:
+ set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+ break;
+ case DLM_MSG_CANCEL:
+ set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+ break;
+ default:
+ error = -EBUSY;
+ goto out;
+ }
+ wc = atomic_inc_return(&lkb->lkb_wait_count);
+ hold_lkb(lkb);
+
+ log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
+ lkb->lkb_id, lkb->lkb_wait_type, mstype, wc,
+ dlm_iflags_val(lkb));
+ goto out;
+ }
+ /* first waiter: the count before the increment must have been zero */
+ wc = atomic_fetch_inc(&lkb->lkb_wait_count);
+ DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc););
+ lkb->lkb_wait_type = mstype;
+ lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
+ hold_lkb(lkb);
+ list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+ if (error)
+ log_error(ls, "addwait error %x %d flags %x %d %d %s",
+ lkb->lkb_id, error, dlm_iflags_val(lkb), mstype,
+ lkb->lkb_wait_type, lkb->lkb_resource->res_name);
+ mutex_unlock(&ls->ls_waiters_mutex);
+ return error;
+}
+
+/* Account for a reply of type mstype (ms may be NULL when there is no
+   message) against the waiting state of lkb.  Returns 0 when one wait was
+   consumed and -1 when the reply did not match any wait.  This function
+   does not take ls_waiters_mutex itself; the wrappers below take it where
+   needed.
+
+   We clear the RESEND flag because we might be taking an lkb off the waiters
+   list as part of process_requestqueue (e.g. a lookup that has an optimized
+   request reply on the requestqueue) between dlm_recover_waiters_pre() which
+   set RESEND and dlm_recover_waiters_post() */
+
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
+ const struct dlm_message *ms)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int overlap_done = 0;
+
+ /* reply to an overlapping unlock */
+ if (mstype == DLM_MSG_UNLOCK_REPLY &&
+ test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) {
+ log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
+ overlap_done = 1;
+ goto out_del;
+ }
+
+ /* reply to an overlapping cancel */
+ if (mstype == DLM_MSG_CANCEL_REPLY &&
+ test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) {
+ log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
+ overlap_done = 1;
+ goto out_del;
+ }
+
+ /* Cancel state was preemptively cleared by a successful convert,
+ see next comment, nothing to do. */
+
+ if ((mstype == DLM_MSG_CANCEL_REPLY) &&
+ (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
+ log_debug(ls, "remwait %x cancel_reply wait_type %d",
+ lkb->lkb_id, lkb->lkb_wait_type);
+ return -1;
+ }
+
+ /* Remove for the convert reply, and preemptively remove for the
+ cancel reply. A convert has been granted while there's still
+ an outstanding cancel on it (the cancel is moot and the result
+ in the cancel reply should be 0). We preempt the cancel reply
+ because the app gets the convert result and then can follow up
+ with another op, like convert. This subsequent op would see the
+ lingering state of the cancel and fail with -EBUSY. */
+
+ if ((mstype == DLM_MSG_CONVERT_REPLY) &&
+ (lkb->lkb_wait_type == DLM_MSG_CONVERT) && ms && !ms->m_result &&
+ test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) {
+ log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
+ lkb->lkb_id);
+ lkb->lkb_wait_type = 0;
+ /* drop the count and reference that the overlapping cancel
+    added; the convert's own are dropped at out_del */
+ atomic_dec(&lkb->lkb_wait_count);
+ unhold_lkb(lkb);
+ goto out_del;
+ }
+
+ /* N.B. type of reply may not always correspond to type of original
+ msg due to lookup->request optimization, verify others? */
+
+ if (lkb->lkb_wait_type) {
+ lkb->lkb_wait_type = 0;
+ goto out_del;
+ }
+
+ /* a reply arrived but the lkb was not waiting for anything */
+ log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
+ lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0,
+ lkb->lkb_remid, mstype, dlm_iflags_val(lkb));
+ return -1;
+
+ out_del:
+ /* the force-unlock/cancel has completed and we haven't recvd a reply
+ to the op that was in progress prior to the unlock/cancel; we
+ give up on any reply to the earlier op. FIXME: not sure when/how
+ this would happen */
+
+ if (overlap_done && lkb->lkb_wait_type) {
+ log_error(ls, "remwait error %x reply %d wait_type %d overlap",
+ lkb->lkb_id, mstype, lkb->lkb_wait_type);
+ atomic_dec(&lkb->lkb_wait_count);
+ unhold_lkb(lkb);
+ lkb->lkb_wait_type = 0;
+ }
+
+ DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb););
+
+ clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+ /* the lkb leaves the waiters list only when its last wait is gone */
+ if (atomic_dec_and_test(&lkb->lkb_wait_count))
+ list_del_init(&lkb->lkb_wait_reply);
+ unhold_lkb(lkb);
+ return 0;
+}
+
+/* Locked form of _remove_from_waiters() for callers that have no reply
+   message: the call is made with ms == NULL while holding the
+   lockspace's ls_waiters_mutex.  Returns whatever the inner call does. */
+
+static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_ls *lockspace = lkb->lkb_resource->res_ls;
+	int rv;
+
+	mutex_lock(&lockspace->ls_waiters_mutex);
+	rv = _remove_from_waiters(lkb, mstype, NULL);
+	mutex_unlock(&lockspace->ls_waiters_mutex);
+
+	return rv;
+}
+
+/* Remove lkb from the waiters state for the reply message ms.  When the
+   reply is a "fake" or "local" one (local is true) we must not try to take
+   waiters_mutex again, so the mutex is only taken for !local replies. */
+
+static int remove_from_waiters_ms(struct dlm_lkb *lkb,
+				  const struct dlm_message *ms, bool local)
+{
+	struct dlm_ls *lockspace = lkb->lkb_resource->res_ls;
+	int rv;
+
+	if (local)
+		return _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
+
+	mutex_lock(&lockspace->ls_waiters_mutex);
+	rv = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
+	mutex_unlock(&lockspace->ls_waiters_mutex);
+
+	return rv;
+}
+
+/* Free expired rsbs from the toss tree of hash bucket b.  Nothing is done
+   unless the bucket's DLM_RTF_SHRINK_BIT flag is set.
+
+   Pass one walks the toss tree under the bucket lock, frees rsbs whose
+   toss time (res_toss_time + ci_toss_secs) has passed, and collects in
+   ls_remove_names/ls_remove_lens up to DLM_REMOVE_NAMES_MAX rsbs that
+   also need a remove message sent to a remote directory node.  Pass two
+   looks each collected name up again, rechecks it, and then erases it,
+   sends the remove and frees it. */
+
+static void shrink_bucket(struct dlm_ls *ls, int b)
+{
+ struct rb_node *n, *next;
+ struct dlm_rsb *r;
+ char *name;
+ int our_nodeid = dlm_our_nodeid();
+ int remote_count = 0;
+ int need_shrink = 0;
+ int i, len, rv;
+
+ memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
+
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ if (!test_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags)) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
+ /* next is fetched before the body runs because the body may erase
+    and free the current node */
+ for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
+ next = rb_next(n);
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
+
+ /* If we're the directory record for this rsb, and
+ we're not the master of it, then we need to wait
+ for the master node to send us a dir remove
+ before removing the dir record. */
+
+ if (!dlm_no_directory(ls) &&
+ (r->res_master_nodeid != our_nodeid) &&
+ (dlm_dir_nodeid(r) == our_nodeid)) {
+ continue;
+ }
+
+ /* at least one rsb here can be freed by us now or later, so
+    the bucket keeps its shrink flag */
+ need_shrink = 1;
+
+ if (!time_after_eq(jiffies, r->res_toss_time +
+ dlm_config.ci_toss_secs * HZ)) {
+ continue;
+ }
+
+ if (!dlm_no_directory(ls) &&
+ (r->res_master_nodeid == our_nodeid) &&
+ (dlm_dir_nodeid(r) != our_nodeid)) {
+
+ /* We're the master of this rsb but we're not
+ the directory record, so we need to tell the
+ dir node to remove the dir record. */
+
+ ls->ls_remove_lens[remote_count] = r->res_length;
+ memcpy(ls->ls_remove_names[remote_count], r->res_name,
+ DLM_RESNAME_MAXLEN);
+ remote_count++;
+
+ if (remote_count >= DLM_REMOVE_NAMES_MAX)
+ break;
+ continue;
+ }
+
+ if (!kref_put(&r->res_ref, kill_rsb)) {
+ log_error(ls, "tossed rsb in use %s", r->res_name);
+ continue;
+ }
+
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ dlm_free_rsb(r);
+ }
+
+ if (need_shrink)
+ set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags);
+ else
+ clear_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+ /*
+ * While searching for rsb's to free, we found some that require
+ * remote removal. We leave them in place and find them again here
+ * so there is a very small gap between removing them from the toss
+ * list and sending the removal. Keeping this gap small is
+ * important to keep us (the master node) from being out of sync
+ * with the remote dir node for very long.
+ */
+
+ for (i = 0; i < remote_count; i++) {
+ name = ls->ls_remove_names[i];
+ len = ls->ls_remove_lens[i];
+
+ /* the bucket lock was dropped after pass one, so every
+    condition is checked again under the lock */
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+ rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+ if (rv) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ log_debug(ls, "remove_name not toss %s", name);
+ continue;
+ }
+
+ if (r->res_master_nodeid != our_nodeid) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ log_debug(ls, "remove_name master %d dir %d our %d %s",
+ r->res_master_nodeid, r->res_dir_nodeid,
+ our_nodeid, name);
+ continue;
+ }
+
+ if (r->res_dir_nodeid == our_nodeid) {
+ /* should never happen */
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ log_error(ls, "remove_name dir %d master %d our %d %s",
+ r->res_dir_nodeid, r->res_master_nodeid,
+ our_nodeid, name);
+ continue;
+ }
+
+ if (!time_after_eq(jiffies, r->res_toss_time +
+ dlm_config.ci_toss_secs * HZ)) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ log_debug(ls, "remove_name toss_time %lu now %lu %s",
+ r->res_toss_time, jiffies, name);
+ continue;
+ }
+
+ if (!kref_put(&r->res_ref, kill_rsb)) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ log_error(ls, "remove_name in use %s", name);
+ continue;
+ }
+
+ /* the remove is sent while the bucket lock is still held; the
+    rsb itself is freed after the lock is dropped */
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ send_remove(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+
+ dlm_free_rsb(r);
+ }
+}
+
+/* Run shrink_bucket() over every rsb hash bucket of the lockspace in
+   index order.  The scan is abandoned as soon as dlm_locking_stopped()
+   reports true, and yields with cond_resched() between buckets. */
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+	int bucket = 0;
+
+	while (bucket < ls->ls_rsbtbl_size) {
+		shrink_bucket(ls, bucket);
+		if (dlm_locking_stopped(ls))
+			return;
+		cond_resched();
+		bucket++;
+	}
+}
+
+/* lkb is master or local copy */
+
+/* Apply the lock value block rule for granting lkb's requested mode.
+   dlm_lvb_operations[grmode + 1][rqmode + 1] selects the action:
+     1   the rsb's lvb is returned to the caller (copied into the lkb)
+     0   the lkb's lvb is written to the rsb, or the rsb's lvb is
+         invalidated when DLM_LKF_IVVALBLK is set
+    -1   do nothing
+   Unless one of the early returns is taken, the VALNOTVALID status flag
+   is set on the lkb whenever the rsb's lvb is marked not valid. */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int lvblen = r->res_ls->ls_lvblen;
+	int op = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+	switch (op) {
+	case 1:
+		/* rsb -> lkb; needs both buffers and VALBLK */
+		if (!lkb->lkb_lvbptr ||
+		    !(lkb->lkb_exflags & DLM_LKF_VALBLK) ||
+		    !r->res_lvbptr)
+			return;
+
+		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, lvblen);
+		lkb->lkb_lvbseq = r->res_lvbseq;
+		break;
+
+	case 0:
+		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+			rsb_set_flag(r, RSB_VALNOTVALID);
+			return;
+		}
+
+		if (!lkb->lkb_lvbptr ||
+		    !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			return;
+
+		/* lkb -> rsb; the rsb buffer is allocated on first use */
+		if (!r->res_lvbptr) {
+			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+			if (!r->res_lvbptr)
+				return;
+		}
+
+		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+		r->res_lvbseq++;
+		lkb->lkb_lvbseq = r->res_lvbseq;
+		rsb_clear_flag(r, RSB_VALNOTVALID);
+		break;
+
+	default:
+		break;
+	}
+
+	if (rsb_flag(r, RSB_VALNOTVALID))
+		set_bit(DLM_SBF_VALNOTVALID_BIT, &lkb->lkb_sbflags);
+}
+
+/* Lock value block handling for an unlock.  Only a lock granted at
+   DLM_LOCK_PW or above has any effect: with DLM_LKF_IVVALBLK the rsb's
+   lvb is marked not valid, otherwise (with DLM_LKF_VALBLK and an lvb
+   buffer on the lkb) the lkb's lvb is written to the rsb, the rsb's lvb
+   sequence number is bumped and the not-valid mark is cleared. */
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_grmode < DLM_LOCK_PW)
+		return;
+
+	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+		rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	/* the rsb buffer is allocated the first time it is written */
+	if (!r->res_lvbptr) {
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+		if (!r->res_lvbptr)
+			return;
+	}
+
+	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+	r->res_lvbseq++;
+	rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc): when the lvb operation for this
+   grmode/rqmode pair is "return lvb to caller", copy the lvb carried in
+   the message ms into the lkb, truncated to the lockspace lvb length, and
+   take the lvb sequence number from the message. */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			    const struct dlm_message *ms)
+{
+	int copylen;
+
+	if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	if (dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1] != 1)
+		return;
+
+	/* never copy more than the lockspace's lvb length */
+	copylen = receive_extralen(ms);
+	if (copylen > r->res_ls->ls_lvblen)
+		copylen = r->res_ls->ls_lvblen;
+
+	memcpy(lkb->lkb_lvbptr, ms->m_extra, copylen);
+	lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+ remove_lock -- used for unlock, removes lkb from granted
+ revert_lock -- used for cancel, moves lkb from convert to granted
+ grant_lock -- used for request and convert, adds lkb to granted or
+ moves lkb from convert or waiting to granted
+
+ Each of these is used for master or local copy lkb's. There is
+ also a _pc() variation used to make the corresponding change on
+ a process copy (pc) lkb. */
+
+/* Common part of removing a lock for unlock: take the lkb off its state
+   queue, mark its granted mode invalid and drop the creation reference. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+}
+
+/* Unlock on a master or local copy lkb: do the unlock-time lvb handling
+   on the rsb first, then remove the lock. */
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_unlock(r, lkb);
+ _remove_lock(r, lkb);
+}
+
+/* Unlock on a process copy lkb: remove the lock without touching the
+   rsb's lvb. */
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ _remove_lock(r, lkb);
+}
+
+/* Undo a pending request or conversion (used for cancel).  The requested
+   mode is always reset to DLM_LOCK_IV; what else happens depends on the
+   queue the lkb is on.
+
+   returns: 0 did nothing (already granted, or unexpected status)
+	    1 moved lock from convert to granted
+	   -1 removed lock from waiting */
+
+static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+
+	switch (lkb->lkb_status) {
+	case DLM_LKSTS_GRANTED:
+		return 0;
+
+	case DLM_LKSTS_CONVERT:
+		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		return 1;
+
+	case DLM_LKSTS_WAITING:
+		del_lkb(r, lkb);
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		/* this unhold undoes the original ref from create_lkb()
+		   so this leads to the lkb being freed */
+		unhold_lkb(lkb);
+		return -1;
+
+	default:
+		log_print("invalid status for revert %d", lkb->lkb_status);
+		return 0;
+	}
+}
+
+/* Process copy variant of revert_lock(); currently identical to it and
+   returns the same values. */
+
+static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return revert_lock(r, lkb);
+}
+
+/* Common part of granting: make the requested mode the granted mode and
+   put the lkb on the granted queue (moving it if it is already on some
+   queue, adding it if it is on none).  Nothing is queued when the modes
+   are already equal.  In every case the requested mode and the highest
+   bast mode sent are reset. */
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	bool mode_changes = (lkb->lkb_grmode != lkb->lkb_rqmode);
+
+	if (mode_changes) {
+		lkb->lkb_grmode = lkb->lkb_rqmode;
+
+		if (!lkb->lkb_status)
+			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		else
+			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+	}
+
+	lkb->lkb_highbast = 0;
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+}
+
+/* Grant on a master or local copy lkb: apply the lvb rule against the
+   rsb's lvb first (it reads the modes that _grant_lock() then changes),
+   and then grant. */
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_lock(r, lkb);
+ _grant_lock(r, lkb);
+}
+
+/* Grant on a process copy lkb: take the lvb from the message ms when the
+   lvb rule calls for it, and then grant. */
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ const struct dlm_message *ms)
+{
+ set_lvb_lock_pc(r, lkb, ms);
+ _grant_lock(r, lkb);
+}
+
+/* Grant a lock that had been queued.  Called by grant_pending_locks(),
+   which means the owner has to be told asynchronously: for a master copy
+   lkb (one belonging to a remote node) a grant message is sent to the
+   requesting node, otherwise a completion ast is queued locally. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	grant_lock(r, lkb);
+
+	if (!is_master_copy(lkb)) {
+		queue_cast(r, lkb, 0);
+		return;
+	}
+
+	send_grant(r, lkb);
+}
+
+/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
+ change the granted/requested modes. We're munging things accordingly in
+ the process copy.
+ CONVDEADLK: our grmode may have been forced down to NL to resolve a
+ conversion deadlock
+ ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
+ compatible with other granted locks */
+
+/* Mirror a CONVDEADLK demotion in the process copy: force the granted
+   mode down to NL.  This only makes sense while a conversion is in
+   progress, so it is refused (and logged) if either mode is invalid. */
+
+static void munge_demoted(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_rqmode != DLM_LOCK_IV && lkb->lkb_grmode != DLM_LOCK_IV) {
+		lkb->lkb_grmode = DLM_LOCK_NL;
+		return;
+	}
+
+	log_print("munge_demoted %x invalid modes gr %d rq %d",
+		  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
+}
+
+/* Mirror an ALTPR/ALTCW grant in the process copy: change the requested
+   mode to PR or CW according to the lkb's flags.  Only request replies
+   and grant messages are accepted; any other message type, or an lkb
+   with neither flag set, is logged and leaves the lkb unchanged. */
+
+static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms)
+{
+	bool is_request_reply = (ms->m_type == cpu_to_le32(DLM_MSG_REQUEST_REPLY));
+	bool is_grant = (ms->m_type == cpu_to_le32(DLM_MSG_GRANT));
+
+	if (!is_request_reply && !is_grant) {
+		log_print("munge_altmode %x invalid reply type %d",
+			  lkb->lkb_id, le32_to_cpu(ms->m_type));
+		return;
+	}
+
+	/* ALTPR takes precedence when both flags are set */
+	if (lkb->lkb_exflags & DLM_LKF_ALTPR) {
+		lkb->lkb_rqmode = DLM_LOCK_PR;
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_ALTCW) {
+		lkb->lkb_rqmode = DLM_LOCK_CW;
+		return;
+	}
+
+	log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
+	dlm_print_lkb(lkb);
+}
+
+/* Return 1 if lkb has the same lock id as the first entry on the state
+   queue head, 0 otherwise.  The entry is taken from head->next without
+   checking whether the list is empty. */
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+	struct dlm_lkb *front;
+
+	front = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
+
+	return lkb->lkb_id == front->lkb_id;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue: returns
+   1 as soon as some other entry fails modes_compat() against lkb, and 0
+   when none does.  lkb itself is skipped if it is on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *other;
+
+	list_for_each_entry(other, head, lkb_statequeue) {
+		if (other != lkb && !modes_compat(other, lkb))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource. The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
+ * convert queue from being granted, then deadlk/demote lkb.
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ * PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
+ * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
+ * flag set and return DEMOTED in the lksb flags.
+ *
+ * Originally, this function detected conv-deadlk in a more limited scope:
+ * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
+ * - if lkb1 was the first entry in the queue (not just earlier), and was
+ * blocked by the granted mode of lkb2, and there was nothing on the
+ * granted queue preventing lkb1 from being granted immediately, i.e.
+ * lkb2 was the only thing preventing lkb1 from being granted.
+ *
+ * That second condition meant we'd only say there was conv-deadlk if
+ * resolving it (by demotion) would lead to the first lock on the convert
+ * queue being granted right away. It allowed conversion deadlocks to exist
+ * between locks on the convert queue while they couldn't be granted anyway.
+ *
+ * Now, we detect and take action on conversion deadlocks immediately when
+ * they're created, even if they may not be immediately consequential. If
+ * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
+ * mode that would prevent lkb1's conversion from being granted, we do a
+ * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
+ * I think this means that the lkb_is_ahead condition below should always
+ * be zero, i.e. there will never be conv-deadlk between two locks that are
+ * both already on the convert queue.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
+{
+	struct dlm_lkb *lkb1;
+	bool past_lkb2 = false;
+
+	list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb1 == lkb2) {
+			/* everything from here on is queued behind lkb2 */
+			past_lkb2 = true;
+			continue;
+		}
+
+		/* both the "earlier" and the "later" rule need lkb2 to be
+		   incompatible with lkb1 */
+		if (modes_compat(lkb2, lkb1))
+			continue;
+
+		/* lkb1 ahead of lkb2 (or lkb2 not on the queue at all):
+		   that one-way conflict is enough.  lkb1 behind lkb2: the
+		   conflict has to be mutual. */
+		if (!past_lkb2 || !modes_compat(lkb1, lkb2))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * recover is 1 if dlm_recover_grant() is trying to grant conversions
+ * after recovery.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
+ int recover)
+{
+ /* a lock that already has a granted mode is being converted */
+ int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+ /*
+ * 6-10: Version 5.4 introduced an option to address the phenomenon of
+ * a new request for a NL mode lock being blocked.
+ *
+ * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+ * request, then it would be granted. In essence, the use of this flag
+ * tells the Lock Manager to expedite this request by not considering
+ * what may be in the CONVERTING or WAITING queues... As of this
+ * writing, the EXPEDITE flag can be used only with new requests for NL
+ * mode locks. This flag is not valid for conversion requests.
+ *
+ * A shortcut. Earlier checks return an error if EXPEDITE is used in a
+ * conversion or used with a non-NL requested mode. We also know an
+ * EXPEDITE request is always granted immediately, so now must always
+ * be 1. The full condition to grant an expedite request: (now &&
+ * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+ * therefore be shortened to just checking the flag.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+ return 1;
+
+ /*
+ * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+ * added to the remaining conditions.
+ */
+
+ if (queue_conflict(&r->res_grantqueue, lkb))
+ return 0;
+
+ /*
+ * 6-3: By default, a conversion request is immediately granted if the
+ * requested mode is compatible with the modes of all other granted
+ * locks
+ */
+
+ if (queue_conflict(&r->res_convertqueue, lkb))
+ return 0;
+
+ /*
+ * The RECOVER_GRANT flag means dlm_recover_grant() is granting
+ * locks for a recovered rsb, on which lkb's have been rebuilt.
+ * The lkb's may have been rebuilt on the queues in a different
+ * order than they were in on the previous master. So, granting
+ * queued conversions in order after recovery doesn't make sense
+ * since the order hasn't been preserved anyway. The new order
+ * could also have created a new "in place" conversion deadlock.
+ * (e.g. old, failed master held granted EX, with PR->EX, NL->EX.
+ * After recovery, there would be no granted locks, and possibly
+ * NL->EX, PR->EX, an in-place conversion deadlock.) So, after
+ * recovery, grant conversions without considering order.
+ */
+
+ if (conv && recover)
+ return 1;
+
+ /*
+ * 6-5: But the default algorithm for deciding whether to grant or
+ * queue conversion requests does not by itself guarantee that such
+ * requests are serviced on a "first come first serve" basis. This, in
+ * turn, can lead to a phenomenon known as "indefinite postponement".
+ *
+ * 6-7: This issue is dealt with by using the optional QUECVT flag with
+ * the system service employed to request a lock conversion. This flag
+ * forces certain conversion requests to be queued, even if they are
+ * compatible with the granted modes of other locks on the same
+ * resource. Thus, the use of this flag results in conversion requests
+ * being ordered on a "first come first serve" basis.
+ *
+ * DCT: This condition is all about new conversions being able to occur
+ * "in place" while the lock remains on the granted queue (assuming
+ * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
+ * doesn't _have_ to go onto the convert queue where it's processed in
+ * order. The "now" variable is necessary to distinguish converts
+ * being received and processed for the first time now, because once a
+ * convert is moved to the conversion queue the condition below applies
+ * requiring fifo granting.
+ */
+
+ if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+ return 1;
+
+ /*
+ * Even if the convert is compat with all granted locks,
+ * QUECVT forces it behind other locks on the convert queue.
+ */
+
+ if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
+ if (list_empty(&r->res_convertqueue))
+ return 1;
+ else
+ return 0;
+ }
+
+ /*
+ * The NOORDER flag is set to avoid the standard vms rules on grant
+ * order.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+ return 1;
+
+ /*
+ * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+ * granted until all other conversion requests ahead of it are granted
+ * and/or canceled.
+ */
+
+ if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+ return 1;
+
+ /*
+ * 6-4: By default, a new request is immediately granted only if all
+ * three of the following conditions are satisfied when the request is
+ * issued:
+ * - The queue of ungranted conversion requests for the resource is
+ * empty.
+ * - The queue of ungranted new requests for the resource is empty.
+ * - The mode of the new request is compatible with the most
+ * restrictive mode of all granted locks on the resource.
+ */
+
+ if (now && !conv && list_empty(&r->res_convertqueue) &&
+ list_empty(&r->res_waitqueue))
+ return 1;
+
+ /*
+ * 6-4: Once a lock request is in the queue of ungranted new requests,
+ * it cannot be granted until the queue of ungranted conversion
+ * requests is empty, all ungranted new requests ahead of it are
+ * granted and/or canceled, and it is compatible with the granted mode
+ * of the most restrictive lock granted on the resource.
+ */
+
+ if (!now && !conv && list_empty(&r->res_convertqueue) &&
+ first_in_list(lkb, &r->res_waitqueue))
+ return 1;
+
+ return 0;
+}
+
+/* Decide whether lkb can be granted, with the non-standard extensions on
+   top of _can_be_granted().  Returns non-zero if it can be granted.
+
+   now and recover are passed through to _can_be_granted().  err may be
+   NULL; when given it is cleared on entry and set to -EDEADLK if a
+   conversion deadlock is found on an lkb without the CONVDEADLK flag.
+
+   Side effects on lkb: grmode may be demoted to NL (DEMOTED status flag
+   set), or rqmode may be replaced by the ALTPR/ALTCW alternate mode
+   (ALTMODE status flag set) when only the alternate can be granted. */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
+			  int recover, int *err)
+{
+	int8_t orig_rqmode = lkb->lkb_rqmode;
+	int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);
+	int8_t alt = 0;
+	int rv;
+
+	if (err)
+		*err = 0;
+
+	rv = _can_be_granted(r, lkb, now, recover);
+	if (rv)
+		return rv;
+
+	/*
+	 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
+	 * conversion deadlocks by demoting grmode to NL, otherwise the dlm
+	 * cancels one of the locks.
+	 */
+
+	if (is_convert && can_be_queued(lkb) &&
+	    conversion_deadlock_detect(r, lkb)) {
+		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
+			lkb->lkb_grmode = DLM_LOCK_NL;
+			set_bit(DLM_SBF_DEMOTED_BIT, &lkb->lkb_sbflags);
+		} else if (err) {
+			*err = -EDEADLK;
+		} else {
+			log_print("can_be_granted deadlock %x now %d",
+				  lkb->lkb_id, now);
+			dlm_dump_rsb(r);
+		}
+		/* still not grantable; the alternate modes are not tried */
+		return rv;
+	}
+
+	/*
+	 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
+	 * to grant a request in a mode other than the normal rqmode. It's a
+	 * simple way to provide a big optimization to applications that can
+	 * use them.
+	 */
+
+	if (orig_rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
+		alt = DLM_LOCK_PR;
+	else if (orig_rqmode != DLM_LOCK_CW &&
+		 (lkb->lkb_exflags & DLM_LKF_ALTCW))
+		alt = DLM_LOCK_CW;
+
+	if (!alt)
+		return rv;
+
+	/* retry with the alternate mode; put rqmode back if that fails too */
+	lkb->lkb_rqmode = alt;
+	rv = _can_be_granted(r, lkb, now, 0);
+	if (rv)
+		set_bit(DLM_SBF_ALTMODE_BIT, &lkb->lkb_sbflags);
+	else
+		lkb->lkb_rqmode = orig_rqmode;
+
+	return rv;
+}
+
+/* Grant whatever can be granted on the rsb's convert queue.
+
+   Returns the larger of high and the highest requested mode of all
+   blocked conversions seen on the final pass; sets cw if there's a
+   blocked conversion to DLM_LOCK_CW.  Conversions that were demoted or
+   deadlocked on a pass do not contribute to the return value or to cw.
+   count, when not NULL, is incremented for every lock granted. */
+
+static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
+ unsigned int *count)
+{
+ struct dlm_lkb *lkb, *s;
+ int recover = rsb_flag(r, RSB_RECOVER_GRANT);
+ int hi, demoted, quit, grant_restart, demote_restart;
+ int deadlk;
+
+ quit = 0;
+ restart:
+ /* hi is recomputed from scratch on every pass over the queue */
+ grant_restart = 0;
+ demote_restart = 0;
+ hi = DLM_LOCK_IV;
+
+ list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+ /* remember the demoted state so a demotion done inside
+    can_be_granted() can be told apart from an earlier one */
+ demoted = is_demoted(lkb);
+ deadlk = 0;
+
+ if (can_be_granted(r, lkb, 0, recover, &deadlk)) {
+ grant_lock_pending(r, lkb);
+ grant_restart = 1;
+ if (count)
+ (*count)++;
+ continue;
+ }
+
+ if (!demoted && is_demoted(lkb)) {
+ log_print("WARN: pending demoted %x node %d %s",
+ lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
+ demote_restart = 1;
+ continue;
+ }
+
+ if (deadlk) {
+ /*
+ * If DLM_LKF_NODLCKWT flag is set and conversion
+ * deadlock is detected, we request blocking AST and
+ * down (or cancel) conversion.
+ */
+ if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) {
+ if (lkb->lkb_highbast < lkb->lkb_rqmode) {
+ queue_bast(r, lkb, lkb->lkb_rqmode);
+ lkb->lkb_highbast = lkb->lkb_rqmode;
+ }
+ } else {
+ log_print("WARN: pending deadlock %x node %d %s",
+ lkb->lkb_id, lkb->lkb_nodeid,
+ r->res_name);
+ dlm_dump_rsb(r);
+ }
+ continue;
+ }
+
+ hi = max_t(int, lkb->lkb_rqmode, hi);
+
+ if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
+ *cw = 1;
+ }
+
+ /* a grant changes what the remaining conversions conflict with, so
+    rescan for as long as something was granted; a demotion triggers
+    at most one extra pass (quit) */
+ if (grant_restart)
+ goto restart;
+ if (demote_restart && !quit) {
+ quit = 1;
+ goto restart;
+ }
+
+ return max_t(int, high, hi);
+}
+
+/* Grant whatever can be granted on the rsb's wait queue, in queue order.
+   Returns the larger of high and the highest requested mode among the
+   requests that stay blocked, and sets *cw when one of those is a
+   DLM_LOCK_CW request (cw must not be NULL).  count, when not NULL, is
+   incremented for every lock granted. */
+
+static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw,
+			      unsigned int *count)
+{
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, &r->res_waitqueue,
+				 lkb_statequeue) {
+		if (!can_be_granted(r, lkb, 0, 0, NULL)) {
+			/* still blocked: fold it into the summary */
+			high = max_t(int, lkb->lkb_rqmode, high);
+			if (lkb->lkb_rqmode == DLM_LOCK_CW)
+				*cw = 1;
+			continue;
+		}
+
+		grant_lock_pending(r, lkb);
+		if (count)
+			(*count)++;
+	}
+
+	return high;
+}
+
+/* Decide whether the granted lock gr should be sent a blocking ast.
+
+   cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
+   on either the convert or waiting queue.
+   high is the largest rqmode of all locks blocked on the convert or
+   waiting queue.
+
+   Returns 1 if a bast is required, 0 if not. */
+
+static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
+{
+	/* a PR holder blocking a CW request is decided purely by whether
+	   a bast as high as EX has already been sent */
+	if (gr->lkb_grmode == DLM_LOCK_PR && cw)
+		return gr->lkb_highbast < DLM_LOCK_EX;
+
+	/* a bast for this mode or a higher one was already sent */
+	if (gr->lkb_highbast >= high)
+		return 0;
+
+	return !__dlm_compat_matrix[gr->lkb_grmode+1][high+1];
+}
+
+/* Grant every queued conversion and request on r that can now be
+   granted, and then send blocking asts to the holders of granted locks
+   that stand in the way of what is still queued.  Does nothing except
+   log if this node is not the master of r.  count, when not NULL, is
+   incremented for every lock granted. */
+
+static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count)
+{
+	struct dlm_lkb *lkb, *next;
+	int cw = 0;
+	int high;
+
+	if (!is_master(r)) {
+		log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
+		dlm_dump_rsb(r);
+		return;
+	}
+
+	high = grant_pending_convert(r, DLM_LOCK_IV, &cw, count);
+	high = grant_pending_wait(r, high, &cw, count);
+
+	/* nothing is left blocked, so nobody needs a bast */
+	if (high == DLM_LOCK_IV)
+		return;
+
+	/*
+	 * If there are locks left on the wait/convert queue then send blocking
+	 * ASTs to granted locks based on the largest requested mode (high)
+	 * found above.
+	 */
+
+	list_for_each_entry_safe(lkb, next, &r->res_grantqueue,
+				 lkb_statequeue) {
+		int bast_mode;
+
+		if (!lkb->lkb_bastfn || !lock_requires_bast(lkb, high, cw))
+			continue;
+
+		/* a PR holder is told about the blocked CW request when
+		   PR is the highest blocked mode */
+		if (cw && high == DLM_LOCK_PR &&
+		    lkb->lkb_grmode == DLM_LOCK_PR)
+			bast_mode = DLM_LOCK_CW;
+		else
+			bast_mode = high;
+
+		queue_bast(r, lkb, bast_mode);
+		lkb->lkb_highbast = high;
+	}
+}
+
+/* Decide whether the holder gr should be sent a blocking ast on behalf
+   of the request rq.  Returns 1 if so, 0 if not.  The PR/CW pairing (in
+   either direction) is decided only by whether a bast as high as EX has
+   been sent already; every other pairing needs the modes to be
+   incompatible and rq's mode to exceed the highest bast sent so far. */
+
+static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
+{
+	bool pr_against_cw =
+		(gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
+		(gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR);
+
+	if (pr_against_cw)
+		return gr->lkb_highbast < DLM_LOCK_EX;
+
+	return gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq);
+}
+
+/* Queue a blocking ast, for lkb's requested mode, to every lock on the
+   queue head that has a bast callback and for which
+   modes_require_bast() says one is needed.  The highest bast mode of
+   each lock notified is updated to that mode. */
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+			    struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *holder;
+
+	list_for_each_entry(holder, head, lkb_statequeue) {
+		/* skip self when sending basts to convertqueue */
+		if (holder == lkb)
+			continue;
+
+		if (!holder->lkb_bastfn || !modes_require_bast(holder, lkb))
+			continue;
+
+		queue_bast(r, holder, lkb->lkb_rqmode);
+		holder->lkb_highbast = lkb->lkb_rqmode;
+	}
+}
+
+/* Send blocking asts on behalf of lkb to the locks on the rsb's granted
+   queue only. */
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+/* Send blocking asts on behalf of lkb to the locks on both the granted
+   and the convert queue of the rsb. */
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+ send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+ The purpose of this function is to set the nodeid field in the given
+ lkb using the nodeid field in the given rsb. If the rsb's nodeid is
+ known, it can just be copied to the lkb and the function will return
+ 0. If the rsb's nodeid is _not_ known, it needs to be looked up
+ before it can be copied to the lkb.
+
+ When the rsb nodeid is being looked up remotely, the initial lkb
+ causing the lookup is kept on the ls_waiters list waiting for the
+ lookup reply. Other lkb's waiting for the same rsb lookup are kept
+ on the rsb's res_lookup list until the master is verified.
+
+ Return values:
+ 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+ 1: the rsb master is not available and the lkb has been placed on
+ a wait queue
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int self_nodeid = dlm_our_nodeid();
+
+	/* the recorded master is only tentative: use it anyway and make
+	   this lkb the first_lkid of the rsb */
+	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = lkb->lkb_id;
+		lkb->lkb_nodeid = r->res_nodeid;
+		return 0;
+	}
+
+	/* some other lkb is the first_lkid: park this one on the rsb's
+	   lookup list */
+	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+		return 1;
+	}
+
+	/* we are the master; a local lkb has nodeid 0 */
+	if (r->res_master_nodeid == self_nodeid) {
+		lkb->lkb_nodeid = 0;
+		return 0;
+	}
+
+	/* a remote master is known */
+	if (r->res_master_nodeid) {
+		lkb->lkb_nodeid = r->res_master_nodeid;
+		return 0;
+	}
+
+	if (dlm_dir_nodeid(r) == self_nodeid) {
+		/* This is a somewhat unusual case; find_rsb will usually
+		   have set res_master_nodeid when dir nodeid is local, but
+		   there are cases where we become the dir node after we've
+		   past find_rsb and go through _request_lock again.
+		   confirm_master() or process_lookup_list() needs to be
+		   called after this. */
+		log_debug(r->res_ls, "set_master %x self master %d dir %d %s",
+			  lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid,
+			  r->res_name);
+		r->res_master_nodeid = self_nodeid;
+		r->res_nodeid = 0;
+		lkb->lkb_nodeid = 0;
+		return 0;
+	}
+
+	/* master unknown and the directory is remote: this lkb becomes
+	   the first_lkid and a lookup is sent for it */
+	r->res_first_lkid = lkb->lkb_id;
+	send_lookup(r, lkb);
+	return 1;
+}
+
+/* Take each lkb off the rsb's res_lookup list in turn and run it through
+   _request_lock() again, calling schedule() after each one. */
+
+static void process_lookup_list(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, &r->res_lookup, lkb_rsb_lookup) {
+		list_del_init(&lkb->lkb_rsb_lookup);
+		_request_lock(r, lkb);
+		schedule();
+	}
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid
+
+   error is the result of the operation done by the rsb's first_lkid
+   lkb.  Nothing happens when the rsb has no first_lkid.
+     0, -EINPROGRESS: first_lkid is cleared and every lkb on the lookup
+       list is sent through _request_lock().
+     -EAGAIN, -EBADR, -ENOTBLK: the next lkb on the lookup list, if any,
+       becomes the new first_lkid and is sent through _request_lock().
+     anything else is logged as an error and changes nothing. */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+	struct dlm_lkb *lkb;
+
+	if (!r->res_first_lkid)
+		return;
+
+	switch (error) {
+	case 0:
+	case -EINPROGRESS:
+		r->res_first_lkid = 0;
+		process_lookup_list(r);
+		break;
+
+	case -EAGAIN:
+	case -EBADR:
+	case -ENOTBLK:
+		/* the remote request failed and won't be retried (it was
+		   a NOQUEUE, or has been canceled/unlocked); make a waiting
+		   lkb the first_lkid */
+
+		r->res_first_lkid = 0;
+
+		if (list_empty(&r->res_lookup))
+			break;
+
+		lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+				 lkb_rsb_lookup);
+		list_del_init(&lkb->lkb_rsb_lookup);
+		r->res_first_lkid = lkb->lkb_id;
+		_request_lock(r, lkb);
+		break;
+
+	default:
+		log_error(r->res_ls, "confirm_master unknown error %d", error);
+	}
+}
+
+/* Validate the arguments of a lock or convert call and, if they are
+   acceptable, record them in args.  Returns 0 on success and -EINVAL for
+   any invalid combination, in which case args is left untouched. */
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+			 int namelen, void (*ast)(void *astparam),
+			 void *astparam,
+			 void (*bast)(void *astparam, int mode),
+			 struct dlm_args *args)
+{
+	/* check for invalid arg usage */
+
+	if (mode < 0 || mode > DLM_LOCK_EX)
+		return -EINVAL;
+
+	/* the name length only matters for a new request */
+	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+		return -EINVAL;
+
+	/* CANCEL is not a lock flag */
+	if (flags & DLM_LKF_CANCEL)
+		return -EINVAL;
+
+	/* QUECVT and CONVDEADLK only apply to conversions */
+	if ((flags & DLM_LKF_QUECVT) && !(flags & DLM_LKF_CONVERT))
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_CONVDEADLK) && !(flags & DLM_LKF_CONVERT))
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_CONVDEADLK) && (flags & DLM_LKF_NOQUEUE))
+		return -EINVAL;
+
+	/* EXPEDITE is only for new NL requests, with no queueing flags */
+	if ((flags & DLM_LKF_EXPEDITE) && (flags & DLM_LKF_CONVERT))
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_EXPEDITE) && (flags & DLM_LKF_QUECVT))
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_EXPEDITE) && (flags & DLM_LKF_NOQUEUE))
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_EXPEDITE) && mode != DLM_LOCK_NL)
+		return -EINVAL;
+
+	if (!ast || !lksb)
+		return -EINVAL;
+
+	/* VALBLK needs an lvb buffer, CONVERT needs an existing lock id */
+	if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_CONVERT) && !lksb->sb_lkid)
+		return -EINVAL;
+
+	/* these args will be copied to the lkb in validate_lock_args,
+	   it cannot be done now because when converting locks, fields in
+	   an active lkb cannot be modified before locking the rsb */
+
+	args->lksb = lksb;
+	args->mode = mode;
+	args->flags = flags;
+	args->astfn = ast;
+	args->astparam = astparam;
+	args->bastfn = bast;
+
+	return 0;
+}
+
+/*
+ * set_unlock_args - validate unlock/cancel flags and record them in @args.
+ *
+ * Only CANCEL, VALBLK, IVVALBLK and FORCEUNLOCK may be set, and CANCEL may
+ * not be combined with FORCEUNLOCK; otherwise -EINVAL is returned and
+ * @args is left alone.  Returns 0 on success.
+ */
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+	const uint32_t allowed = DLM_LKF_CANCEL | DLM_LKF_VALBLK |
+				 DLM_LKF_IVVALBLK | DLM_LKF_FORCEUNLOCK;
+
+	if (flags & ~allowed)
+		return -EINVAL;
+
+	if ((flags & DLM_LKF_CANCEL) && (flags & DLM_LKF_FORCEUNLOCK))
+		return -EINVAL;
+
+	args->astparam = astarg;
+	args->flags = flags;
+	return 0;
+}
+
+/*
+ * validate_lock_args - check a request/convert against the current lkb
+ * state and, on success, copy the caller's dlm_args into the lkb.
+ *
+ * The state checks only apply to DLM_LKF_CONVERT: lkb_status must be
+ * DLM_LKSTS_GRANTED and no operation or overlap may be pending (else
+ * -EBUSY); the lkb must not be a master copy and a QUECVT conversion must
+ * be allowed by __quecvt_compat_matrix (else -EINVAL).
+ *
+ * Returns 0, -EBUSY or -EINVAL.  Failures are logged; -EINVAL also
+ * triggers WARN_ON.
+ */
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ int rv = -EBUSY;
+
+ if (args->flags & DLM_LKF_CONVERT) {
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ goto out;
+
+ /* lock not allowed if there's any op in progress */
+ if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))
+ goto out;
+
+ if (is_overlap(lkb))
+ goto out;
+
+ /* the remaining convert checks report -EINVAL */
+ rv = -EINVAL;
+ if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags))
+ goto out;
+
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ goto out;
+ }
+
+ /* all checks passed: transfer the saved arguments into the lkb */
+ lkb->lkb_exflags = args->flags;
+ dlm_set_sbflags_val(lkb, 0);
+ lkb->lkb_astfn = args->astfn;
+ lkb->lkb_astparam = args->astparam;
+ lkb->lkb_bastfn = args->bastfn;
+ lkb->lkb_rqmode = args->mode;
+ lkb->lkb_lksb = args->lksb;
+ lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+ lkb->lkb_ownpid = (int) current->pid;
+ rv = 0;
+ out:
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %d %d %s", __func__,
+ rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags,
+ lkb->lkb_status, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
+ rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags,
+ lkb->lkb_status, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ }
+
+ return rv;
+}
+
+/*
+ * validate_unlock_args - check an unlock/cancel/force-unlock against the
+ * current lkb state and, on success, merge the caller's args into the lkb.
+ *
+ * Returns 0 when the operation may proceed, -EBUSY when another operation
+ * is in the way (possibly after recording an OVERLAP_CANCEL/OVERLAP_UNLOCK
+ * bit), -EINVAL for a master copy lkb, and -ENOENT for an lkb marked
+ * ENDOFLIFE.  Non-zero results are logged; -EINVAL also triggers WARN_ON.
+ *
+ * when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
+ * for success
+ *
+ * note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
+ * because there may be a lookup in progress and it's valid to do
+ * cancel/unlockf on it
+ */
+
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int rv = -EBUSY;
+
+ /* normal unlock not allowed if there's any op in progress */
+ if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
+ (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)))
+ goto out;
+
+ /* an lkb may be waiting for an rsb lookup to complete where the
+ lookup was initiated by another lock */
+
+ if (!list_empty(&lkb->lkb_rsb_lookup)) {
+ if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
+ log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
+ list_del_init(&lkb->lkb_rsb_lookup);
+ queue_cast(lkb->lkb_resource, lkb,
+ args->flags & DLM_LKF_CANCEL ?
+ -DLM_ECANCEL : -DLM_EUNLOCK);
+ unhold_lkb(lkb); /* undoes create_lkb() */
+ }
+ /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+ goto out;
+ }
+
+ /* from here on a bare "goto out" reports -EINVAL unless rv is reset */
+ rv = -EINVAL;
+ if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) {
+ log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+ dlm_print_lkb(lkb);
+ goto out;
+ }
+
+ /* an lkb may still exist even though the lock is EOL'ed due to a
+ * cancel, unlock or failed noqueue request; an app can't use these
+ * locks; return same error as if the lkid had not been found at all
+ */
+
+ if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) {
+ log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+ rv = -ENOENT;
+ goto out;
+ }
+
+ /* cancel not allowed with another cancel/unlock in progress */
+
+ if (args->flags & DLM_LKF_CANCEL) {
+ if (lkb->lkb_exflags & DLM_LKF_CANCEL)
+ goto out;
+
+ if (is_overlap(lkb))
+ goto out;
+
+ /* a resend is pending: only record the overlapping cancel */
+ if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) {
+ set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+ rv = -EBUSY;
+ goto out;
+ }
+
+ /* there's nothing to cancel */
+ if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
+ !lkb->lkb_wait_type) {
+ rv = -EBUSY;
+ goto out;
+ }
+
+ switch (lkb->lkb_wait_type) {
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REQUEST:
+ set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+ rv = -EBUSY;
+ goto out;
+ case DLM_MSG_UNLOCK:
+ case DLM_MSG_CANCEL:
+ goto out;
+ }
+ /* add_to_waiters() will set OVERLAP_CANCEL */
+ goto out_ok;
+ }
+
+ /* do we need to allow a force-unlock if there's a normal unlock
+ already in progress? in what conditions could the normal unlock
+ fail such that we'd want to send a force-unlock to be sure? */
+
+ if (args->flags & DLM_LKF_FORCEUNLOCK) {
+ if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
+ goto out;
+
+ if (is_overlap_unlock(lkb))
+ goto out;
+
+ /* a resend is pending: only record the overlapping unlock */
+ if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) {
+ set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+ rv = -EBUSY;
+ goto out;
+ }
+
+ switch (lkb->lkb_wait_type) {
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REQUEST:
+ set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+ rv = -EBUSY;
+ goto out;
+ case DLM_MSG_UNLOCK:
+ goto out;
+ }
+ /* add_to_waiters() will set OVERLAP_UNLOCK */
+ }
+
+ out_ok:
+ /* an overlapping op shouldn't blow away exflags from other op */
+ lkb->lkb_exflags |= args->flags;
+ dlm_set_sbflags_val(lkb, 0);
+ lkb->lkb_astparam = args->astparam;
+ rv = 0;
+ out:
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
+ lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags,
+ args->flags, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
+ lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags,
+ args->flags, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ }
+
+ return rv;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+/*
+ * do_request - try to grant a new lock.
+ *
+ * Returns 0 when the lock was granted (completion ast queued),
+ * -EINPROGRESS when it was added with DLM_LKSTS_WAITING, and -EAGAIN when
+ * it can be neither granted nor queued (completion ast queued with
+ * -EAGAIN).
+ */
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (can_be_granted(r, lkb, 1, 0, NULL)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		return 0;
+	}
+
+	if (can_be_queued(lkb)) {
+		add_lkb(r, lkb, DLM_LKSTS_WAITING);
+		return -EINPROGRESS;
+	}
+
+	queue_cast(r, lkb, -EAGAIN);
+	return -EAGAIN;
+}
+
+/*
+ * Follow-up work for do_request(): send blocking asts when the request
+ * was queued (-EINPROGRESS), or to all holders when it failed with
+ * -EAGAIN and force_blocking_asts() is true for the lkb.  Other results
+ * need nothing.
+ */
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	if (error == -EAGAIN) {
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+	} else if (error == -EINPROGRESS) {
+		send_blocking_asts(r, lkb);
+	}
+}
+
+/*
+ * do_convert - try to convert an existing lock to its requested mode.
+ *
+ * Returns 0 when granted, -EDEADLK when a conversion deadlock was detected
+ * and DLM_LKF_NODLCKWT is not set, -EINPROGRESS when the lkb was moved to
+ * the convert queue, and -EAGAIN when it can be neither granted nor
+ * queued.  A completion ast is queued for every result except
+ * -EINPROGRESS.
+ */
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int deadlk = 0;
+
+	/* changing an existing lock may allow others to be granted */
+
+	if (can_be_granted(r, lkb, 1, 0, &deadlk)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		return 0;
+	}
+
+	/* can_be_granted() detected that this lock would block in a conversion
+	   deadlock, so we leave it on the granted queue and return EDEADLK in
+	   the ast for the convert. */
+
+	if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
+		/* it's left on the granted queue */
+		revert_lock(r, lkb);
+		queue_cast(r, lkb, -EDEADLK);
+		return -EDEADLK;
+	}
+
+	/* is_demoted() means the can_be_granted() above set the grmode
+	   to NL, and left us on the granted queue.  This auto-demotion
+	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
+	   now grantable.  We have to try to grant other converting locks
+	   before we try again to grant this one. */
+
+	if (is_demoted(lkb)) {
+		grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL);
+		if (_can_be_granted(r, lkb, 1, 0)) {
+			grant_lock(r, lkb);
+			queue_cast(r, lkb, 0);
+			return 0;
+		}
+		/* else fall through and move to convert queue */
+	}
+
+	if (can_be_queued(lkb)) {
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		return -EINPROGRESS;
+	}
+
+	queue_cast(r, lkb, -EAGAIN);
+	return -EAGAIN;
+}
+
+/*
+ * Follow-up work for do_convert(): grant pending locks after a successful
+ * convert, send blocking asts when the convert was queued, or send them to
+ * all holders on -EAGAIN when force_blocking_asts() is true for the lkb.
+ */
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	if (error == 0) {
+		/* grant_pending_locks also sends basts */
+		grant_pending_locks(r, NULL);
+	} else if (error == -EAGAIN) {
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+	} else if (error == -EINPROGRESS) {
+		send_blocking_asts(r, lkb);
+	}
+}
+
+/*
+ * do_unlock - remove the lock and queue a completion ast carrying
+ * -DLM_EUNLOCK.  Always returns -DLM_EUNLOCK.
+ */
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int result = -DLM_EUNLOCK;
+
+	remove_lock(r, lkb);
+	queue_cast(r, lkb, result);
+	return result;
+}
+
+/*
+ * Follow-up work for do_unlock(): try to grant pending locks on the rsb.
+ * @lkb and @error are not used here.
+ */
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int error)
+{
+ grant_pending_locks(r, NULL);
+}
+
+/*
+ * do_cancel - cancel a pending operation on the lkb via revert_lock().
+ *
+ * returns: 0 did nothing, -DLM_ECANCEL canceled lock (in which case a
+ * completion ast carrying -DLM_ECANCEL has been queued)
+ */
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (!revert_lock(r, lkb))
+		return 0;
+
+	queue_cast(r, lkb, -DLM_ECANCEL);
+	return -DLM_ECANCEL;
+}
+
+/*
+ * Follow-up work for do_cancel(): when the cancel reported a non-zero
+ * result, try to grant pending locks on the rsb.
+ */
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (!error)
+		return;
+
+	grant_pending_locks(r, NULL);
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/*
+ * add a new lkb to a possibly new rsb, called by requesting process
+ *
+ * Returns a negative set_master() error unchanged, 0 when set_master()
+ * returned a positive value (nothing more is done here), and otherwise
+ * the result of send_request() or do_request().
+ */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv;
+
+	/* set_master: sets lkb nodeid from r */
+
+	rv = set_master(r, lkb);
+	if (rv < 0)
+		return rv;
+	if (rv > 0)
+		return 0;
+
+	/* receive_request() calls do_request() on remote node */
+	if (is_remote(r))
+		return send_request(r, lkb);
+
+	rv = do_request(r, lkb);
+	/* for remote locks the request_reply is sent
+	   between do_request and do_request_effects */
+	do_request_effects(r, lkb, rv);
+	return rv;
+}
+
+/*
+ * change some property of an existing lkb, e.g. mode
+ *
+ * Sends the convert to the master when the rsb is remote, otherwise runs
+ * do_convert() and its effects locally.  Returns that result.
+ */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv;
+
+	/* receive_convert() calls do_convert() on remote node */
+	if (is_remote(r))
+		return send_convert(r, lkb);
+
+	rv = do_convert(r, lkb);
+	/* for remote locks the convert_reply is sent
+	   between do_convert and do_convert_effects */
+	do_convert_effects(r, lkb, rv);
+	return rv;
+}
+
+/*
+ * remove an existing lkb from the granted queue
+ *
+ * Sends the unlock to the master when the rsb is remote, otherwise runs
+ * do_unlock() and its effects locally.  Returns that result.
+ */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv;
+
+	/* receive_unlock() calls do_unlock() on remote node */
+	if (is_remote(r))
+		return send_unlock(r, lkb);
+
+	rv = do_unlock(r, lkb);
+	/* for remote locks the unlock_reply is sent
+	   between do_unlock and do_unlock_effects */
+	do_unlock_effects(r, lkb, rv);
+	return rv;
+}
+
+/*
+ * remove an existing lkb from the convert or wait queue
+ *
+ * Sends the cancel to the master when the rsb is remote, otherwise runs
+ * do_cancel() and its effects locally.  Returns that result.
+ */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv;
+
+	/* receive_cancel() calls do_cancel() on remote node */
+	if (is_remote(r))
+		return send_cancel(r, lkb);
+
+	rv = do_cancel(r, lkb);
+	/* for remote locks the cancel_reply is sent
+	   between do_cancel and do_cancel_effects */
+	do_cancel_effects(r, lkb, rv);
+	return rv;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+/*
+ * request_lock - stage 2 of a new lock request.
+ *
+ * Validates the args against the lkb, finds the rsb for @name, attaches
+ * the lkb to it, publishes the lock id in the caller's lksb and runs
+ * _request_lock() with the rsb locked.  Returns the first error
+ * encountered, or the result of _request_lock().
+ */
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			const void *name, int len,
+			struct dlm_args *args)
+{
+	struct dlm_rsb *rsb;
+	int rv;
+
+	rv = validate_lock_args(ls, lkb, args);
+	if (!rv)
+		rv = find_rsb(ls, name, len, 0, R_REQUEST, &rsb);
+	if (rv)
+		return rv;
+
+	lock_rsb(rsb);
+
+	attach_lkb(rsb, lkb);
+	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+
+	rv = _request_lock(rsb, lkb);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	return rv;
+}
+
+/*
+ * convert_lock - stage 2 of a conversion: with the lkb's rsb held and
+ * locked, validate the args and, if they pass, run _convert_lock().
+ * Returns the validation error or the result of _convert_lock().
+ */
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			struct dlm_args *args)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+	int rv;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_lock_args(ls, lkb, args);
+	if (!rv)
+		rv = _convert_lock(rsb, lkb);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	return rv;
+}
+
+/*
+ * unlock_lock - stage 2 of an unlock: with the lkb's rsb held and locked,
+ * validate the args and, if they pass, run _unlock_lock().  Returns the
+ * validation error or the result of _unlock_lock().
+ */
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+	int rv;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_unlock_args(lkb, args);
+	if (!rv)
+		rv = _unlock_lock(rsb, lkb);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	return rv;
+}
+
+/*
+ * cancel_lock - stage 2 of a cancel: with the lkb's rsb held and locked,
+ * validate the args and, if they pass, run _cancel_lock().  Returns the
+ * validation error or the result of _cancel_lock().
+ */
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+	int rv;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_unlock_args(lkb, args);
+	if (!rv)
+		rv = _cancel_lock(rsb, lkb);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	return rv;
+}
+
+/*
+ * Two stage 1 varieties: dlm_lock() and dlm_unlock()
+ */
+
+/*
+ * dlm_lock - stage 1 entry point for new lock requests and conversions.
+ *
+ * With DLM_LKF_CONVERT the existing lkb is looked up through
+ * lksb->sb_lkid, otherwise a new lkb is created.  The arguments are then
+ * validated and the request is passed to convert_lock() or
+ * request_lock().
+ *
+ * Returns -EINVAL when the lockspace is not found or the arguments are
+ * rejected, or the error from the lkb lookup/creation or the lower
+ * stages.  -EINPROGRESS, -EAGAIN and -EDEADLK from the lower stages are
+ * reported to the caller as 0.
+ *
+ * @parent_lkid is not used by this function.
+ */
+int dlm_lock(dlm_lockspace_t *lockspace,
+	     int mode,
+	     struct dlm_lksb *lksb,
+	     uint32_t flags,
+	     const void *name,
+	     unsigned int namelen,
+	     uint32_t parent_lkid,
+	     void (*ast) (void *astarg),
+	     void *astarg,
+	     void (*bast) (void *astarg, int mode))
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error, convert = flags & DLM_LKF_CONVERT;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	dlm_lock_recovery(ls);
+
+	if (convert) {
+		/* lksb is dereferenced right here, before set_lock_args()
+		   gets a chance to reject a NULL lksb */
+		if (!lksb) {
+			error = -EINVAL;
+			goto out;
+		}
+		error = find_lkb(ls, lksb->sb_lkid, &lkb);
+	} else {
+		error = create_lkb(ls, &lkb);
+	}
+
+	if (error)
+		goto out;
+
+	trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+
+	error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
+			      &args);
+	if (error)
+		goto out_put;
+
+	if (convert)
+		error = convert_lock(ls, lkb, &args);
+	else
+		error = request_lock(ls, lkb, name, namelen, &args);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+ out_put:
+	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true);
+
+	/* drop the find_lkb() reference on converts, or the lkb of a
+	   request that failed */
+	if (convert || error)
+		__put_lkb(ls, lkb);
+	if (error == -EAGAIN || error == -EDEADLK)
+		error = 0;
+ out:
+	dlm_unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * dlm_unlock - stage 1 entry point for unlock, cancel and force-unlock.
+ *
+ * Looks up the lkb for @lkid, validates @flags and passes the operation
+ * to cancel_lock() (DLM_LKF_CANCEL) or unlock_lock().
+ *
+ * Returns -EINVAL when the lockspace is not found or the flags are
+ * rejected, or the error from find_lkb() or the lower stages.
+ * -DLM_EUNLOCK and -DLM_ECANCEL are reported as 0, as is -EBUSY when
+ * CANCEL or FORCEUNLOCK was requested.
+ *
+ * @lksb is not used by this function.
+ */
+int dlm_unlock(dlm_lockspace_t *lockspace,
+	       uint32_t lkid,
+	       uint32_t flags,
+	       struct dlm_lksb *lksb,
+	       void *astarg)
+{
+	const uint32_t overlap_flags = DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK;
+	struct dlm_args args;
+	struct dlm_lkb *lkb;
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	dlm_lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	trace_dlm_unlock_start(ls, lkb, flags);
+
+	error = set_unlock_args(flags, astarg, &args);
+	if (error)
+		goto out_put;
+
+	error = (flags & DLM_LKF_CANCEL) ? cancel_lock(ls, lkb, &args) :
+					   unlock_lock(ls, lkb, &args);
+
+	switch (error) {
+	case -DLM_EUNLOCK:
+	case -DLM_ECANCEL:
+		error = 0;
+		break;
+	case -EBUSY:
+		if (flags & overlap_flags)
+			error = 0;
+		break;
+	}
+ out_put:
+	trace_dlm_unlock_end(ls, lkb, flags, error);
+
+	dlm_put_lkb(lkb);
+ out:
+	dlm_unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request receive_request
+ * send_convert receive_convert
+ * send_unlock receive_unlock
+ * send_cancel receive_cancel
+ * send_grant receive_grant
+ * send_bast receive_bast
+ * send_lookup receive_lookup
+ * send_remove receive_remove
+ *
+ * send_common_reply
+ * receive_request_reply send_request_reply
+ * receive_convert_reply send_convert_reply
+ * receive_unlock_reply send_unlock_reply
+ * receive_cancel_reply send_cancel_reply
+ * receive_lookup_reply send_lookup_reply
+ */
+
+/*
+ * _create_message - obtain a message buffer of @mb_len bytes addressed to
+ * @to_nodeid and fill in the dlm header and message type.
+ *
+ * On success returns 0 and stores the message in @ms_ret and its handle
+ * in @mh_ret; returns -ENOBUFS when no handle could be obtained.
+ */
+static int _create_message(struct dlm_ls *ls, int mb_len,
+			   int to_nodeid, int mstype,
+			   struct dlm_message **ms_ret,
+			   struct dlm_mhandle **mh_ret,
+			   gfp_t allocation)
+{
+	struct dlm_mhandle *handle;
+	struct dlm_message *msg;
+	char *buf;
+
+	/* get_buffer gives us a message handle (mh) that we need to
+	   pass into midcomms_commit and a message buffer (mb) that we
+	   write our data into */
+
+	handle = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &buf);
+	if (!handle)
+		return -ENOBUFS;
+
+	msg = (struct dlm_message *) buf;
+
+	msg->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	msg->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+	msg->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+	msg->m_header.h_length = cpu_to_le16(mb_len);
+	msg->m_header.h_cmd = DLM_MSG;
+
+	msg->m_type = cpu_to_le32(mstype);
+
+	*ms_ret = msg;
+	*mh_ret = handle;
+	return 0;
+}
+
+/*
+ * create_message - size and create a message of type @mstype for @r.
+ *
+ * REQUEST, LOOKUP and REMOVE messages get room for the resource name.
+ * CONVERT, UNLOCK, REQUEST_REPLY, CONVERT_REPLY and GRANT get room for an
+ * lvb when @lkb is given, has an lvb pointer and has DLM_LKF_VALBLK set.
+ * Returns the result of _create_message().
+ */
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret,
+			  gfp_t allocation)
+{
+	int mb_len = sizeof(struct dlm_message);
+	int extra = 0;
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		extra = r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr && (lkb->lkb_exflags & DLM_LKF_VALBLK))
+			extra = r->res_ls->ls_lvblen;
+		break;
+	}
+	mb_len += extra;
+
+	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
+			       ms_ret, mh_ret, allocation);
+}
+
+/*
+ * send_message - commit a prepared message handle to midcomms, passing
+ * along the resource name and its length.  @ms is not used here.  Always
+ * returns 0.
+ *
+ * further lowcomms enhancements or alternate implementations may make
+ * the return value from this function useful at some point
+ */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms,
+ const void *name, int namelen)
+{
+ dlm_midcomms_commit_mhandle(mh, name, namelen);
+ return 0;
+}
+
+/*
+ * send_args - copy the lkb's fields into an outgoing message in
+ * little-endian form, then append the type-specific payload: the resource
+ * name for REQUEST/LOOKUP, or the lvb for CONVERT, UNLOCK, REQUEST_REPLY,
+ * CONVERT_REPLY and GRANT when the lkb has an lvb pointer and
+ * DLM_LKF_VALBLK is set.
+ */
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ ms->m_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+ ms->m_pid = cpu_to_le32(lkb->lkb_ownpid);
+ ms->m_lkid = cpu_to_le32(lkb->lkb_id);
+ ms->m_remid = cpu_to_le32(lkb->lkb_remid);
+ ms->m_exflags = cpu_to_le32(lkb->lkb_exflags);
+ ms->m_sbflags = cpu_to_le32(dlm_sbflags_val(lkb));
+ ms->m_flags = cpu_to_le32(dlm_dflags_val(lkb));
+ ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
+ ms->m_status = cpu_to_le32(lkb->lkb_status);
+ ms->m_grmode = cpu_to_le32(lkb->lkb_grmode);
+ ms->m_rqmode = cpu_to_le32(lkb->lkb_rqmode);
+ ms->m_hash = cpu_to_le32(r->res_hash);
+
+ /* m_result and m_bastmode are set from function args,
+ not from lkb fields */
+
+ /* NOTE(review): m_asts is only OR'ed into below; presumably the
+ message buffer arrives zeroed - confirm */
+ if (lkb->lkb_bastfn)
+ ms->m_asts |= cpu_to_le32(DLM_CB_BAST);
+ if (lkb->lkb_astfn)
+ ms->m_asts |= cpu_to_le32(DLM_CB_CAST);
+
+ /* compare with switch in create_message; send_remove() doesn't
+ use send_args() */
+
+ switch (ms->m_type) {
+ case cpu_to_le32(DLM_MSG_REQUEST):
+ case cpu_to_le32(DLM_MSG_LOOKUP):
+ memcpy(ms->m_extra, r->res_name, r->res_length);
+ break;
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
+ if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ break;
+ memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+ break;
+ }
+}
+
+/*
+ * send_common - send a message of type @mstype about @lkb to r->res_nodeid.
+ *
+ * The lkb is added to the waiters list first; if creating or sending the
+ * message then fails it is removed again using the matching reply type.
+ * Returns 0 on success or the first error.
+ */
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+	const int to_nodeid = r->res_nodeid;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = add_to_waiters(lkb, mstype, to_nodeid);
+	if (error)
+		return error;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
+	if (!error) {
+		send_args(r, lkb, ms);
+		error = send_message(mh, ms, r->res_name, r->res_length);
+	}
+
+	if (error)
+		remove_from_waiters(lkb, msg_reply_type(mstype));
+	return error;
+}
+
+/* Send a DLM_MSG_REQUEST for @lkb via send_common(); returns its result. */
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+/*
+ * send_convert - send a DLM_MSG_CONVERT for @lkb.
+ *
+ * For a down conversion that was sent successfully, the lkb is removed
+ * from the waiters list again and a convert reply with result 0 is
+ * processed locally using the lockspace's ls_local_ms.  Returns the
+ * result of send_common().
+ */
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	error = send_common(r, lkb, DLM_MSG_CONVERT);
+	if (error || !down_conversion(lkb))
+		return error;
+
+	/* down conversions go without a reply from the master */
+	remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+
+	ls = r->res_ls;
+	ls->ls_local_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+	ls->ls_local_ms.m_result = 0;
+	__receive_convert_reply(r, lkb, &ls->ls_local_ms, true);
+
+	return error;
+}
+
+/*
+ * Send a DLM_MSG_UNLOCK for @lkb via send_common(); returns its result.
+ *
+ * FIXME: if this lkb is the only lock we hold on the rsb, then set
+ * MASTER_UNCERTAIN to force the next request on the rsb to confirm
+ * that the master is still correct.
+ */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+/* Send a DLM_MSG_CANCEL for @lkb via send_common(); returns its result. */
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+/*
+ * send_grant - send a DLM_MSG_GRANT with result 0 to lkb->lkb_nodeid.
+ * Returns the create_message() error, or the result of send_message().
+ */
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int to_nodeid = lkb->lkb_nodeid;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh,
+			       GFP_NOFS);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = 0;
+
+	return send_message(mh, ms, r->res_name, r->res_length);
+}
+
+/*
+ * send_bast - send a DLM_MSG_BAST carrying @mode to lkb->lkb_nodeid.
+ * Returns the create_message() error, or the result of send_message().
+ */
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+	const int to_nodeid = lkb->lkb_nodeid;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh,
+			       GFP_NOFS);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+
+	ms->m_bastmode = cpu_to_le32(mode);
+
+	return send_message(mh, ms, r->res_name, r->res_length);
+}
+
+/*
+ * send_lookup - send a DLM_MSG_LOOKUP for @r to its directory node.
+ *
+ * The lkb is added to the waiters list first; if creating or sending the
+ * message then fails it is removed again with DLM_MSG_LOOKUP_REPLY.
+ * Returns 0 on success or the first error.
+ */
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int to_nodeid = dlm_dir_nodeid(r);
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
+	if (error)
+		return error;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh,
+			       GFP_NOFS);
+	if (!error) {
+		send_args(r, lkb, ms);
+		error = send_message(mh, ms, r->res_name, r->res_length);
+	}
+
+	if (error)
+		remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+	return error;
+}
+
+/*
+ * send_remove - send a DLM_MSG_REMOVE carrying the resource name and hash
+ * of @r to its directory node.  The message is created with GFP_ATOMIC.
+ * Returns the create_message() error, or the result of send_message().
+ */
+static int send_remove(struct dlm_rsb *r)
+{
+	const int to_nodeid = dlm_dir_nodeid(r);
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh,
+			       GFP_ATOMIC);
+	if (error)
+		return error;
+
+	memcpy(ms->m_extra, r->res_name, r->res_length);
+	ms->m_hash = cpu_to_le32(r->res_hash);
+
+	return send_message(mh, ms, r->res_name, r->res_length);
+}
+
+/*
+ * send_common_reply - send a reply of type @mstype to lkb->lkb_nodeid
+ * with @rv, converted by to_dlm_errno(), as the result.  Returns the
+ * create_message() error, or the result of send_message().
+ */
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			     int mstype, int rv)
+{
+	const int to_nodeid = lkb->lkb_nodeid;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
+
+	return send_message(mh, ms, r->res_name, r->res_length);
+}
+
+/* Reply to a request with result @rv (DLM_MSG_REQUEST_REPLY). */
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+/* Reply to a convert with result @rv (DLM_MSG_CONVERT_REPLY). */
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+/* Reply to an unlock with result @rv (DLM_MSG_UNLOCK_REPLY). */
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+/* Reply to a cancel with result @rv (DLM_MSG_CANCEL_REPLY). */
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+/*
+ * send_lookup_reply - answer the lookup in @ms_in, sending the reply to
+ * the node named in its header.
+ *
+ * The reply echoes the lock id from @ms_in and carries @ret_nodeid plus
+ * @rv converted by to_dlm_errno().  It is built against the lockspace's
+ * ls_local_rsb.  Returns the create_message() error, or the result of
+ * send_message().
+ */
+static int send_lookup_reply(struct dlm_ls *ls,
+			     const struct dlm_message *ms_in, int ret_nodeid,
+			     int rv)
+{
+	const int nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
+	struct dlm_rsb *r = &ls->ls_local_rsb;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh,
+			       GFP_NOFS);
+	if (error)
+		return error;
+
+	ms->m_lkid = ms_in->m_lkid;
+	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
+	ms->m_nodeid = cpu_to_le32(ret_nodeid);
+
+	return send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in));
+}
+
+/* which args we save from a received message depends heavily on the type
+ of message, unlike the send side where we can safely send everything about
+ the lkb for any type of message */
+
+/* Copy the exflags, sbflags and dflags carried in @ms into @lkb. */
+static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms)
+{
+ lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
+ dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags));
+ dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
+}
+
+/*
+ * Copy the sbflags and dflags carried in a reply into @lkb.  Nothing is
+ * copied when @local is true.
+ */
+static void receive_flags_reply(struct dlm_lkb *lkb,
+				const struct dlm_message *ms,
+				bool local)
+{
+	if (!local) {
+		dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags));
+		dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
+	}
+}
+
+/*
+ * Number of payload bytes that follow the fixed struct dlm_message,
+ * computed from the length field in the message header.
+ */
+static int receive_extralen(const struct dlm_message *ms)
+{
+	int total = le16_to_cpu(ms->m_header.h_length);
+
+	return (total - sizeof(struct dlm_message));
+}
+
+/*
+ * receive_lvb - copy the lvb carried in @ms into @lkb when the lkb has
+ * DLM_LKF_VALBLK set, allocating the lkb's lvb buffer on first use.  At
+ * most ls->ls_lvblen bytes are copied.
+ *
+ * Returns 0, or -ENOMEM when the lvb buffer cannot be allocated.
+ *
+ * NOTE(review): assumes the header length was validated before we get
+ * here, so that receive_extralen() is not negative - confirm.
+ */
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       const struct dlm_message *ms)
+{
+	int len;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return 0;
+
+	if (!lkb->lkb_lvbptr) {
+		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+	}
+
+	len = receive_extralen(ms);
+	if (len > ls->ls_lvblen)
+		len = ls->ls_lvblen;
+	memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+	return 0;
+}
+
+/*
+ * Placeholder blocking-ast callback stored by receive_request_args() when
+ * the message has DLM_CB_BAST set; it only logs that it was called.
+ */
+static void fake_bastfn(void *astparam, int mode)
+{
+ log_print("fake_bastfn should not be called");
+}
+
+/*
+ * Placeholder completion-ast callback stored by receive_request_args()
+ * when the message has DLM_CB_CAST set; it only logs that it was called.
+ */
+static void fake_astfn(void *astparam)
+{
+ log_print("fake_astfn should not be called");
+}
+
+/*
+ * receive_request_args - initialise a freshly created lkb from a received
+ * request message.
+ *
+ * The sender's nodeid, pid, lock id and requested mode are copied in, the
+ * granted mode starts as DLM_LOCK_IV, and placeholder ast/bast callbacks
+ * are installed according to the m_asts bits.  An lvb buffer is allocated
+ * when the lkb has DLM_LKF_VALBLK set.
+ *
+ * Returns 0, or -ENOMEM when the lvb cannot be allocated.
+ */
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				const struct dlm_message *ms)
+{
+	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+	lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
+	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
+
+	if (ms->m_asts & cpu_to_le32(DLM_CB_BAST))
+		lkb->lkb_bastfn = &fake_bastfn;
+	else
+		lkb->lkb_bastfn = NULL;
+
+	if (ms->m_asts & cpu_to_le32(DLM_CB_CAST))
+		lkb->lkb_astfn = &fake_astfn;
+	else
+		lkb->lkb_astfn = NULL;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return 0;
+
+	/* lkb was just created so there won't be an lvb yet */
+	lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+	if (!lkb->lkb_lvbptr)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * receive_convert_args - take the lvb, requested mode and lvb sequence
+ * number from a received convert message.
+ *
+ * Returns -EBUSY when lkb_status is not DLM_LKSTS_GRANTED, -ENOMEM when
+ * receive_lvb() fails, 0 otherwise.
+ */
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				const struct dlm_message *ms)
+{
+	int rv;
+
+	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+		return -EBUSY;
+
+	rv = receive_lvb(ls, lkb, ms);
+	if (rv)
+		return -ENOMEM;
+
+	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
+	lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
+
+	return 0;
+}
+
+/*
+ * receive_unlock_args - take the lvb from a received unlock message.
+ * Returns -ENOMEM when receive_lvb() fails, 0 otherwise.
+ */
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			       const struct dlm_message *ms)
+{
+	return receive_lvb(ls, lkb, ms) ? -ENOMEM : 0;
+}
+
+/* We fill in the local-lkb fields with the info that send_xxxx_reply()
+   uses to send a reply and that the remote end uses to process the reply:
+   the sender's nodeid and the sender's lock id. */
+
+static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	struct dlm_lkb *stub = &ls->ls_local_lkb;
+
+	stub->lkb_remid = le32_to_cpu(ms->m_lkid);
+	stub->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+}
+
+/*
+ * validate_message - check that a received message is plausible for @lkb.
+ *
+ * This is called after the rsb is locked so that we can safely inspect
+ * fields in the lkb.
+ *
+ * Rejects a message flagged as a user lock when the lkb is not one,
+ * operations that do not target a master copy owned by the sender,
+ * replies/grants/basts that do not target a process copy mastered by the
+ * sender, and unknown message types.  A request reply is also accepted
+ * while lkb_nodeid is still -1.
+ *
+ * Returns 0 when the message is acceptable, -EINVAL (logged) otherwise.
+ */
+
+static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms)
+{
+ int from = le32_to_cpu(ms->m_header.h_nodeid);
+ int error = 0;
+
+ /* currently mixing of user/kernel locks are not supported */
+ if (ms->m_flags & cpu_to_le32(BIT(DLM_DFL_USER_BIT)) &&
+ !test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
+ log_error(lkb->lkb_resource->res_ls,
+ "got user dlm message for a kernel lock");
+ error = -EINVAL;
+ goto out;
+ }
+
+ switch (ms->m_type) {
+ /* operations sent to the master: need a master copy from the sender */
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_CANCEL):
+ if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ /* messages from the master: need a process copy mastered by sender */
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
+ case cpu_to_le32(DLM_MSG_BAST):
+ if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
+ if (!is_process_copy(lkb))
+ error = -EINVAL;
+ else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ default:
+ error = -EINVAL;
+ }
+
+out:
+ if (error)
+ log_error(lkb->lkb_resource->res_ls,
+ "ignore invalid message %d from %d %x %x %x %d",
+ le32_to_cpu(ms->m_type), from, lkb->lkb_id,
+ lkb->lkb_remid, dlm_iflags_val(lkb),
+ lkb->lkb_nodeid);
+ return error;
+}
+
+/*
+ * receive_request - handle a DLM_MSG_REQUEST from another node.
+ *
+ * Creates a master-copy lkb from the message, finds the rsb named in the
+ * message payload, runs do_request() on it and sends the result back in a
+ * request reply.
+ *
+ * Returns 0 once do_request() has run, whatever its result.  On an
+ * earlier failure the new lkb is released, a reply carrying the error is
+ * sent using the lockspace's local rsb/lkb, and the error is returned.
+ */
+static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int from_nodeid;
+ int error, namelen = 0;
+
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+ set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags);
+ error = receive_request_args(ls, lkb, ms);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ /* The dir node is the authority on whether we are the master
+ for this rsb or not, so if the master sends us a request, we should
+ recreate the rsb if we've destroyed it. This race happens when we
+ send a remove message to the dir node at the same time that the dir
+ node sends us a request for the rsb. */
+
+ namelen = receive_extralen(ms);
+
+ error = find_rsb(ls, ms->m_extra, namelen, from_nodeid,
+ R_RECEIVE_REQUEST, &r);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ lock_rsb(r);
+
+ if (r->res_master_nodeid != dlm_our_nodeid()) {
+ error = validate_master_nodeid(ls, r, from_nodeid);
+ if (error) {
+ unlock_rsb(r);
+ put_rsb(r);
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+ }
+
+ attach_lkb(r, lkb);
+ error = do_request(r, lkb);
+ send_request_reply(r, lkb, error);
+ do_request_effects(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ /* a queued request keeps its lkb; any other error drops it */
+ if (error == -EINPROGRESS)
+ error = 0;
+ if (error)
+ dlm_put_lkb(lkb);
+ return 0;
+
+ fail:
+ /* TODO: instead of returning ENOTBLK, add the lkb to res_lookup
+ and do this receive_request again from process_lookup_list once
+ we get the lookup reply. This would avoid many repeated
+ ENOTBLK request failures when the lookup reply designating us
+ as master is delayed. */
+
+ if (error != -ENOTBLK) {
+ log_limit(ls, "receive_request %x from %d %d",
+ le32_to_cpu(ms->m_lkid), from_nodeid, error);
+ }
+
+ setup_local_lkb(ls, ms);
+ send_request_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
+ return error;
+}
+
+/*
+ * receive_convert - handle a DLM_MSG_CONVERT from another node.
+ *
+ * Looks up the lkb by the message's m_remid, checks that its remid
+ * matches the sender's lock id, validates the message under the rsb lock
+ * and runs do_convert().  A convert reply is sent unless the conversion
+ * is a down conversion.
+ *
+ * Returns 0 once the lkb has been found and matched; a message rejected
+ * by validate_message() is dropped without a reply.  If the lkb is not
+ * found or does not match, a reply carrying the error is sent using the
+ * lockspace's local rsb/lkb and the error is returned.
+ */
+static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, reply = 1;
+
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+ if (error)
+ goto fail;
+
+ if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
+ log_error(ls, "receive_convert %x remid %x recover_seq %llu "
+ "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
+ (unsigned long long)lkb->lkb_recover_seq,
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid));
+ error = -ENOENT;
+ dlm_put_lkb(lkb);
+ goto fail;
+ }
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
+ receive_flags(lkb, ms);
+
+ error = receive_convert_args(ls, lkb, ms);
+ if (error) {
+ send_convert_reply(r, lkb, error);
+ goto out;
+ }
+
+ /* decided before do_convert() runs */
+ reply = !down_conversion(lkb);
+
+ error = do_convert(r, lkb);
+ if (reply)
+ send_convert_reply(r, lkb, error);
+ do_convert_effects(r, lkb, error);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return 0;
+
+ fail:
+ setup_local_lkb(ls, ms);
+ send_convert_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
+ return error;
+}
+
+/* Handle an unlock message for the lkb identified by ms->m_remid.
+   Returns 0 once the lkb has been found and matched against ms->m_lkid;
+   otherwise an error reply is sent through ls->ls_local_rsb/ls_local_lkb
+   and the error is returned. */
+static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *rsb;
+	int rv;
+
+	rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+	if (rv)
+		goto fail;
+
+	/* the sender's lock id must match the remote id recorded in the lkb */
+	if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
+		log_error(ls, "receive_unlock %x remid %x remote %d %x",
+			  lkb->lkb_id, lkb->lkb_remid,
+			  le32_to_cpu(ms->m_header.h_nodeid),
+			  le32_to_cpu(ms->m_lkid));
+		dlm_put_lkb(lkb);
+		rv = -ENOENT;
+		goto fail;
+	}
+
+	rsb = lkb->lkb_resource;
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_message(lkb, ms);
+	if (!rv) {
+		receive_flags(lkb, ms);
+
+		rv = receive_unlock_args(ls, lkb, ms);
+		if (rv) {
+			send_unlock_reply(rsb, lkb, rv);
+		} else {
+			rv = do_unlock(rsb, lkb);
+			send_unlock_reply(rsb, lkb, rv);
+			do_unlock_effects(rsb, lkb, rv);
+		}
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	dlm_put_lkb(lkb);
+	return 0;
+
+ fail:
+	setup_local_lkb(ls, ms);
+	send_unlock_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, rv);
+	return rv;
+}
+
+/* Handle a cancel message for the lkb identified by ms->m_remid.
+   Returns 0 when the lkb was found; otherwise the find_lkb() error is sent
+   back through ls->ls_local_rsb/ls_local_lkb and returned. */
+static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *rsb;
+	int rv;
+
+	rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+	if (rv) {
+		/* no lkb to answer from, reply using the lockspace stubs */
+		setup_local_lkb(ls, ms);
+		send_cancel_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, rv);
+		return rv;
+	}
+
+	receive_flags(lkb, ms);
+
+	rsb = lkb->lkb_resource;
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_message(lkb, ms);
+	if (!rv) {
+		rv = do_cancel(rsb, lkb);
+		send_cancel_reply(rsb, lkb, rv);
+		do_cancel_effects(rsb, lkb, rv);
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+/* Handle a grant message: apply the reply flags, grant the lock on the
+   lkb identified by ms->m_remid and queue a completion cast with status 0.
+   Returns the find_lkb() error if the lkb is unknown, otherwise 0 (also
+   when validate_message() rejects the message). */
+static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *rsb;
+	int rv;
+
+	rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+	if (rv)
+		return rv;
+
+	rsb = lkb->lkb_resource;
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_message(lkb, ms);
+	if (!rv) {
+		receive_flags_reply(lkb, ms, false);
+		if (is_altmode(lkb))
+			munge_altmode(lkb, ms);
+		grant_lock_pc(rsb, lkb, ms);
+		queue_cast(rsb, lkb, 0);
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+/* Handle a bast message: queue a blocking callback for the mode carried
+   in ms->m_bastmode and record that mode in lkb_highbast.  Returns the
+   find_lkb() error if the lkb is unknown, otherwise 0 (also when
+   validate_message() rejects the message). */
+static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *rsb;
+	uint32_t bastmode;
+	int rv;
+
+	rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+	if (rv)
+		return rv;
+
+	rsb = lkb->lkb_resource;
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	rv = validate_message(lkb, ms);
+	if (!rv) {
+		bastmode = le32_to_cpu(ms->m_bastmode);
+		queue_bast(rsb, lkb, bastmode);
+		lkb->lkb_highbast = bastmode;
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+	dlm_put_lkb(lkb);
+	return 0;
+}
+
+/* Handle a lookup message: ask dlm_master_lookup() which node masters the
+   resource named in ms->m_extra.  If the answer is this node, the message
+   is processed as a request; otherwise the node id and lookup result are
+   sent back with send_lookup_reply(). */
+static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	int len, error, from_nodeid, our_nodeid;
+	/* Start from -1: ret_nodeid is sent to the requester even when the
+	   lookup fails, and nothing at this call site guarantees that
+	   dlm_master_lookup() stores a value on its error paths.  Without
+	   the initializer an indeterminate stack value could be put on the
+	   wire.  -1 is the value receive_lookup_reply() already handles by
+	   resetting the rsb master fields. */
+	int ret_nodeid = -1;
+
+	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+	our_nodeid = dlm_our_nodeid();
+
+	len = receive_extralen(ms);
+
+	error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0,
+				  &ret_nodeid, NULL);
+
+	/* Optimization: we're master so treat lookup as a request */
+	if (!error && ret_nodeid == our_nodeid) {
+		receive_request(ls, ms);
+		return;
+	}
+	send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms)
+{ /* Handle a remove message: if the resource named in ms->m_extra sits
+     on this bucket's toss tree and is mastered by the sender, drop its
+     reference and free it.  Every other outcome is only logged. */
+ char name[DLM_RESNAME_MAXLEN+1];
+ struct dlm_rsb *r;
+ uint32_t hash, b;
+ int rv, len, dir_nodeid, from_nodeid;
+
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+
+ len = receive_extralen(ms);
+
+ if (len > DLM_RESNAME_MAXLEN) {
+ log_error(ls, "receive_remove from %d bad len %d",
+ from_nodeid, len);
+ return;
+ }
+
+ dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash));
+ if (dir_nodeid != dlm_our_nodeid()) {
+ log_error(ls, "receive_remove from %d bad nodeid %d",
+ from_nodeid, dir_nodeid);
+ return;
+ }
+
+ /* Look for name on rsbtbl.toss, if it's there, kill it.
+ If it's on rsbtbl.keep, it's being used, and we should ignore this
+ message. This is an expected race between the dir node sending a
+ request to the master node at the same time as the master node sends
+ a remove to the dir node. The resolution to that race is for the
+ dir node to ignore the remove message, and the master node to
+ recreate the master rsb when it gets a request from the dir node for
+ an rsb it doesn't have. */
+
+ memset(name, 0, sizeof(name)); /* name[] has one spare byte, so it stays NUL-terminated for the %s logging below */
+ memcpy(name, ms->m_extra, len);
+
+ hash = jhash(name, len, 0);
+ b = hash & (ls->ls_rsbtbl_size - 1); /* bucket index; the mask presumes ls_rsbtbl_size is a power of two - confirm */
+
+ spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
+ if (rv) {
+ /* verify the rsb is on keep list per comment above */
+ rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
+ if (rv) {
+ /* should not happen */
+ log_error(ls, "receive_remove from %d not found %s",
+ from_nodeid, name);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+ if (r->res_master_nodeid != from_nodeid) {
+ /* should not happen */
+ log_error(ls, "receive_remove keep from %d master %d",
+ from_nodeid, r->res_master_nodeid);
+ dlm_print_rsb(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
+ log_debug(ls, "receive_remove from %d master %d first %x %s",
+ from_nodeid, r->res_master_nodeid, r->res_first_lkid,
+ name);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
+ if (r->res_master_nodeid != from_nodeid) {
+ log_error(ls, "receive_remove toss from %d master %d",
+ from_nodeid, r->res_master_nodeid);
+ dlm_print_rsb(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
+ if (kref_put(&r->res_ref, kill_rsb)) {
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ dlm_free_rsb(r); /* freed only after the rsb is off the tree and the bucket lock is dropped */
+ } else {
+ log_error(ls, "receive_remove from %d rsb ref error",
+ from_nodeid);
+ dlm_print_rsb(r);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ }
+}
+
+/* Handle a purge message: forward the node id and pid carried in the
+   message to do_purge(). */
+static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms)
+{
+	uint32_t nodeid = le32_to_cpu(ms->m_nodeid);
+	uint32_t pid = le32_to_cpu(ms->m_pid);
+
+	do_purge(ls, nodeid, pid);
+}
+
+static int receive_request_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
+{ /* Handle the reply to a request (or to a lookup that the remote node
+     answered as a request).  ms->m_result carries the remote do_request()
+     result and selects how the local lkb is completed, queued or retried.
+     Returns the find_lkb() error if ms->m_remid is unknown, otherwise 0. */
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, mstype, result;
+ int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+ if (error)
+ return error;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
+ mstype = lkb->lkb_wait_type; /* sampled before remove_from_waiters(); used for the LOOKUP check below */
+ error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
+ if (error) {
+ log_error(ls, "receive_request_reply %x remote %d %x result %d",
+ lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
+ dlm_dump_rsb(r);
+ goto out;
+ }
+
+ /* Optimization: the dir node was also the master, so it took our
+ lookup as a request and sent request reply instead of lookup reply */
+ if (mstype == DLM_MSG_LOOKUP) {
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ lkb->lkb_nodeid = from_nodeid;
+ }
+
+ /* this is the value returned from do_request() on the master */
+ result = from_dlm_errno(le32_to_cpu(ms->m_result));
+
+ switch (result) {
+ case -EAGAIN:
+ /* request would block (be queued) on remote master */
+ queue_cast(r, lkb, -EAGAIN);
+ confirm_master(r, -EAGAIN);
+ unhold_lkb(lkb); /* undoes create_lkb() */
+ break;
+
+ case -EINPROGRESS:
+ case 0:
+ /* request was queued or granted on remote master */
+ receive_flags_reply(lkb, ms, false);
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
+ if (is_altmode(lkb))
+ munge_altmode(lkb, ms);
+ if (result) { /* -EINPROGRESS: queued remotely, so put it on the waiting queue */
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ } else {
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ }
+ confirm_master(r, result);
+ break;
+
+ case -EBADR:
+ case -ENOTBLK:
+ /* find_rsb failed to find rsb or rsb wasn't master */
+ log_limit(ls, "receive_request_reply %x from %d %d "
+ "master %d dir %d first %x %s", lkb->lkb_id,
+ from_nodeid, result, r->res_master_nodeid,
+ r->res_dir_nodeid, r->res_first_lkid, r->res_name);
+
+ if (r->res_dir_nodeid != dlm_our_nodeid() &&
+ r->res_master_nodeid != dlm_our_nodeid()) {
+ /* cause _request_lock->set_master->send_lookup */
+ r->res_master_nodeid = 0;
+ r->res_nodeid = -1;
+ lkb->lkb_nodeid = -1;
+ }
+
+ if (is_overlap(lkb)) {
+ /* we'll ignore error in cancel/unlock reply */
+ queue_cast_overlap(r, lkb);
+ confirm_master(r, result);
+ unhold_lkb(lkb); /* undoes create_lkb() */
+ } else {
+ _request_lock(r, lkb);
+
+ if (r->res_master_nodeid == dlm_our_nodeid())
+ confirm_master(r, 0);
+ }
+ break;
+
+ default:
+ log_error(ls, "receive_request_reply %x error %d",
+ lkb->lkb_id, result);
+ }
+
+ if ((result == 0 || result == -EINPROGRESS) && /* overlap flags: follow up with unlock or cancel, or just clear them */
+ test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) {
+ log_debug(ls, "receive_request_reply %x result %d unlock",
+ lkb->lkb_id, result);
+ clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+ send_unlock(r, lkb);
+ } else if ((result == -EINPROGRESS) &&
+ test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT,
+ &lkb->lkb_iflags)) {
+ log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
+ clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+ send_cancel(r, lkb);
+ } else {
+ clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
+ clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
+ }
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return 0;
+}
+
+/* Apply the outcome of a convert to the local lkb.  ms->m_result holds
+   the value do_convert() returned on the master; local is passed through
+   to receive_flags_reply().  The caller is expected to hold the rsb
+   lock. */
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    const struct dlm_message *ms, bool local)
+{
+	int result = from_dlm_errno(le32_to_cpu(ms->m_result));
+
+	switch (result) {
+	case -EAGAIN:
+		/* convert would block (be queued) on remote master */
+		queue_cast(r, lkb, -EAGAIN);
+		break;
+
+	case -EDEADLK:
+		receive_flags_reply(lkb, ms, local);
+		revert_lock_pc(r, lkb);
+		queue_cast(r, lkb, -EDEADLK);
+		break;
+
+	case -EINPROGRESS:
+	case 0:
+		/* both outcomes share the flag and demotion handling */
+		receive_flags_reply(lkb, ms, local);
+		if (is_demoted(lkb))
+			munge_demoted(lkb);
+		if (result) {
+			/* convert was queued on remote master */
+			del_lkb(r, lkb);
+			add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		} else {
+			/* convert was granted on remote master */
+			grant_lock_pc(r, lkb, ms);
+			queue_cast(r, lkb, 0);
+		}
+		break;
+
+	default:
+		log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
+			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+			  le32_to_cpu(ms->m_lkid), result);
+		dlm_print_rsb(r);
+		dlm_print_lkb(lkb);
+	}
+}
+
+/* Process a convert reply for lkb with its rsb held and locked.  The
+   reply is applied only if the message validates and the lkb could be
+   taken off the waiters list. */
+static void _receive_convert_reply(struct dlm_lkb *lkb,
+				   const struct dlm_message *ms, bool local)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	/* local reply can happen with waiters_mutex held */
+	if (!validate_message(lkb, ms) &&
+	    !remove_from_waiters_ms(lkb, ms, local))
+		__receive_convert_reply(rsb, lkb, ms, local);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+}
+
+/* Handle a convert reply received from another node.  Returns the
+   find_lkb() error when ms->m_remid is unknown, otherwise 0. */
+static int receive_convert_reply(struct dlm_ls *ls,
+				 const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+
+	if (!rv) {
+		_receive_convert_reply(lkb, ms, false);
+		dlm_put_lkb(lkb);
+	}
+	return rv;
+}
+
+/* Process an unlock reply for lkb with its rsb held and locked.  The
+   reply is applied only if the message validates and the lkb could be
+   taken off the waiters list. */
+static void _receive_unlock_reply(struct dlm_lkb *lkb,
+				  const struct dlm_message *ms, bool local)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+	int result;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	/* local reply can happen with waiters_mutex held */
+	if (!validate_message(lkb, ms) &&
+	    !remove_from_waiters_ms(lkb, ms, local)) {
+		/* this is the value returned from do_unlock() on the master */
+		result = from_dlm_errno(le32_to_cpu(ms->m_result));
+
+		switch (result) {
+		case -DLM_EUNLOCK:
+			receive_flags_reply(lkb, ms, local);
+			remove_lock_pc(rsb, lkb);
+			queue_cast(rsb, lkb, -DLM_EUNLOCK);
+			break;
+		case -ENOENT:
+			/* nothing to do */
+			break;
+		default:
+			log_error(rsb->res_ls, "receive_unlock_reply %x error %d",
+				  lkb->lkb_id, result);
+		}
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+}
+
+/* Handle an unlock reply received from another node.  Returns the
+   find_lkb() error when ms->m_remid is unknown, otherwise 0. */
+static int receive_unlock_reply(struct dlm_ls *ls,
+				const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+
+	if (!rv) {
+		_receive_unlock_reply(lkb, ms, false);
+		dlm_put_lkb(lkb);
+	}
+	return rv;
+}
+
+/* Process a cancel reply for lkb with its rsb held and locked.  The
+   reply is applied only if the message validates and the lkb could be
+   taken off the waiters list. */
+static void _receive_cancel_reply(struct dlm_lkb *lkb,
+				  const struct dlm_message *ms, bool local)
+{
+	struct dlm_rsb *rsb = lkb->lkb_resource;
+	int result;
+
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	/* local reply can happen with waiters_mutex held */
+	if (!validate_message(lkb, ms) &&
+	    !remove_from_waiters_ms(lkb, ms, local)) {
+		/* this is the value returned from do_cancel() on the master */
+		result = from_dlm_errno(le32_to_cpu(ms->m_result));
+
+		switch (result) {
+		case -DLM_ECANCEL:
+			receive_flags_reply(lkb, ms, local);
+			revert_lock_pc(rsb, lkb);
+			queue_cast(rsb, lkb, -DLM_ECANCEL);
+			break;
+		case 0:
+			/* nothing to do */
+			break;
+		default:
+			log_error(rsb->res_ls, "receive_cancel_reply %x error %d",
+				  lkb->lkb_id, result);
+		}
+	}
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+}
+
+/* Handle a cancel reply received from another node.  Returns the
+   find_lkb() error when ms->m_remid is unknown, otherwise 0. */
+static int receive_cancel_reply(struct dlm_ls *ls,
+				const struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int rv = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
+
+	if (!rv) {
+		_receive_cancel_reply(lkb, ms, false);
+		dlm_put_lkb(lkb);
+	}
+	return rv;
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
+{ /* Handle a lookup reply: record the master node id carried in
+     ms->m_nodeid on the rsb, then either complete an overlapping
+     unlock/cancel or re-drive the request with _request_lock().  Note
+     that the lkb is looked up by ms->m_lkid here and that ms->m_result
+     is not examined. */
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, ret_nodeid;
+ int do_lookup_list = 0;
+
+ error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb);
+ if (error) {
+ log_error(ls, "%s no lkid %x", __func__,
+ le32_to_cpu(ms->m_lkid));
+ return;
+ }
+
+ /* ms->m_result is the value returned by dlm_master_lookup on dir node
+ FIXME: will a non-zero error ever be returned? */
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+ if (error)
+ goto out;
+
+ ret_nodeid = le32_to_cpu(ms->m_nodeid);
+
+ /* We sometimes receive a request from the dir node for this
+ rsb before we've received the dir node's lookup_reply for it.
+ The request from the dir node implies we're the master, so we set
+ ourselves as master in receive_request_reply, and verify here that
+ we are indeed the master. */
+
+ if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) {
+ /* This should never happen */
+ log_error(ls, "receive_lookup_reply %x from %d ret %d "
+ "master %d dir %d our %d first %x %s",
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+ ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
+ dlm_our_nodeid(), r->res_first_lkid, r->res_name);
+ }
+
+ if (ret_nodeid == dlm_our_nodeid()) {
+ r->res_master_nodeid = ret_nodeid;
+ r->res_nodeid = 0;
+ do_lookup_list = 1; /* process_lookup_list(r) runs below once this lkb is handled */
+ r->res_first_lkid = 0;
+ } else if (ret_nodeid == -1) {
+ /* the remote node doesn't believe it's the dir node */
+ log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid));
+ r->res_master_nodeid = 0;
+ r->res_nodeid = -1;
+ lkb->lkb_nodeid = -1;
+ } else {
+ /* set_master() will set lkb_nodeid from r */
+ r->res_master_nodeid = ret_nodeid;
+ r->res_nodeid = ret_nodeid;
+ }
+
+ if (is_overlap(lkb)) {
+ log_debug(ls, "receive_lookup_reply %x unlock %x",
+ lkb->lkb_id, dlm_iflags_val(lkb));
+ queue_cast_overlap(r, lkb);
+ unhold_lkb(lkb); /* undoes create_lkb() */
+ goto out_list;
+ }
+
+ _request_lock(r, lkb);
+
+ out_list:
+ if (do_lookup_list)
+ process_lookup_list(r);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+}
+
+static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms,
+ uint32_t saved_seq)
+{ /* Dispatch one message to the receive_*() handler selected by
+     ms->m_type, then log -ENOENT and -EINVAL handler results.  A message
+     whose h_nodeid is not a lockspace member is logged and dropped.
+     saved_seq is only used in the log output. */
+ int error = 0, noent = 0;
+
+ if (WARN_ON_ONCE(!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid)))) {
+ log_limit(ls, "receive %d from non-member %d %x %x %d",
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
+ return;
+ }
+
+ switch (ms->m_type) { /* compared in wire (little-endian) form, hence cpu_to_le32() on each case label */
+
+ /* messages sent to a master node */
+
+ case cpu_to_le32(DLM_MSG_REQUEST):
+ error = receive_request(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ error = receive_convert(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ error = receive_unlock(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_CANCEL):
+ noent = 1; /* a missing lkb is only logged at debug level for this type */
+ error = receive_cancel(ls, ms);
+ break;
+
+ /* messages sent from a master node (replies to above) */
+
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
+ error = receive_request_reply(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ error = receive_convert_reply(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
+ error = receive_unlock_reply(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
+ error = receive_cancel_reply(ls, ms);
+ break;
+
+ /* messages sent from a master node (only two types of async msg) */
+
+ case cpu_to_le32(DLM_MSG_GRANT):
+ noent = 1;
+ error = receive_grant(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_BAST):
+ noent = 1;
+ error = receive_bast(ls, ms);
+ break;
+
+ /* messages sent to a dir node */
+
+ case cpu_to_le32(DLM_MSG_LOOKUP):
+ receive_lookup(ls, ms);
+ break;
+
+ case cpu_to_le32(DLM_MSG_REMOVE):
+ receive_remove(ls, ms);
+ break;
+
+ /* messages sent from a dir node (remove has no reply) */
+
+ case cpu_to_le32(DLM_MSG_LOOKUP_REPLY):
+ receive_lookup_reply(ls, ms);
+ break;
+
+ /* other messages */
+
+ case cpu_to_le32(DLM_MSG_PURGE):
+ receive_purge(ls, ms);
+ break;
+
+ default:
+ log_error(ls, "unknown message type %d",
+ le32_to_cpu(ms->m_type));
+ }
+
+ /*
+ * When checking for ENOENT, we're checking the result of
+ * find_lkb(m_remid):
+ *
+ * The lock id referenced in the message wasn't found. This may
+ * happen in normal usage for the async messages and cancel, so
+ * only use log_debug for them.
+ *
+ * Some errors are expected and normal.
+ */
+
+ if (error == -ENOENT && noent) {
+ log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
+ } else if (error == -ENOENT) {
+ log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
+
+ if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT))
+ dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash));
+ }
+
+ if (error == -EINVAL) {
+ log_error(ls, "receive %d inval from %d lkid %x remid %x "
+ "saved_seq %u",
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ saved_seq);
+ }
+}
+
+/* Route a normal locking message.  While locking is stopped (recovery
+   mode) the message is saved on the requestqueue for processing after
+   recovery.  Otherwise we first wait for dlm_recoverd to drain anything
+   still saved on the requestqueue, so that saved messages are processed
+   before new ones, and then process the message directly. */
+
+static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms,
+				int nodeid)
+{
+	if (!dlm_locking_stopped(ls)) {
+		dlm_wait_requestqueue(ls);
+		_receive_message(ls, ms, 0);
+		return;
+	}
+
+	/* If we were a member of this lockspace, left, and rejoined,
+	   other nodes may still be sending us messages from the
+	   lockspace generation before we left. */
+	if (WARN_ON_ONCE(!ls->ls_generation)) {
+		log_limit(ls, "receive %d from %d ignore old gen",
+			  le32_to_cpu(ms->m_type), nodeid);
+		return;
+	}
+
+	dlm_add_requestqueue(ls, nodeid, ms);
+}
+
+/* This is called by dlm_recoverd to process messages that were saved on
+   the requestqueue.  The message is handed straight to
+   _receive_message(), which uses saved_seq only in its log output. */
+
+void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms,
+ uint32_t saved_seq)
+{
+ _receive_message(ls, ms, saved_seq);
+}
+
+/* This is called by the midcomms layer when something is received for
+   the lockspace.  It could be either a MSG (normal message sent as part of
+   standard locking activity) or an RCOM (recovery message sent as part of
+   lockspace recovery).
+
+   p is the received packet and nodeid the node it arrived from.  Packets
+   with an unknown h_cmd, with a header h_nodeid that differs from nodeid,
+   or for a lockspace that is not found are dropped.  For an unknown
+   lockspace, an RCOM status message is additionally answered with
+   dlm_send_ls_not_ready(). */
+
+void dlm_receive_buffer(const union dlm_packet *p, int nodeid)
+{
+	const struct dlm_header *hd = &p->header;
+	struct dlm_ls *ls;
+	int type = 0;
+
+	switch (hd->h_cmd) {
+	case DLM_MSG:
+		type = le32_to_cpu(p->message.m_type);
+		break;
+	case DLM_RCOM:
+		type = le32_to_cpu(p->rcom.rc_type);
+		break;
+	default:
+		/* nodeid is a signed int; print it with %d like every other
+		   message in this function */
+		log_print("invalid h_cmd %d from %d", hd->h_cmd, nodeid);
+		return;
+	}
+
+	if (le32_to_cpu(hd->h_nodeid) != nodeid) {
+		log_print("invalid h_nodeid %d from %d lockspace %x",
+			  le32_to_cpu(hd->h_nodeid), nodeid,
+			  le32_to_cpu(hd->u.h_lockspace));
+		return;
+	}
+
+	ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace));
+	if (!ls) {
+		if (dlm_config.ci_log_debug) {
+			printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
+				"%u from %d cmd %d type %d\n",
+				le32_to_cpu(hd->u.h_lockspace), nodeid,
+				hd->h_cmd, type);
+		}
+
+		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
+			dlm_send_ls_not_ready(nodeid, &p->rcom);
+		return;
+	}
+
+	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
+	   be inactive (in this ls) before transitioning to recovery mode */
+
+	down_read(&ls->ls_recv_active);
+	if (hd->h_cmd == DLM_MSG)
+		dlm_receive_message(ls, &p->message, nodeid);
+	else if (hd->h_cmd == DLM_RCOM)
+		dlm_receive_rcom(ls, &p->rcom, nodeid);
+	else
+		/* the switch above already rejected every other h_cmd; kept
+		   as a defensive check */
+		log_error(ls, "invalid h_cmd %d from %d lockspace %x",
+			  hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace));
+	up_read(&ls->ls_recv_active);
+
+	/* drop the reference taken by dlm_find_lockspace_global() */
+	dlm_put_lockspace(ls);
+}
+
+/* Recover an lkb that was waiting for a convert reply.  A middle
+   conversion is completed with a locally built -EINPROGRESS convert reply
+   (ms_local is scratch space for it); any other conversion with
+   rqmode >= grmode is flagged to be resent. */
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				   struct dlm_message *ms_local)
+{
+	if (!middle_conversion(lkb)) {
+		/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since
+		   down conversions are async; there's no reply from the
+		   remote master */
+		if (lkb->lkb_rqmode >= lkb->lkb_grmode)
+			set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+		return;
+	}
+
+	hold_lkb(lkb);
+
+	memset(ms_local, 0, sizeof(*ms_local));
+	ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+	ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
+	ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+	_receive_convert_reply(lkb, ms_local, true);
+
+	/* Same special case as in receive_rcom_lock_args() */
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+
+	unhold_lkb(lkb);
+}
+
+/* Returns 1 if a waiting lkb must be recovered, 0 otherwise.  That is
+   the case when no directory is used (the master node may be changing),
+   or when the node the lkb is waiting on has been removed.  dir_nodeid is
+   accepted but not consulted. */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				 int dir_nodeid)
+{
+	return dlm_no_directory(ls) ||
+	       dlm_is_removed(ls, lkb->lkb_wait_nodeid);
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+ gone. We can just complete unlocks and cancels by faking a reply from the
+ dead node. Requests and up-conversions we flag to be resent after
+ recovery. Down-conversions can just be completed with a fake reply like
+ unlocks. Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{ /* Walk ls->ls_waiters under ls_waiters_mutex.  Lookups are always
+     flagged for resend.  Other waiters are handled only if
+     waiter_needs_recovery() says so: requests are flagged for resend,
+     converts go through recover_convert_waiter(), and unlocks and cancels
+     are completed with a locally built reply in ms_local. */
+ struct dlm_lkb *lkb, *safe;
+ struct dlm_message *ms_local;
+ int wait_type, local_unlock_result, local_cancel_result;
+ int dir_nodeid;
+
+ ms_local = kmalloc(sizeof(*ms_local), GFP_KERNEL);
+ if (!ms_local)
+ return; /* no waiter is examined when the scratch message cannot be allocated */
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+
+ dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
+
+ /* exclude debug messages about unlocks because there can be so
+ many and they aren't very interesting */
+
+ if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+ log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+ "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
+ lkb->lkb_id,
+ lkb->lkb_remid,
+ lkb->lkb_wait_type,
+ lkb->lkb_resource->res_nodeid,
+ lkb->lkb_nodeid,
+ lkb->lkb_wait_nodeid,
+ dir_nodeid);
+ }
+
+ /* all outstanding lookups, regardless of destination will be
+ resent after recovery is done */
+
+ if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+ set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+ continue;
+ }
+
+ if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
+ continue;
+
+ wait_type = lkb->lkb_wait_type;
+ local_unlock_result = -DLM_EUNLOCK;
+ local_cancel_result = -DLM_ECANCEL;
+
+ /* Main reply may have been received leaving a zero wait_type,
+ but a reply for the overlapping op may not have been
+ received. In that case we need to fake the appropriate
+ reply for the overlap op. */
+
+ if (!wait_type) {
+ if (is_overlap_cancel(lkb)) {
+ wait_type = DLM_MSG_CANCEL;
+ if (lkb->lkb_grmode == DLM_LOCK_IV)
+ local_cancel_result = 0;
+ }
+ if (is_overlap_unlock(lkb)) { /* checked second, so unlock wins when both overlap flags are set */
+ wait_type = DLM_MSG_UNLOCK;
+ if (lkb->lkb_grmode == DLM_LOCK_IV)
+ local_unlock_result = -ENOENT;
+ }
+
+ log_debug(ls, "rwpre overlap %x %x %d %d %d",
+ lkb->lkb_id, dlm_iflags_val(lkb), wait_type,
+ local_cancel_result, local_unlock_result);
+ }
+
+ switch (wait_type) {
+
+ case DLM_MSG_REQUEST:
+ set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+ break;
+
+ case DLM_MSG_CONVERT:
+ recover_convert_waiter(ls, lkb, ms_local);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ hold_lkb(lkb); /* keep the lkb alive across the locally built reply */
+ memset(ms_local, 0, sizeof(struct dlm_message));
+ ms_local->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
+ ms_local->m_result = cpu_to_le32(to_dlm_errno(local_unlock_result));
+ ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+ _receive_unlock_reply(lkb, ms_local, true);
+ dlm_put_lkb(lkb);
+ break;
+
+ case DLM_MSG_CANCEL:
+ hold_lkb(lkb);
+ memset(ms_local, 0, sizeof(struct dlm_message));
+ ms_local->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
+ ms_local->m_result = cpu_to_le32(to_dlm_errno(local_cancel_result));
+ ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+ _receive_cancel_reply(lkb, ms_local, true);
+ dlm_put_lkb(lkb);
+ break;
+
+ default:
+ log_error(ls, "invalid lkb wait_type %d %d",
+ lkb->lkb_wait_type, wait_type);
+ }
+ schedule();
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+ kfree(ms_local);
+}
+
+/* Return the first lkb on ls->ls_waiters that has DLM_IFL_RESEND_BIT set,
+   with an extra reference taken on it, or NULL if there is none.  The
+   list is walked under ls_waiters_mutex. */
+static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
+{
+	struct dlm_lkb *found = NULL;
+	struct dlm_lkb *pos;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	list_for_each_entry(pos, &ls->ls_waiters, lkb_wait_reply) {
+		if (!test_bit(DLM_IFL_RESEND_BIT, &pos->lkb_iflags))
+			continue;
+
+		hold_lkb(pos);
+		found = pos;
+		break;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	return found;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
+ master or dir-node for r. Processing the lkb may result in it being placed
+ back on waiters. */
+
+/* We do this after normal locking has been enabled and any saved messages
+ (in requestqueue) have been processed. We should be confident that at
+ this point we won't get or process a reply to any of these waiting
+ operations. But, new ops may be coming in on the rsbs/locks here from
+ userspace or remotely. */
+
+/* There may have been an overlap unlock/cancel prior to recovery or after
+   recovery. If before, the lkb may still have a positive wait_count; if
+   after, the overlap flag would just have been set and nothing new sent.
+   We can be confident here that any replies to either the initial op or
+   overlap ops prior to recovery have been received. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{ /* Repeatedly pick a waiter flagged DLM_IFL_RESEND_BIT, take it off
+     the waiters list and either resend the operation or, if an overlap
+     cancel/unlock was pending, complete that instead.  Returns 0 when no
+     flagged waiter remains, or -EINTR if locking is stopped again while
+     looping. */
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error = 0, mstype, err, oc, ou;
+
+ while (1) {
+ if (dlm_locking_stopped(ls)) {
+ log_debug(ls, "recover_waiters_post aborted");
+ error = -EINTR;
+ break;
+ }
+
+ lkb = find_resend_waiter(ls);
+ if (!lkb)
+ break;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ mstype = lkb->lkb_wait_type;
+ oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT,
+ &lkb->lkb_iflags);
+ ou = test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT,
+ &lkb->lkb_iflags);
+ err = 0;
+
+ log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+ "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
+ "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
+ r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
+ dlm_dir_nodeid(r), oc, ou);
+
+ /* At this point we assume that we won't get a reply to any
+ previous op or overlap op on this lock. First, do a big
+ remove_from_waiters() for all previous ops. */
+
+ clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+ lkb->lkb_wait_type = 0;
+ /* drop all wait_count references; we still hold the
+ * reference taken by find_resend_waiter() for this iteration.
+ */
+ while (!atomic_dec_and_test(&lkb->lkb_wait_count))
+ unhold_lkb(lkb);
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ list_del_init(&lkb->lkb_wait_reply);
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ if (oc || ou) {
+ /* do an unlock or cancel instead of resending */
+ switch (mstype) {
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REQUEST:
+ queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
+ -DLM_ECANCEL);
+ unhold_lkb(lkb); /* undoes create_lkb() */
+ break;
+ case DLM_MSG_CONVERT:
+ if (oc) {
+ queue_cast(r, lkb, -DLM_ECANCEL);
+ } else {
+ lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
+ _unlock_lock(r, lkb);
+ }
+ break;
+ default:
+ err = 1; /* unexpected wait type, logged below */
+ }
+ } else {
+ switch (mstype) {
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REQUEST:
+ _request_lock(r, lkb);
+ if (is_master(r))
+ confirm_master(r, 0);
+ break;
+ case DLM_MSG_CONVERT:
+ _convert_lock(r, lkb);
+ break;
+ default:
+ err = 1;
+ }
+ }
+
+ if (err) {
+ log_error(ls, "waiter %x msg %d r_nodeid %d "
+ "dir_nodeid %d overlap %d %d",
+ lkb->lkb_id, mstype, r->res_nodeid,
+ dlm_dir_nodeid(r), oc, ou);
+ }
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb); /* drops the reference taken by find_resend_waiter() */
+ }
+
+ return error;
+}
+
+/* Remove from one rsb queue every master-copy lkb that does not belong
+   to the current recovery sequence, and drop its reference. */
+static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r,
+			      struct list_head *list)
+{
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, list, lkb_statequeue) {
+		/* don't purge lkbs we've added in recover_master_copy for
+		   the current recovery seq */
+		if (!is_master_copy(lkb) ||
+		    lkb->lkb_recover_seq == ls->ls_recover_seq)
+			continue;
+
+		del_lkb(r, lkb);
+
+		/* this put should free the lkb */
+		if (!dlm_put_lkb(lkb))
+			log_error(ls, "purged mstcpy lkb not released");
+	}
+}
+
+/* Purge stale master-copy lkbs from the grant, convert and wait queues
+   of r, in that order. */
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+	purge_mstcpy_list(r->res_ls, r, &r->res_grantqueue);
+	purge_mstcpy_list(r->res_ls, r, &r->res_convertqueue);
+	purge_mstcpy_list(r->res_ls, r, &r->res_waitqueue);
+}
+
+static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
+ struct list_head *list,
+ int nodeid_gone, unsigned int *count)
+{ /* purge master-copy lkbs owned by departed nodes from one queue of r */
+ struct dlm_lkb *lkb, *safe;
+
+ list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
+ if (!is_master_copy(lkb))
+ continue;
+
+ if ((lkb->lkb_nodeid == nodeid_gone) || /* compare with the cached nodeid before the general lookup */
+ dlm_is_removed(ls, lkb->lkb_nodeid)) {
+
+ /* tell recover_lvb to invalidate the lvb
+ because a node holding EX/PW failed */
+ if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
+ (lkb->lkb_grmode >= DLM_LOCK_PW)) {
+ rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
+ }
+
+ del_lkb(r, lkb);
+
+ /* this put should free the lkb */
+ if (!dlm_put_lkb(lkb))
+ log_error(ls, "purged dead lkb not released");
+
+ rsb_set_flag(r, RSB_RECOVER_GRANT); /* find_grant_rsb() selects rsbs by this flag */
+
+ (*count)++; /* running total, logged by dlm_recover_purge() */
+ }
+ }
+}
+
+/* Get rid of locks held by nodes that are gone. */
+
+void dlm_recover_purge(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ struct dlm_member *memb;
+ int nodes_count = 0;
+ int nodeid_gone = 0;
+ unsigned int lkb_count = 0;
+
+ /* cache one removed nodeid to optimize the common
+ case of a single node removed; when several nodes are gone
+ the last entry of ls_nodes_gone is the one cached and
+ purge_dead_list() falls back to dlm_is_removed() for the rest */
+
+ list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+ nodes_count++;
+ nodeid_gone = memb->nodeid;
+ }
+
+ if (!nodes_count)
+ return; /* no departed nodes, nothing to purge */
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ hold_rsb(r);
+ lock_rsb(r);
+ if (is_master(r)) { /* only rsbs mastered here (res_nodeid == 0) are purged */
+ purge_dead_list(ls, r, &r->res_grantqueue,
+ nodeid_gone, &lkb_count);
+ purge_dead_list(ls, r, &r->res_convertqueue,
+ nodeid_gone, &lkb_count);
+ purge_dead_list(ls, r, &r->res_waitqueue,
+ nodeid_gone, &lkb_count);
+ }
+ unlock_rsb(r);
+ unhold_rsb(r);
+ cond_resched(); /* scheduling point between rsbs */
+ }
+ up_write(&ls->ls_root_sem);
+
+ if (lkb_count)
+ log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
+ lkb_count, nodes_count);
+}
+
+static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket)
+{ /* return the first RSB_RECOVER_GRANT rsb we master in this bucket's keep tree (held), or NULL */
+ struct rb_node *n;
+ struct dlm_rsb *r;
+
+ spin_lock(&ls->ls_rsbtbl[bucket].lock);
+ for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
+
+ if (!rsb_flag(r, RSB_RECOVER_GRANT))
+ continue;
+ if (!is_master(r)) {
+ rsb_clear_flag(r, RSB_RECOVER_GRANT); /* not the master: clear the flag and move on */
+ continue;
+ }
+ hold_rsb(r); /* reference for the caller, taken before the bucket lock is dropped */
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return r;
+ }
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return NULL;
+}
+
+/*
+ * Attempt to grant locks on resources that we are the master of.
+ * Locks may have become grantable during recovery because locks
+ * from departed nodes have been purged (or not rebuilt), allowing
+ * previously blocked locks to now be granted. The subset of rsb's
+ * we are interested in are those with lkb's on either the convert or
+ * waiting queues.
+ *
+ * Simplest would be to go through each master rsb and check for non-empty
+ * convert or waiting queues, and attempt to grant on those rsbs.
+ * Checking the queues requires lock_rsb, though, for which we'd need
+ * to release the rsbtbl lock. This would make iterating through all
+ * rsb's very inefficient. So, we rely on earlier recovery routines
+ * to set RECOVER_GRANT on any rsb's that we should attempt to grant
+ * locks for.
+ */
+
+void dlm_recover_grant(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int bucket = 0;
+ unsigned int count = 0;
+ unsigned int rsb_count = 0;
+ unsigned int lkb_count = 0;
+
+ while (1) {
+ r = find_grant_rsb(ls, bucket); /* the same bucket is rescanned until it returns NULL */
+ if (!r) {
+ if (bucket == ls->ls_rsbtbl_size - 1)
+ break; /* last bucket exhausted */
+ bucket++;
+ continue;
+ }
+ rsb_count++;
+ count = 0;
+ lock_rsb(r);
+ /* the RECOVER_GRANT flag is checked in the grant path,
+ so it is cleared only after grant_pending_locks() */
+ grant_pending_locks(r, &count);
+ rsb_clear_flag(r, RSB_RECOVER_GRANT);
+ lkb_count += count;
+ confirm_master(r, 0);
+ unlock_rsb(r);
+ put_rsb(r); /* drops the reference taken in find_grant_rsb() */
+ cond_resched();
+ }
+
+ if (lkb_count)
+ log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
+ lkb_count, rsb_count);
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+ uint32_t remid)
+{ /* find the lkb on one queue whose lkb_nodeid and lkb_remid both match, or NULL */
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, head, lkb_statequeue) {
+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+ return lkb;
+ }
+ return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+ uint32_t remid)
+{ /* search the grant, convert and wait queues of r, in that order; NULL if no match */
+ struct dlm_lkb *lkb;
+
+ lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ return NULL;
+}
+
+/* Fill in a newly created master-copy lkb from a received rcom_lock.
+ Returns 0 on success, -EINVAL if the lvb data in the message is longer
+ than ls_lvblen, or -ENOMEM if no lvb could be allocated.
+ needs at least dlm_rcom + rcom_lock */
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_rsb *r, const struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+
+ lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
+ lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
+ lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
+ lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
+ dlm_set_dflags_val(lkb, le32_to_cpu(rl->rl_flags));
+ set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags);
+ lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
+ lkb->lkb_rqmode = rl->rl_rqmode;
+ lkb->lkb_grmode = rl->rl_grmode;
+ /* don't set lkb_status because add_lkb wants to itself */
+
+ lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
+ lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
+
+ if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+ int lvblen = le16_to_cpu(rc->rc_header.h_length) -
+ sizeof(struct dlm_rcom) - sizeof(struct rcom_lock); /* bytes that follow both headers */
+ if (lvblen > ls->ls_lvblen)
+ return -EINVAL;
+ lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
+ if (!lkb->lkb_lvbptr)
+ return -ENOMEM;
+ memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+ }
+
+ /* Conversions between PR and CW (middle modes) need special handling.
+ The real granted mode of these converting locks cannot be determined
+ until all locks have been rebuilt on the rsb (recover_conversion) */
+
+ if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
+ middle_conversion(lkb)) {
+ rl->rl_status = DLM_LKSTS_CONVERT; /* NOTE: writes into the buffer behind the const rc; the caller passes rl_status to add_lkb() */
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ rsb_set_flag(r, RSB_RECOVER_CONVERT);
+ }
+
+ return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+ to check if the rsb already has an lkb with the given remote nodeid/lkid.
+ If so we just send back a standard reply. If not, we create a new lkb with
+ the given values and send back our lkid. We send back our lkid by sending
+ back the rcom_lock struct we got but with the remid field filled in. */
+
+/* Results are handed back through rl_remid and rl_result: rl_remid starts
+ as the rl_remid received in the message and becomes our lkb id on success
+ or -EEXIST; rl_result receives the error code as little endian. The same
+ error code is also the return value.
+ needs at least dlm_rcom + rcom_lock */
+int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ __le32 *rl_remid, __le32 *rl_result)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ uint32_t remid = 0;
+ int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
+ int error;
+
+ /* init rl_remid with rcom lock rl_remid */
+ *rl_remid = rl->rl_remid;
+
+ if (rl->rl_parent_lkid) {
+ error = -EOPNOTSUPP; /* a non-zero parent lkid is rejected */
+ goto out;
+ }
+
+ remid = le32_to_cpu(rl->rl_lkid);
+
+ /* In general we expect the rsb returned to be R_MASTER, but we don't
+ have to require it. Recovery of masters on one node can overlap
+ recovery of locks on another node, so one node can send us MSTCPY
+ locks before we've made ourselves master of this rsb. We can still
+ add new MSTCPY locks that we receive here without any harm; when
+ we make ourselves master, dlm_recover_masters() won't touch the
+ MSTCPY locks we've received early. */
+
+ error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
+ from_nodeid, R_RECEIVE_RECOVER, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) {
+ log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
+ from_nodeid, remid);
+ error = -EBADR;
+ goto out_unlock;
+ }
+
+ lkb = search_remid(r, from_nodeid, remid);
+ if (lkb) {
+ error = -EEXIST; /* already present on r: just report our lkid again */
+ goto out_remid;
+ }
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto out_unlock;
+
+ error = receive_rcom_lock_args(ls, lkb, r, rc);
+ if (error) {
+ __put_lkb(ls, lkb); /* release the lkb created above */
+ goto out_unlock;
+ }
+
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, rl->rl_status);
+ ls->ls_recover_locks_in++;
+
+ if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+ rsb_set_flag(r, RSB_RECOVER_GRANT);
+
+ out_remid:
+ /* this is the new value returned to the lock holder for
+ saving in its process-copy lkb */
+ *rl_remid = cpu_to_le32(lkb->lkb_id);
+
+ lkb->lkb_recover_seq = ls->ls_recover_seq; /* purge_mstcpy_list() keeps lkbs stamped with the current seq */
+
+ out_unlock:
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ if (error && error != -EEXIST) /* -EEXIST is not logged */
+ log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
+ from_nodeid, remid, error);
+ *rl_result = cpu_to_le32(error);
+ return error;
+}
+
+/* Process the rl_result carried in a received rcom_lock for one of our
+ process-copy lkbs: on 0 or -EEXIST store rl_remid in lkb_remid, on -EBADR
+ send the lock again. Returns 0 unless the lkb is not found or is not a
+ process copy.
+ needs at least dlm_rcom + rcom_lock */
+int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ uint64_t seq)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ uint32_t lkid, remid;
+ int error, result;
+
+ lkid = le32_to_cpu(rl->rl_lkid);
+ remid = le32_to_cpu(rl->rl_remid);
+ result = le32_to_cpu(rl->rl_result);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error) {
+ log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
+ return error;
+ }
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ if (!is_process_copy(lkb)) {
+ log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
+ dlm_dump_rsb(r);
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return -EINVAL;
+ }
+
+ switch (result) {
+ case -EBADR:
+ /* There's a chance the new master received our lock before
+ dlm_recover_master_reply(), this wouldn't happen if we did
+ a barrier between recover_masters and recover_locks. */
+
+ log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
+
+ dlm_send_rcom_lock(r, lkb, seq);
+ goto out; /* skips dlm_recovered_lock() below */
+ case -EEXIST:
+ case 0:
+ lkb->lkb_remid = remid;
+ break;
+ default: /* unknown result: logged, then still acked below */
+ log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
+ }
+
+ /* an ack for dlm_recover_locks() which waits for replies from
+ all the locks it sends to new masters */
+ dlm_recovered_lock(r);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+
+ return 0;
+}
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+ int mode, uint32_t flags, void *name, unsigned int namelen)
+{ /* create an lkb for a userspace lock request and, if the request is accepted, put it on ua->proc->locks */
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ bool do_put = true;
+ int error;
+
+ dlm_lock_recovery(ls);
+
+ error = create_lkb(ls, &lkb);
+ if (error) {
+ kfree(ua); /* ua is not attached to an lkb yet, so it is freed here */
+ goto out;
+ }
+
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+
+ if (flags & DLM_LKF_VALBLK) {
+ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+ if (!ua->lksb.sb_lvbptr) {
+ kfree(ua);
+ error = -ENOMEM;
+ goto out_put;
+ }
+ }
+ error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
+ fake_bastfn, &args);
+ if (error) {
+ kfree(ua->lksb.sb_lvbptr);
+ ua->lksb.sb_lvbptr = NULL;
+ kfree(ua);
+ goto out_put;
+ }
+
+ /* After ua is attached to lkb it will be freed by dlm_free_lkb().
+ When DLM_DFL_USER_BIT is set, the dlm knows that this is a userspace
+ lock and that lkb_astparam is the dlm_user_args structure. */
+ set_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags);
+ error = request_lock(ls, lkb, name, namelen, &args);
+
+ switch (error) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+ error = 0;
+ break;
+ case -EAGAIN:
+ error = 0; /* returned as success, but the lkb is still released via out_put */
+ fallthrough;
+ default:
+ goto out_put;
+ }
+
+ /* add this new lkb to the per-process list of locks */
+ spin_lock(&ua->proc->locks_spin);
+ hold_lkb(lkb); /* reference for the proc->locks list */
+ list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+ spin_unlock(&ua->proc->locks_spin);
+ do_put = false; /* request accepted: skip the __put_lkb() below */
+ out_put:
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
+ if (do_put)
+ __put_lkb(ls, lkb);
+ out:
+ dlm_unlock_recovery(ls);
+ return error;
+}
+
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+{ /* convert the userspace lock lkid to mode; ua_tmp only carries the new parameters and is always freed here */
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ dlm_lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags);
+
+ /* user can change the params on its lock when it converts it, or
+ add an lvb that didn't exist before */
+
+ ua = lkb->lkb_ua;
+
+ if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+ if (!ua->lksb.sb_lvbptr) {
+ error = -ENOMEM;
+ goto out_put;
+ }
+ }
+ if (lvb_in && ua->lksb.sb_lvbptr)
+ memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+
+ ua->xid = ua_tmp->xid;
+ ua->castparam = ua_tmp->castparam;
+ ua->castaddr = ua_tmp->castaddr;
+ ua->bastparam = ua_tmp->bastparam;
+ ua->bastaddr = ua_tmp->bastaddr;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
+ fake_bastfn, &args);
+ if (error)
+ goto out_put;
+
+ error = convert_lock(ls, lkb, &args);
+
+ if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
+ error = 0; /* these three results are returned to the caller as success */
+ out_put:
+ trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false);
+ dlm_put_lkb(lkb);
+ out:
+ dlm_unlock_recovery(ls);
+ kfree(ua_tmp);
+ return error;
+}
+
+/*
+ * The caller asks for an orphan lock on a given resource with a given mode.
+ * If a matching lock exists, it's moved to the owner's list of locks and
+ * the lkid is returned.
+ *
+ * Returns 0 on success, -EAGAIN if the resource only has orphans with a
+ * different granted mode, and -ENOENT if it has no orphan at all.
+ * ua_tmp only carries the new owner's parameters and is always freed here.
+ */
+
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, void *name, unsigned int namelen,
+ uint32_t *lkid)
+{
+ struct dlm_lkb *lkb = NULL, *iter;
+ struct dlm_user_args *ua;
+ int found_other_mode = 0;
+ int rv = 0;
+
+ mutex_lock(&ls->ls_orphans_mutex);
+ list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
+ if (iter->lkb_resource->res_length != namelen)
+ continue;
+ if (memcmp(iter->lkb_resource->res_name, name, namelen))
+ continue;
+ if (iter->lkb_grmode != mode) {
+ found_other_mode = 1; /* same resource, other mode: remembered for -EAGAIN */
+ continue;
+ }
+
+ lkb = iter;
+ list_del_init(&iter->lkb_ownqueue);
+ clear_bit(DLM_DFL_ORPHAN_BIT, &iter->lkb_dflags);
+ *lkid = iter->lkb_id;
+ break; /* first match wins */
+ }
+ mutex_unlock(&ls->ls_orphans_mutex);
+
+ if (!lkb && found_other_mode) {
+ rv = -EAGAIN;
+ goto out;
+ }
+
+ if (!lkb) {
+ rv = -ENOENT;
+ goto out;
+ }
+
+ lkb->lkb_exflags = flags;
+ lkb->lkb_ownpid = (int) current->pid;
+
+ ua = lkb->lkb_ua;
+
+ ua->proc = ua_tmp->proc;
+ ua->xid = ua_tmp->xid;
+ ua->castparam = ua_tmp->castparam;
+ ua->castaddr = ua_tmp->castaddr;
+ ua->bastparam = ua_tmp->bastparam;
+ ua->bastaddr = ua_tmp->bastaddr;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ /*
+ * The lkb reference from the ls_orphans list was not
+ * removed above, and is now considered the reference
+ * for the proc locks list.
+ */
+
+ spin_lock(&ua->proc->locks_spin);
+ list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+ spin_unlock(&ua->proc->locks_spin);
+ out:
+ kfree(ua_tmp);
+ return rv;
+}
+
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid, char *lvb_in)
+{ /* unlock the userspace lock lkid; ua_tmp only carries parameters and is always freed here */
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ dlm_lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ trace_dlm_unlock_start(ls, lkb, flags);
+
+ ua = lkb->lkb_ua;
+
+ if (lvb_in && ua->lksb.sb_lvbptr)
+ memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+ if (ua_tmp->castparam) /* a zero castparam leaves the stored one unchanged */
+ ua->castparam = ua_tmp->castparam;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ error = set_unlock_args(flags, ua, &args);
+ if (error)
+ goto out_put;
+
+ error = unlock_lock(ls, lkb, &args);
+
+ if (error == -DLM_EUNLOCK)
+ error = 0; /* -DLM_EUNLOCK is returned to the caller as success */
+ /* from validate_unlock_args() */
+ if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
+ error = 0;
+ if (error)
+ goto out_put;
+
+ spin_lock(&ua->proc->locks_spin);
+ /* dlm_user_add_cb() may have already taken lkb off the proc list */
+ if (!list_empty(&lkb->lkb_ownqueue))
+ list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
+ spin_unlock(&ua->proc->locks_spin);
+ out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
+ dlm_put_lkb(lkb);
+ out:
+ dlm_unlock_recovery(ls);
+ kfree(ua_tmp);
+ return error;
+}
+
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid)
+{ /* cancel the userspace lock lkid; ua_tmp only carries parameters and is always freed here */
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ dlm_lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ trace_dlm_unlock_start(ls, lkb, flags);
+
+ ua = lkb->lkb_ua;
+ if (ua_tmp->castparam) /* a zero castparam leaves the stored one unchanged */
+ ua->castparam = ua_tmp->castparam;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ error = set_unlock_args(flags, ua, &args);
+ if (error)
+ goto out_put;
+
+ error = cancel_lock(ls, lkb, &args);
+
+ if (error == -DLM_ECANCEL)
+ error = 0; /* -DLM_ECANCEL is returned to the caller as success */
+ /* from validate_unlock_args() */
+ if (error == -EBUSY)
+ error = 0;
+ out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
+ dlm_put_lkb(lkb);
+ out:
+ dlm_unlock_recovery(ls);
+ kfree(ua_tmp);
+ return error;
+}
+
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
+{ /* cancel the userspace lock lkid with DLM_IFL_DEADLOCK_CANCEL_BIT set on the lkb */
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ struct dlm_rsb *r;
+ int error;
+
+ dlm_lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ trace_dlm_unlock_start(ls, lkb, flags);
+
+ ua = lkb->lkb_ua;
+
+ error = set_unlock_args(flags, ua, &args);
+ if (error)
+ goto out_put;
+
+ /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_unlock_args(lkb, &args);
+ if (error)
+ goto out_r;
+ set_bit(DLM_IFL_DEADLOCK_CANCEL_BIT, &lkb->lkb_iflags); /* set only once validation has passed */
+
+ error = _cancel_lock(r, lkb);
+ out_r:
+ unlock_rsb(r);
+ put_rsb(r);
+
+ if (error == -DLM_ECANCEL)
+ error = 0; /* -DLM_ECANCEL is returned to the caller as success */
+ /* from validate_unlock_args() */
+ if (error == -EBUSY)
+ error = 0;
+ out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
+ dlm_put_lkb(lkb);
+ out:
+ dlm_unlock_recovery(ls);
+ return error;
+}
+
+/* lkb's that are removed from the waiters list by revert are just left on the
+ orphans list with the granted orphan locks, to be freed by purge.
+ Returns the cancel_lock() result with -DLM_ECANCEL mapped to 0. */
+
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ struct dlm_args args;
+ int error;
+
+ hold_lkb(lkb); /* reference for the ls_orphans list */
+ mutex_lock(&ls->ls_orphans_mutex);
+ list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
+ mutex_unlock(&ls->ls_orphans_mutex);
+
+ set_unlock_args(0, lkb->lkb_ua, &args); /* return value is not checked here */
+
+ error = cancel_lock(ls, lkb, &args);
+ if (error == -DLM_ECANCEL)
+ error = 0;
+ return error;
+}
+
+/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
+ granted. Regardless of what rsb queue the lock is on, it's removed and
+ freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
+ if our lock is PW/EX (it's ignored if our granted mode is smaller.)
+ Returns the unlock_lock() result with -DLM_EUNLOCK mapped to 0. */
+
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ struct dlm_args args;
+ int error;
+
+ set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
+ lkb->lkb_ua, &args); /* return value is not checked here */
+
+ error = unlock_lock(ls, lkb, &args);
+ if (error == -DLM_EUNLOCK)
+ error = 0;
+ return error;
+}
+
+/* We have to release the ls_clear_proc_locks spinlock before calling
+ unlock_proc_lock() (which does lock_rsb) due to deadlock with receiving a
+ message that does lock_rsb followed by dlm_user_add_cb().
+ Returns the lkb taken off proc->locks, or NULL when the list is empty. */
+
+static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
+ struct dlm_user_proc *proc)
+{
+ struct dlm_lkb *lkb = NULL;
+
+ spin_lock(&ls->ls_clear_proc_locks);
+ if (list_empty(&proc->locks))
+ goto out;
+
+ lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
+ list_del_init(&lkb->lkb_ownqueue);
+
+ if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+ set_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags); /* the caller hands persistent locks to orphan_proc_lock() */
+ else
+ set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags);
+ out:
+ spin_unlock(&ls->ls_clear_proc_locks);
+ return lkb;
+}
+
+/* The ls_clear_proc_locks spinlock protects against dlm_user_add_cb() which
+ 1) references lkb->lkb_ua which we free here and 2) adds lkbs to proc->asts,
+ which we clear here. */
+
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+ list, and no more device_writes should add lkb's to proc->locks list; so we
+ shouldn't need to take asts_spin or locks_spin here. This assumes that
+ device reads/writes/closes are serialized -- FIXME: we may need to serialize
+ them ourselves. */
+
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ dlm_lock_recovery(ls);
+
+ while (1) {
+ lkb = del_proc_lock(ls, proc); /* takes and drops ls_clear_proc_locks for each lkb */
+ if (!lkb)
+ break;
+ if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+ orphan_proc_lock(ls, lkb);
+ else
+ unlock_proc_lock(ls, lkb);
+
+ /* this removes the reference for the proc->locks list
+ added by dlm_user_request, it may result in the lkb
+ being freed */
+
+ dlm_put_lkb(lkb);
+ }
+
+ spin_lock(&ls->ls_clear_proc_locks);
+
+ /* in-progress unlocks */
+ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+ list_del_init(&lkb->lkb_ownqueue);
+ set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags);
+ dlm_put_lkb(lkb);
+ }
+
+ list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { /* lkbs queued on proc->asts */
+ dlm_purge_lkb_callbacks(lkb);
+ list_del_init(&lkb->lkb_cb_list);
+ dlm_put_lkb(lkb);
+ }
+
+ spin_unlock(&ls->ls_clear_proc_locks);
+ dlm_unlock_recovery(ls);
+}
+
+static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{ /* force-unlock every lkb on proc->locks, then empty proc->unlocking and proc->asts */
+ struct dlm_lkb *lkb, *safe;
+
+ while (1) {
+ lkb = NULL;
+ spin_lock(&proc->locks_spin);
+ if (!list_empty(&proc->locks)) {
+ lkb = list_entry(proc->locks.next, struct dlm_lkb,
+ lkb_ownqueue);
+ list_del_init(&lkb->lkb_ownqueue);
+ }
+ spin_unlock(&proc->locks_spin); /* dropped before unlock_proc_lock() is called */
+
+ if (!lkb)
+ break;
+
+ set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags);
+ unlock_proc_lock(ls, lkb);
+ dlm_put_lkb(lkb); /* ref from proc->locks list */
+ }
+
+ spin_lock(&proc->locks_spin);
+ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+ list_del_init(&lkb->lkb_ownqueue);
+ set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags);
+ dlm_put_lkb(lkb);
+ }
+ spin_unlock(&proc->locks_spin);
+
+ spin_lock(&proc->asts_spin);
+ list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
+ dlm_purge_lkb_callbacks(lkb);
+ list_del_init(&lkb->lkb_cb_list);
+ dlm_put_lkb(lkb);
+ }
+ spin_unlock(&proc->asts_spin);
+}
+
+/* pid of 0 means purge all orphans; otherwise only orphans whose lkb_ownpid
+ equals pid are purged. The nodeid argument is not used in this function. */
+
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ mutex_lock(&ls->ls_orphans_mutex);
+ list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
+ if (pid && lkb->lkb_ownpid != pid)
+ continue;
+ unlock_proc_lock(ls, lkb);
+ list_del_init(&lkb->lkb_ownqueue);
+ dlm_put_lkb(lkb); /* orphan_proc_lock() took a reference for the ls_orphans list */
+ }
+ mutex_unlock(&ls->ls_orphans_mutex);
+}
+
+static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
+{ /* build a DLM_MSG_PURGE message carrying nodeid and pid and send it to nodeid */
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int error;
+
+ error = _create_message(ls, sizeof(struct dlm_message), nodeid,
+ DLM_MSG_PURGE, &ms, &mh, GFP_NOFS);
+ if (error)
+ return error;
+ ms->m_nodeid = cpu_to_le32(nodeid); /* message fields are stored little endian */
+ ms->m_pid = cpu_to_le32(pid);
+
+ return send_message(mh, ms, NULL, 0);
+}
+
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+ int nodeid, int pid)
+{ /* purge locally, or forward the request when nodeid names another node */
+ int error = 0;
+
+ if (nodeid && (nodeid != dlm_our_nodeid())) { /* a nodeid of 0 is handled locally */
+ error = send_purge(ls, nodeid, pid);
+ } else {
+ dlm_lock_recovery(ls);
+ if (pid == current->pid)
+ purge_proc_locks(ls, proc); /* the caller's own pid: purge the locks of proc */
+ else
+ do_purge(ls, nodeid, pid); /* otherwise purge orphans (pid 0 means all of them) */
+ dlm_unlock_recovery(ls);
+ }
+ return error;
+}
+
+/* debug functionality */
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_dflags, int lkb_status)
+{ /* create an lkb with the given attributes and add it to the rsb called name on the lkb_status queue */
+ struct dlm_lksb *lksb;
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ /* we currently can't set a valid user lock */
+ if (lkb_dflags & BIT(DLM_DFL_USER_BIT))
+ return -EOPNOTSUPP;
+
+ lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
+ if (!lksb)
+ return -ENOMEM;
+
+ error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1); /* presumably an id range that only admits lkb_id -- TODO confirm */
+ if (error) {
+ kfree(lksb);
+ return error;
+ }
+
+ dlm_set_dflags_val(lkb, lkb_dflags);
+ lkb->lkb_nodeid = lkb_nodeid;
+ lkb->lkb_lksb = lksb;
+ /* user specific pointer, just don't have it NULL for kernel locks */
+ if (~lkb_dflags & BIT(DLM_DFL_USER_BIT)) /* always true here: the USER bit was rejected above */
+ lkb->lkb_astparam = (void *)0xDEADBEEF;
+
+ error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
+ if (error) {
+ kfree(lksb);
+ __put_lkb(ls, lkb);
+ return error;
+ }
+
+ lock_rsb(r);
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, lkb_status);
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
+}
+
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid)
+{ /* look up lkb_id and pass the lkb to add_to_waiters(); returns the lookup or add_to_waiters() error */
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, lkb_id, &lkb);
+ if (error)
+ return error;
+
+ error = add_to_waiters(lkb, mstype, to_nodeid);
+ dlm_put_lkb(lkb);
+ return error;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 0000000000..b54e2cbbe6
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms,
+ uint32_t saved_seq);
+void dlm_receive_buffer(const union dlm_packet *p, int nodeid);
+int dlm_modes_compat(int mode1, int mode2);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+int dlm_lock_recovery_try(struct dlm_ls *ls);
+void dlm_unlock_recovery(struct dlm_ls *ls);
+
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result);
+
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
+ struct dlm_rsb **r_ret);
+
+void dlm_recover_purge(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_recover_grant(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ __le32 *rl_remid, __le32 *rl_result);
+int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ uint64_t seq);
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, void *name, unsigned int namelen,
+ uint32_t *lkid);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid);
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+ int nodeid, int pid);
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid);
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_dflags, int lkb_status); /* debug helper; parameter names match the definition in lock.c */
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid);
+
+static inline int is_master(struct dlm_rsb *r)
+{
+ return !r->res_nodeid; /* non-zero (true) exactly when res_nodeid is 0 */
+}
+
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+ mutex_lock(&r->res_mutex); /* released by unlock_rsb() */
+}
+
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+ mutex_unlock(&r->res_mutex); /* counterpart of lock_rsb() */
+}
+
+#endif
+
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 0000000000..0455dddb07
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,937 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/module.h>
+#include <linux/sysfs.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "dir.h"
+#include "midcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+#include "requestqueue.h"
+#include "user.h"
+#include "ast.h"
+
+/* Incremented in __dlm_new_lockspace() and decremented in
+ * dlm_release_lockspace(), both while holding ls_lock. */
+static int ls_count;
+static struct mutex ls_lock;
+/* Lockspaces linked through ls_list; the code in this file walks and
+ * modifies the list while holding lslist_lock. */
+static struct list_head lslist;
+static spinlock_t lslist_lock;
+/* Task returned by kthread_run() in dlm_scand_start(). */
+static struct task_struct * scand_task;
+
+
+/*
+ * sysfs "control" store handler.
+ *
+ * Parses an integer from @buf: 0 calls dlm_ls_stop(), 1 calls
+ * dlm_ls_start(), anything else yields -EINVAL. The lockspace is looked
+ * up again through its local handle so a reference is held while it is
+ * used. Returns @len on success or a negative errno.
+ */
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	struct dlm_ls *found;
+	ssize_t result;
+	int cmd;
+	int err;
+
+	err = kstrtoint(buf, 0, &cmd);
+	if (err)
+		return err;
+
+	found = dlm_find_lockspace_local(ls->ls_local_handle);
+	if (!found)
+		return -EINVAL;
+
+	if (cmd == 0) {
+		dlm_ls_stop(found);
+		result = len;
+	} else if (cmd == 1) {
+		dlm_ls_start(found);
+		result = len;
+	} else {
+		result = -EINVAL;
+	}
+
+	dlm_put_lockspace(found);
+	return result;
+}
+
+/*
+ * sysfs "event_done" store handler.
+ *
+ * Parses an integer from @buf straight into ls_uevent_result, then sets
+ * LSFL_UEVENT_WAIT and wakes ls_uevent_wait (the waiter is do_uevent()).
+ * Returns @len on success or the kstrtoint() error.
+ */
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int err;
+
+	err = kstrtoint(buf, 0, &ls->ls_uevent_result);
+	if (!err) {
+		set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+		wake_up(&ls->ls_uevent_wait);
+		return len;
+	}
+
+	return err;
+}
+
+/*
+ * sysfs "id" show handler: prints ls_global_id as an unsigned decimal
+ * followed by a newline. Returns the number of bytes written to @buf.
+ */
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+	/* sysfs_emit() is the documented helper for sysfs show() output */
+	return sysfs_emit(buf, "%u\n", ls->ls_global_id);
+}
+
+/*
+ * sysfs "id" store handler: parses an unsigned integer from @buf directly
+ * into ls_global_id. Returns @len on success or the kstrtouint() error.
+ */
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int err;
+
+	err = kstrtouint(buf, 0, &ls->ls_global_id);
+	if (!err)
+		return len;
+
+	return err;
+}
+
+/*
+ * sysfs "nodir" show handler: prints the value of dlm_no_directory(ls)
+ * followed by a newline. Returns the number of bytes written to @buf.
+ */
+static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf)
+{
+	/* sysfs_emit() is the documented helper for sysfs show() output */
+	return sysfs_emit(buf, "%u\n", dlm_no_directory(ls));
+}
+
+/*
+ * sysfs "nodir" store handler.
+ *
+ * Writing 1 sets LSFL_NODIR; any other valid integer is accepted but
+ * leaves the flag untouched (the flag is never cleared here). Returns
+ * @len on success or the kstrtoint() error.
+ */
+static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int requested;
+	int err;
+
+	err = kstrtoint(buf, 0, &requested);
+	if (err)
+		return err;
+
+	if (requested != 1)
+		return len;
+
+	set_bit(LSFL_NODIR, &ls->ls_flags);
+	return len;
+}
+
+/*
+ * sysfs "recover_status" show handler: prints dlm_recover_status(ls) in
+ * hexadecimal followed by a newline. Returns the number of bytes written.
+ */
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+	uint32_t status = dlm_recover_status(ls);
+
+	/* sysfs_emit() is the documented helper for sysfs show() output */
+	return sysfs_emit(buf, "%x\n", status);
+}
+
+/*
+ * sysfs "recover_nodeid" show handler: prints ls_recover_nodeid as a
+ * signed decimal followed by a newline. Returns the number of bytes
+ * written.
+ */
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+	/* sysfs_emit() is the documented helper for sysfs show() output */
+	return sysfs_emit(buf, "%d\n", ls->ls_recover_nodeid);
+}
+
+/*
+ * A lockspace sysfs attribute: the generic attribute plus optional
+ * per-attribute handlers. dlm_attr_show()/dlm_attr_store() recover this
+ * structure from the embedded attr and dispatch to show/store when set.
+ */
+struct dlm_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct dlm_ls *, char *);
+ ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+/*
+ * Per-lockspace sysfs attributes. Modes are written in octal:
+ * 0200 is owner-write only, 0444 is read-only for everyone and
+ * 0644 is world-readable plus owner-write.
+ */
+static struct dlm_attr dlm_attr_control = {
+	.attr  = { .name = "control", .mode = 0200 },
+	.store = dlm_control_store,
+};
+
+static struct dlm_attr dlm_attr_event = {
+	.attr  = { .name = "event_done", .mode = 0200 },
+	.store = dlm_event_store,
+};
+
+static struct dlm_attr dlm_attr_id = {
+	.attr  = { .name = "id", .mode = 0644 },
+	.show  = dlm_id_show,
+	.store = dlm_id_store,
+};
+
+static struct dlm_attr dlm_attr_nodir = {
+	.attr  = { .name = "nodir", .mode = 0644 },
+	.show  = dlm_nodir_show,
+	.store = dlm_nodir_store,
+};
+
+static struct dlm_attr dlm_attr_recover_status = {
+	.attr = { .name = "recover_status", .mode = 0444 },
+	.show = dlm_recover_status_show,
+};
+
+static struct dlm_attr dlm_attr_recover_nodeid = {
+	.attr = { .name = "recover_nodeid", .mode = 0444 },
+	.show = dlm_recover_nodeid_show,
+};
+
+/* NULL-terminated attribute list used to build dlm_groups. */
+static struct attribute *dlm_attrs[] = {
+	&dlm_attr_control.attr,
+	&dlm_attr_event.attr,
+	&dlm_attr_id.attr,
+	&dlm_attr_nodir.attr,
+	&dlm_attr_recover_status.attr,
+	&dlm_attr_recover_nodeid.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(dlm);
+
+/*
+ * sysfs_ops show entry point: map the kobject and attribute back to the
+ * lockspace and its dlm_attr, then call the attribute's show handler.
+ * Returns 0 when the attribute has no show handler.
+ */
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct dlm_attr *da = container_of(attr, struct dlm_attr, attr);
+
+	if (!da->show)
+		return 0;
+
+	return da->show(container_of(kobj, struct dlm_ls, ls_kobj), buf);
+}
+
+/*
+ * sysfs_ops store entry point: map the kobject and attribute back to the
+ * lockspace and its dlm_attr, then call the attribute's store handler.
+ * Returns @len (input consumed) when the attribute has no store handler.
+ */
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct dlm_attr *da = container_of(attr, struct dlm_attr, attr);
+
+	if (!da->store)
+		return len;
+
+	return da->store(container_of(kobj, struct dlm_ls, ls_kobj), buf, len);
+}
+
+/* kobj_type release hook: free the dlm_ls that embeds the kobject. */
+static void lockspace_kobj_release(struct kobject *k)
+{
+	kfree(container_of(k, struct dlm_ls, ls_kobj));
+}
+
+/* sysfs read/write entry points shared by all lockspace attributes. */
+static const struct sysfs_ops dlm_attr_ops = {
+ .show = dlm_attr_show,
+ .store = dlm_attr_store,
+};
+
+/* kobject type of a lockspace's ls_kobj; its release hook frees the
+ * containing struct dlm_ls. */
+static struct kobj_type dlm_ktype = {
+ .default_groups = dlm_groups,
+ .sysfs_ops = &dlm_attr_ops,
+ .release = lockspace_kobj_release,
+};
+
+/* Created in dlm_lockspace_init(), unregistered in dlm_lockspace_exit(). */
+static struct kset *dlm_kset;
+
+/*
+ * Emit a KOBJ_ONLINE (@in nonzero) or KOBJ_OFFLINE (@in zero) uevent for
+ * the lockspace and sleep until LSFL_UEVENT_WAIT is set, which
+ * dlm_event_store() does when "event_done" is written. The flag is
+ * cleared on wakeup. Returns ls_uevent_result.
+ */
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+	const char *verb = in ? "joining" : "leaving";
+
+	kobject_uevent(&ls->ls_kobj, in ? KOBJ_ONLINE : KOBJ_OFFLINE);
+
+	log_rinfo(ls, "%s the lockspace group...", verb);
+
+	/*
+	 * dlm_controld will see the uevent, do the necessary group
+	 * management and then write to sysfs to wake us.
+	 */
+	wait_event(ls->ls_uevent_wait,
+		   test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+
+	log_rinfo(ls, "group event done %d", ls->ls_uevent_result);
+
+	return ls->ls_uevent_result;
+}
+
+/*
+ * kset uevent hook: add a LOCKSPACE=<ls_name> variable to the event
+ * environment. The result of add_uevent_var() is not checked; this
+ * always returns 0.
+ */
+static int dlm_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+	const struct dlm_ls *lockspace;
+
+	lockspace = container_of(kobj, struct dlm_ls, ls_kobj);
+	add_uevent_var(env, "LOCKSPACE=%s", lockspace->ls_name);
+
+	return 0;
+}
+
+/* uevent callbacks passed to kset_create_and_add() for the "dlm" kset. */
+static const struct kset_uevent_ops dlm_uevent_ops = {
+ .uevent = dlm_uevent,
+};
+
+/*
+ * Initialise the file-scope lockspace bookkeeping (count, list and their
+ * locks) and create the "dlm" kset under kernel_kobj.
+ *
+ * Returns 0 on success or -ENOMEM if the kset cannot be created.
+ */
+int __init dlm_lockspace_init(void)
+{
+	ls_count = 0;
+	mutex_init(&ls_lock);
+	INIT_LIST_HEAD(&lslist);
+	spin_lock_init(&lslist_lock);
+
+	dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
+	if (dlm_kset)
+		return 0;
+
+	printk(KERN_WARNING "%s: can not create kset\n", __func__);
+	return -ENOMEM;
+}
+
+/* Unregister the "dlm" kset created by dlm_lockspace_init(). */
+void dlm_lockspace_exit(void)
+{
+ kset_unregister(dlm_kset);
+}
+
+/*
+ * Return the first lockspace on lslist whose ls_scan_time is at least
+ * ci_scan_secs seconds in the past, or NULL if none is due.
+ *
+ * The list is walked under lslist_lock; no reference is taken on the
+ * lockspace that is returned.
+ */
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *cur;
+	struct dlm_ls *due = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(cur, &lslist, ls_list) {
+		if (time_after_eq(jiffies, cur->ls_scan_time +
+					   dlm_config.ci_scan_secs * HZ)) {
+			due = cur;
+			break;
+		}
+	}
+	spin_unlock(&lslist_lock);
+
+	return due;
+}
+
+/*
+ * Body of the dlm_scand kthread.
+ *
+ * Until asked to stop: pick a lockspace that is due for scanning and, if
+ * dlm_lock_recovery_try() succeeds, stamp ls_scan_time and run
+ * dlm_scan_rsbs() on it. When the try fails, ls_scan_time is pushed
+ * forward by HZ instead. With nothing due, sleep for ci_scan_secs
+ * seconds (interruptible).
+ */
+static int dlm_scand(void *data)
+{
+	struct dlm_ls *ls;
+
+	while (!kthread_should_stop()) {
+		ls = find_ls_to_scan();
+		if (!ls) {
+			schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
+			continue;
+		}
+
+		if (!dlm_lock_recovery_try(ls)) {
+			ls->ls_scan_time += HZ;
+			continue;
+		}
+
+		ls->ls_scan_time = jiffies;
+		dlm_scan_rsbs(ls);
+		dlm_unlock_recovery(ls);
+	}
+
+	return 0;
+}
+
+/*
+ * Start the "dlm_scand" kthread and remember it in scand_task.
+ * Returns 0 on success or the error encoded in kthread_run()'s result;
+ * scand_task is left untouched on failure.
+ */
+static int dlm_scand_start(void)
+{
+	struct task_struct *task;
+
+	task = kthread_run(dlm_scand, NULL, "dlm_scand");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	scand_task = task;
+	return 0;
+}
+
+/* Stop the kthread recorded in scand_task by dlm_scand_start(). */
+static void dlm_scand_stop(void)
+{
+ kthread_stop(scand_task);
+}
+
+/*
+ * Look up a lockspace by ls_global_id.
+ *
+ * On a match ls_count is incremented before lslist_lock is dropped; the
+ * caller releases it with dlm_put_lockspace(). Returns NULL if no
+ * lockspace on lslist has the given id.
+ */
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+	struct dlm_ls *cur;
+	struct dlm_ls *found = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(cur, &lslist, ls_list) {
+		if (cur->ls_global_id != id)
+			continue;
+
+		atomic_inc(&cur->ls_count);
+		found = cur;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return found;
+}
+
+/*
+ * Look up a lockspace by its ls_local_handle.
+ *
+ * On a match ls_count is incremented before lslist_lock is dropped; the
+ * caller releases it with dlm_put_lockspace(). Returns NULL if the
+ * handle does not belong to any lockspace on lslist.
+ */
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+	struct dlm_ls *cur;
+	struct dlm_ls *found = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(cur, &lslist, ls_list) {
+		if (cur->ls_local_handle != lockspace)
+			continue;
+
+		atomic_inc(&cur->ls_count);
+		found = cur;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return found;
+}
+
+/*
+ * Look up a lockspace by the minor number of its ls_device.
+ *
+ * On a match ls_count is incremented before lslist_lock is dropped; the
+ * caller releases it with dlm_put_lockspace(). Returns NULL if no
+ * lockspace on lslist uses that minor.
+ */
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+	struct dlm_ls *cur;
+	struct dlm_ls *found = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(cur, &lslist, ls_list) {
+		if (cur->ls_device.minor != minor)
+			continue;
+
+		atomic_inc(&cur->ls_count);
+		found = cur;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return found;
+}
+
+/*
+ * Drop a reference taken by one of the dlm_find_lockspace_*() helpers.
+ * When ls_count reaches zero, wake anyone sleeping on ls_count_wait
+ * (remove_lockspace() waits there).
+ */
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+	if (!atomic_dec_and_test(&ls->ls_count))
+		return;
+
+	wake_up(&ls->ls_count_wait);
+}
+
+/*
+ * Take @ls off lslist once nobody holds a reference.
+ *
+ * Waits for ls_count to reach zero, then re-checks it under lslist_lock
+ * (the lookup helpers increment the count under that lock) and waits
+ * again if a new reference appeared in between.
+ */
+static void remove_lockspace(struct dlm_ls *ls)
+{
+	for (;;) {
+		wait_event(ls->ls_count_wait,
+			   atomic_read(&ls->ls_count) == 0);
+
+		spin_lock(&lslist_lock);
+		if (atomic_read(&ls->ls_count) == 0)
+			break;
+		spin_unlock(&lslist_lock);
+	}
+
+	/* lslist_lock is held here */
+	WARN_ON(ls->ls_create_count != 0);
+	list_del(&ls->ls_list);
+	spin_unlock(&lslist_lock);
+}
+
+/*
+ * Start the services shared by all lockspaces: midcomms first, then the
+ * dlm_scand thread. If dlm_scand cannot be started, midcomms is stopped
+ * again. Returns 0 on success or the error of the step that failed.
+ */
+static int threads_start(void)
+{
+	int error;
+
+	/* Thread for sending/receiving messages for all lockspace's */
+	error = dlm_midcomms_start();
+	if (error) {
+		log_print("cannot start dlm midcomms %d", error);
+		return error;
+	}
+
+	error = dlm_scand_start();
+	if (error) {
+		log_print("cannot start dlm_scand thread %d", error);
+		dlm_midcomms_stop();
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Create a lockspace called @name, or take another user reference on an
+ * existing one with the same name.
+ *
+ * Return values:
+ *   0        a new lockspace was created and joined; *lockspace is set
+ *            and the module reference taken here is kept
+ *   1        a lockspace with this name already existed; its
+ *            ls_create_count was incremented and *lockspace is set
+ *   -EINVAL  @name is empty or longer than DLM_LOCKSPACE_LEN, @lvblen is
+ *            not a multiple of 8, or try_module_get() failed
+ *   -EUNATCH dlm_user_daemon_available() reported no daemon
+ *   -EBADR   recover callbacks are enabled and @cluster differs from the
+ *            configured cluster name
+ *   -EEXIST  the name exists and DLM_LSFL_NEWEXCL was passed
+ *   -ENOMEM  an allocation failed
+ *   other    error from dlm_callback_start(), dlm_recoverd_start(),
+ *            kobject_init_and_add(), do_uevent() or ls_recovery_result
+ *
+ * When @ops and @ops_result are both given, *ops_result is set to
+ * -EOPNOTSUPP if recover callbacks are disabled in dlm_config and to 0
+ * otherwise.
+ */
+static int new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops, void *ops_arg,
+ int *ops_result, dlm_lockspace_t **lockspace)
+{
+ struct dlm_ls *ls;
+ int i, size, error;
+ int do_unreg = 0;
+ int namelen = strlen(name);
+
+ if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
+ return -EINVAL;
+
+ if (lvblen % 8)
+ return -EINVAL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ if (!dlm_user_daemon_available()) {
+ log_print("dlm user daemon not available");
+ error = -EUNATCH;
+ goto out;
+ }
+
+ if (ops && ops_result) {
+ if (!dlm_config.ci_recover_callbacks)
+ *ops_result = -EOPNOTSUPP;
+ else
+ *ops_result = 0;
+ }
+
+ if (!cluster)
+ log_print("dlm cluster name '%s' is being used without an application provided cluster name",
+ dlm_config.ci_cluster_name);
+
+ if (dlm_config.ci_recover_callbacks && cluster &&
+ strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
+ log_print("dlm cluster name '%s' does not match "
+ "the application cluster name '%s'",
+ dlm_config.ci_cluster_name, cluster);
+ error = -EBADR;
+ goto out;
+ }
+
+ error = 0;
+
+ /* Look for an existing lockspace with the same name. */
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ WARN_ON(ls->ls_create_count <= 0);
+ if (ls->ls_namelen != namelen)
+ continue;
+ if (memcmp(ls->ls_name, name, namelen))
+ continue;
+ if (flags & DLM_LSFL_NEWEXCL) {
+ error = -EEXIST;
+ break;
+ }
+ ls->ls_create_count++;
+ *lockspace = ls;
+ error = 1;
+ break;
+ }
+ spin_unlock(&lslist_lock);
+
+ /* error is 1 when an existing lockspace was reused, -EEXIST when
+ * exclusive creation was requested, and 0 when the name is new */
+ if (error)
+ goto out;
+
+ error = -ENOMEM;
+
+ ls = kzalloc(sizeof(*ls), GFP_NOFS);
+ if (!ls)
+ goto out;
+ memcpy(ls->ls_name, name, namelen);
+ ls->ls_namelen = namelen;
+ ls->ls_lvblen = lvblen;
+ atomic_set(&ls->ls_count, 0);
+ init_waitqueue_head(&ls->ls_count_wait);
+ ls->ls_flags = 0;
+ ls->ls_scan_time = jiffies;
+
+ if (ops && dlm_config.ci_recover_callbacks) {
+ ls->ls_ops = ops;
+ ls->ls_ops_arg = ops_arg;
+ }
+
+ /* ls_exflags are forced to match among nodes, and we don't
+ * need to require all nodes to have some flags set
+ */
+ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
+
+ size = READ_ONCE(dlm_config.ci_rsbtbl_size);
+ ls->ls_rsbtbl_size = size;
+
+ ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable)));
+ if (!ls->ls_rsbtbl)
+ goto out_lsfree;
+ for (i = 0; i < size; i++) {
+ ls->ls_rsbtbl[i].keep.rb_node = NULL;
+ ls->ls_rsbtbl[i].toss.rb_node = NULL;
+ spin_lock_init(&ls->ls_rsbtbl[i].lock);
+ }
+
+ /* ls was zero-allocated, so entries not reached before a failure
+ * stay NULL and are safe to kfree() at out_rsbtbl */
+ for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
+ ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
+ GFP_KERNEL);
+ if (!ls->ls_remove_names[i])
+ goto out_rsbtbl;
+ }
+
+ idr_init(&ls->ls_lkbidr);
+ spin_lock_init(&ls->ls_lkbidr_spin);
+
+ INIT_LIST_HEAD(&ls->ls_waiters);
+ mutex_init(&ls->ls_waiters_mutex);
+ INIT_LIST_HEAD(&ls->ls_orphans);
+ mutex_init(&ls->ls_orphans_mutex);
+
+ INIT_LIST_HEAD(&ls->ls_new_rsb);
+ spin_lock_init(&ls->ls_new_rsb_spin);
+
+ INIT_LIST_HEAD(&ls->ls_nodes);
+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
+ ls->ls_num_nodes = 0;
+ ls->ls_low_nodeid = 0;
+ ls->ls_total_weight = 0;
+ ls->ls_node_array = NULL;
+
+ memset(&ls->ls_local_rsb, 0, sizeof(struct dlm_rsb));
+ ls->ls_local_rsb.res_ls = ls;
+
+ ls->ls_debug_rsb_dentry = NULL;
+ ls->ls_debug_waiters_dentry = NULL;
+
+ init_waitqueue_head(&ls->ls_uevent_wait);
+ ls->ls_uevent_result = 0;
+ init_completion(&ls->ls_recovery_done);
+ ls->ls_recovery_result = -1;
+
+ spin_lock_init(&ls->ls_cb_lock);
+ INIT_LIST_HEAD(&ls->ls_cb_delay);
+
+ ls->ls_recoverd_task = NULL;
+ mutex_init(&ls->ls_recoverd_active);
+ spin_lock_init(&ls->ls_recover_lock);
+ spin_lock_init(&ls->ls_rcom_spin);
+ get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t));
+ ls->ls_recover_status = 0;
+ ls->ls_recover_seq = get_random_u64();
+ ls->ls_recover_args = NULL;
+ init_rwsem(&ls->ls_in_recovery);
+ init_rwsem(&ls->ls_recv_active);
+ INIT_LIST_HEAD(&ls->ls_requestqueue);
+ atomic_set(&ls->ls_requestqueue_cnt, 0);
+ init_waitqueue_head(&ls->ls_requestqueue_wait);
+ mutex_init(&ls->ls_requestqueue_mutex);
+ spin_lock_init(&ls->ls_clear_proc_locks);
+
+ /* Due to backwards compatibility with 3.1 we need to use the
+ * maximum possible dlm message size to be sure the message will
+ * fit and to avoid out-of-bounds issues. However, the 3.2 sending
+ * side might send less.
+ */
+ ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS);
+ if (!ls->ls_recover_buf)
+ goto out_lkbidr;
+
+ ls->ls_slot = 0;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
+ ls->ls_slots = NULL;
+
+ INIT_LIST_HEAD(&ls->ls_recover_list);
+ spin_lock_init(&ls->ls_recover_list_lock);
+ idr_init(&ls->ls_recover_idr);
+ spin_lock_init(&ls->ls_recover_idr_lock);
+ ls->ls_recover_list_count = 0;
+ ls->ls_local_handle = ls;
+ init_waitqueue_head(&ls->ls_wait_general);
+ INIT_LIST_HEAD(&ls->ls_root_list);
+ init_rwsem(&ls->ls_root_sem);
+
+ /* From here on the lockspace can be found through lslist. */
+ spin_lock(&lslist_lock);
+ ls->ls_create_count = 1;
+ list_add(&ls->ls_list, &lslist);
+ spin_unlock(&lslist_lock);
+
+ if (flags & DLM_LSFL_FS) {
+ error = dlm_callback_start(ls);
+ if (error) {
+ log_error(ls, "can't start dlm_callback %d", error);
+ goto out_delist;
+ }
+ }
+
+ init_waitqueue_head(&ls->ls_recover_lock_wait);
+
+ /*
+ * Once started, dlm_recoverd first looks for ls in lslist, then
+ * initializes ls_in_recovery as locked in "down" mode. We need
+ * to wait for the wakeup from dlm_recoverd because in_recovery
+ * has to start out in down mode.
+ */
+
+ error = dlm_recoverd_start(ls);
+ if (error) {
+ log_error(ls, "can't start dlm_recoverd %d", error);
+ goto out_callback;
+ }
+
+ wait_event(ls->ls_recover_lock_wait,
+ test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+
+ /* let kobject handle freeing of ls if there's an error */
+ do_unreg = 1;
+
+ ls->ls_kobj.kset = dlm_kset;
+ error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
+ "%s", ls->ls_name);
+ if (error)
+ goto out_recoverd;
+ kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
+
+ /* This uevent triggers dlm_controld in userspace to add us to the
+ group of nodes that are members of this lockspace (managed by the
+ cluster infrastructure.) Once it's done that, it tells us who the
+ current lockspace members are (via configfs) and then tells the
+ lockspace to start running (via sysfs) in dlm_ls_start(). */
+
+ error = do_uevent(ls, 1);
+ if (error)
+ goto out_recoverd;
+
+ /* wait until recovery is successful or failed */
+ wait_for_completion(&ls->ls_recovery_done);
+ error = ls->ls_recovery_result;
+ if (error)
+ goto out_members;
+
+ dlm_create_debug_file(ls);
+
+ log_rinfo(ls, "join complete");
+ *lockspace = ls;
+ /* success returns here and keeps the module reference */
+ return 0;
+
+ /* Error unwinding: each label undoes one setup stage and falls
+ * through to the labels for the earlier stages. */
+ out_members:
+ do_uevent(ls, 0);
+ dlm_clear_members(ls);
+ kfree(ls->ls_node_array);
+ out_recoverd:
+ dlm_recoverd_stop(ls);
+ out_callback:
+ dlm_callback_stop(ls);
+ out_delist:
+ spin_lock(&lslist_lock);
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
+ idr_destroy(&ls->ls_recover_idr);
+ kfree(ls->ls_recover_buf);
+ out_lkbidr:
+ idr_destroy(&ls->ls_lkbidr);
+ out_rsbtbl:
+ for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
+ kfree(ls->ls_remove_names[i]);
+ vfree(ls->ls_rsbtbl);
+ out_lsfree:
+ /* once do_unreg is set, ls is released through kobject_put()
+ * (dlm_ktype's release hook frees it) instead of kfree() */
+ if (do_unreg)
+ kobject_put(&ls->ls_kobj);
+ else
+ kfree(ls);
+ out:
+ /* also reached with error == 1 when an existing lockspace was reused */
+ module_put(THIS_MODULE);
+ return error;
+}
+
+/*
+ * Serialised wrapper around new_lockspace().
+ *
+ * Under ls_lock: starts the shared threads if no lockspace exists yet,
+ * creates or reuses the lockspace, and bumps ls_count only when a new
+ * lockspace was created. A positive result from new_lockspace()
+ * (existing lockspace reused) is reported to the caller as 0. If no
+ * lockspace exists afterwards, the shared threads are stopped again.
+ */
+static int __dlm_new_lockspace(const char *name, const char *cluster,
+			       uint32_t flags, int lvblen,
+			       const struct dlm_lockspace_ops *ops,
+			       void *ops_arg, int *ops_result,
+			       dlm_lockspace_t **lockspace)
+{
+	int rv = 0;
+
+	mutex_lock(&ls_lock);
+
+	if (ls_count == 0) {
+		rv = threads_start();
+		if (rv) {
+			mutex_unlock(&ls_lock);
+			return rv;
+		}
+	}
+
+	rv = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
+			   ops_result, lockspace);
+	if (rv == 0)
+		ls_count++;
+	else if (rv > 0)
+		rv = 0;
+
+	if (ls_count == 0) {
+		dlm_scand_stop();
+		dlm_midcomms_shutdown();
+		dlm_midcomms_stop();
+	}
+
+	mutex_unlock(&ls_lock);
+	return rv;
+}
+
+/*
+ * Create or join a lockspace with DLM_LSFL_FS added to @flags; all other
+ * arguments and the return value are those of __dlm_new_lockspace().
+ */
+int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags,
+		      int lvblen, const struct dlm_lockspace_ops *ops,
+		      void *ops_arg, int *ops_result,
+		      dlm_lockspace_t **lockspace)
+{
+	uint32_t fs_flags = flags | DLM_LSFL_FS;
+
+	return __dlm_new_lockspace(name, cluster, fs_flags, lvblen,
+				   ops, ops_arg, ops_result, lockspace);
+}
+
+/*
+ * Create or join a lockspace with @flags passed through unchanged
+ * (unlike dlm_new_lockspace(), DLM_LSFL_FS is not added here).
+ */
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
+{
+ return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
+ ops_arg, ops_result, lockspace);
+}
+
+/*
+ * idr_for_each() callback: return 1 for an lkb whose lkb_nodeid is 0 and
+ * whose lkb_grmode is not DLM_LOCK_IV, 0 otherwise. A nonzero return
+ * stops the iteration.
+ */
+static int lkb_idr_is_local(int id, void *p, void *data)
+{
+	const struct dlm_lkb *lkb = p;
+
+	if (lkb->lkb_nodeid != 0)
+		return 0;
+
+	return lkb->lkb_grmode != DLM_LOCK_IV;
+}
+
+/* idr_for_each() callback that returns 1 for every entry, so the walk
+ * reports nonzero as soon as the idr holds at least one lkb. */
+static int lkb_idr_is_any(int id, void *p, void *data)
+{
+ return 1;
+}
+
+/*
+ * idr_for_each() callback used during lockspace teardown: free the lkb,
+ * and first its lvb when one is attached and the lkb has
+ * DLM_IFL_MSTCPY_BIT set. Always returns 0 so the walk continues.
+ */
+static int lkb_idr_free(int id, void *p, void *data)
+{
+	struct dlm_lkb *lkb = p;
+
+	if (lkb->lkb_lvbptr) {
+		if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags))
+			dlm_free_lvb(lkb->lkb_lvbptr);
+	}
+
+	dlm_free_lkb(lkb);
+	return 0;
+}
+
+/* NOTE: We check the lkbidr here rather than the resource table.
+ This is because there may be LKBs queued as ASTs that have been unlinked
+ from their RSBs and are pending deletion once the AST has been delivered */
+
+/*
+ * Report whether the lockspace still holds lkbs that block its removal.
+ *
+ * @force 0: any lkb in the idr counts
+ * @force 1: only lkbs accepted by lkb_idr_is_local() count
+ * otherwise: never busy
+ *
+ * Returns nonzero when busy. The idr is walked under ls_lkbidr_spin.
+ */
+static int lockspace_busy(struct dlm_ls *ls, int force)
+{
+	int busy = 0;
+
+	spin_lock(&ls->ls_lkbidr_spin);
+	switch (force) {
+	case 0:
+		busy = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls);
+		break;
+	case 1:
+		busy = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls);
+		break;
+	default:
+		break;
+	}
+	spin_unlock(&ls->ls_lkbidr_spin);
+
+	return busy;
+}
+
+/*
+ * Drop one user reference on @ls and tear the lockspace down when it was
+ * the last one.
+ *
+ * Return values:
+ *   0        last reference dropped; the lockspace was torn down
+ *   > 0      other users remain; the value is the new ls_create_count
+ *   -EBUSY   last reference, but lockspace_busy(ls, force) is nonzero
+ *   -EINVAL  ls_create_count was not positive
+ *
+ * @force is interpreted by lockspace_busy(); additionally the "leaving"
+ * uevent is only sent when @force is below 3 and the user daemon is
+ * available.
+ */
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+ struct dlm_rsb *rsb;
+ struct rb_node *n;
+ int i, busy, rv;
+
+ busy = lockspace_busy(ls, force);
+
+ spin_lock(&lslist_lock);
+ if (ls->ls_create_count == 1) {
+ if (busy) {
+ rv = -EBUSY;
+ } else {
+ /* remove_lockspace takes ls off lslist */
+ ls->ls_create_count = 0;
+ rv = 0;
+ }
+ } else if (ls->ls_create_count > 1) {
+ rv = --ls->ls_create_count;
+ } else {
+ rv = -EINVAL;
+ }
+ spin_unlock(&lslist_lock);
+
+ /* nonzero rv: busy, invalid, or other users remain - nothing to free */
+ if (rv) {
+ log_debug(ls, "release_lockspace no remove %d", rv);
+ return rv;
+ }
+
+ if (ls_count == 1)
+ dlm_midcomms_version_wait();
+
+ dlm_device_deregister(ls);
+
+ if (force < 3 && dlm_user_daemon_available())
+ do_uevent(ls, 0);
+
+ dlm_recoverd_stop(ls);
+
+ /* this is the only remaining lockspace: stop the shared services */
+ if (ls_count == 1) {
+ dlm_scand_stop();
+ dlm_clear_members(ls);
+ dlm_midcomms_shutdown();
+ }
+
+ dlm_callback_stop(ls);
+
+ remove_lockspace(ls);
+
+ dlm_delete_debug_file(ls);
+
+ idr_destroy(&ls->ls_recover_idr);
+ kfree(ls->ls_recover_buf);
+
+ /*
+ * Free all lkb's in idr
+ */
+
+ idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls);
+ idr_destroy(&ls->ls_lkbidr);
+
+ /*
+ * Free all rsb's on rsbtbl[] lists
+ */
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].keep);
+ dlm_free_rsb(rsb);
+ }
+
+ while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].toss);
+ dlm_free_rsb(rsb);
+ }
+ }
+
+ vfree(ls->ls_rsbtbl);
+
+ for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
+ kfree(ls->ls_remove_names[i]);
+
+ while (!list_empty(&ls->ls_new_rsb)) {
+ rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
+ res_hashchain);
+ list_del(&rsb->res_hashchain);
+ dlm_free_rsb(rsb);
+ }
+
+ /*
+ * Free structures on any other lists
+ */
+
+ dlm_purge_requestqueue(ls);
+ kfree(ls->ls_recover_args);
+ dlm_clear_members(ls);
+ dlm_clear_members_gone(ls);
+ kfree(ls->ls_node_array);
+ log_rinfo(ls, "release_lockspace final free");
+ kobject_put(&ls->ls_kobj);
+ /* The ls structure will be freed when the kobject is done with */
+
+ /* pairs with try_module_get() in new_lockspace() */
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer. We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died. The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy lockspace if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+	struct dlm_ls *ls = dlm_find_lockspace_local(lockspace);
+	int rv;
+
+	if (!ls)
+		return -EINVAL;
+
+	/* the lookup only validated the handle; drop its reference again */
+	dlm_put_lockspace(ls);
+
+	mutex_lock(&ls_lock);
+	rv = release_lockspace(ls, force);
+	if (rv == 0)
+		ls_count--;
+	/* stop midcomms once no lockspace is left */
+	if (ls_count == 0)
+		dlm_midcomms_stop();
+	mutex_unlock(&ls_lock);
+
+	return rv;
+}
+
+/*
+ * Stop every lockspace that still has LSFL_RUNNING set.
+ *
+ * lslist_lock is dropped before dlm_ls_stop() is called, so the list is
+ * rescanned from the start after each stop. The final pass counts the
+ * lockspaces that are not running and logs that number if nonzero.
+ */
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+	struct dlm_ls *running;
+	int count;
+
+	do {
+		count = 0;
+		running = NULL;
+
+		spin_lock(&lslist_lock);
+		list_for_each_entry(ls, &lslist, ls_list) {
+			if (test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+				running = ls;
+				break;
+			}
+			count++;
+		}
+		spin_unlock(&lslist_lock);
+
+		if (running) {
+			log_error(running, "no userland control daemon, stopping lockspace");
+			dlm_ls_stop(running);
+		}
+	} while (running);
+
+	if (count)
+		log_print("dlm user daemon left %d lockspaces", count);
+}
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 0000000000..47ebd44119
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+/* DLM_LSFL_FS
+ * The lockspace user is in the kernel (i.e. filesystem). Enables
+ * direct bast/cast callbacks.
+ *
+ * internal lockspace flag - will be removed in future
+ */
+#define DLM_LSFL_FS 0x00000004
+
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+struct dlm_ls *dlm_find_lockspace_device(int minor);
+void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace);
+
+#endif /* __LOCKSPACE_DOT_H__ */
+
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 0000000000..32dbd1a828
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1997 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is its
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * lowcomms will choose to use either TCP or SCTP as its transport layer
+ * depending on the configuration variable 'protocol'. This should be set
+ * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
+ * cluster-wide mechanism as it must be the same on all nodes of the cluster
+ * for the DLM to function.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/sctp.h>
+#include <linux/slab.h>
+#include <net/sctp/sctp.h>
+#include <net/ipv6.h>
+
+#include <trace/events/dlm.h>
+#include <trace/events/sock.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "memory.h"
+#include "config.h"
+
+#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000)
+#define NEEDED_RMEM (4*1024*1024)
+
+/* Per-node connection state, hashed by nodeid in connection_hash. */
+struct connection {
+ struct socket *sock; /* NULL if not connected */
+ uint32_t nodeid; /* So we know who we are in the list */
+ /* This semaphore allows recv and send to run in parallel while it
+ * is held in read mode. Releasing a sock requires holding it in
+ * write mode.
+ *
+ * However this is locking code and not nice. When we remove the
+ * othercon handling we can look into other mechanisms to synchronize
+ * io handling to call sock_release() at the right time.
+ */
+ struct rw_semaphore sock_lock;
+ unsigned long flags; /* bit numbers are the CF_* values below */
+#define CF_APP_LIMITED 0
+#define CF_RECV_PENDING 1
+#define CF_SEND_PENDING 2
+#define CF_RECV_INTR 3
+#define CF_IO_STOP 4
+#define CF_IS_OTHERCON 5
+ struct list_head writequeue; /* List of outgoing writequeue_entries */
+ spinlock_t writequeue_lock;
+ int retries;
+ struct hlist_node list; /* link in a connection_hash bucket */
+ /* Due to some connect()/accept() races we currently have this
+ * cross-over connection attempt: a second connection for one node.
+ *
+ * There is a solution to avoid the race by introducing a connect
+ * rule, e.g. our_nodeid > nodeid_to_connect decides who is allowed
+ * to connect. The other side can connect but that will only be
+ * taken as the other side wanting a reconnect.
+ *
+ * However changing to this behaviour will break backwards
+ * compatibility. In a DLM protocol major version upgrade we should
+ * remove this!
+ */
+ struct connection *othercon;
+ struct work_struct rwork; /* receive worker */
+ struct work_struct swork; /* send worker */
+ wait_queue_head_t shutdown_wait;
+ unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
+ int rx_leftover;
+ int mark;
+ int addr_count;
+ int curr_addr_index;
+ struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
+ spinlock_t addrs_lock;
+ struct rcu_head rcu;
+};
+/* Fetch the connection stored in a struct sock's sk_user_data. */
+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
+
+/* State of the listening side: its socket and a work item
+ * (presumably the accept/receive worker, as for connection.rwork). */
+struct listen_connection {
+ struct socket *sock;
+ struct work_struct rwork;
+};
+
+/*
+ * Helpers for a struct writequeue_entry pointer @e:
+ * REMAIN is the distance from e->end up to PAGE_SIZE,
+ * LENGTH is the span between e->offset and e->end.
+ * The argument is parenthesized so any pointer expression is safe.
+ */
+#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - (e)->end)
+#define DLM_WQ_LENGTH_BYTES(e) ((e)->end - (e)->offset)
+
+/* An entry waiting to be sent */
+struct writequeue_entry {
+ struct list_head list; /* link in connection.writequeue */
+ struct page *page;
+ int offset; /* see DLM_WQ_LENGTH_BYTES(): end - offset */
+ int len; /* con_next_wq() treats 0 as "nothing to send" */
+ int end; /* see DLM_WQ_REMAIN_BYTES(): PAGE_SIZE - end */
+ int users; /* nonzero while users are still filling the buffer */
+ bool dirty;
+ struct connection *con;
+ struct list_head msgs; /* initialised by writequeue_entry_ctor() */
+ struct kref ref;
+};
+
+/* A message handle that refers to a writequeue entry.
+ * NOTE(review): field roles below the existing idx comment are not
+ * visible in this part of the file - confirm against the users. */
+struct dlm_msg {
+ struct writequeue_entry *entry;
+ struct dlm_msg *orig_msg;
+ bool retransmit;
+ void *ppc;
+ int len;
+ int idx; /* new()/commit() idx exchange */
+
+ struct list_head list;
+ struct kref ref;
+};
+
+/* A buffer with its length and originating nodeid, linkable into a list
+ * (presumably the file-scope processqueue - confirm against the users). */
+struct processqueue_entry {
+ unsigned char *buf;
+ int nodeid;
+ int buflen;
+
+ struct list_head list;
+};
+
+/* Set of transport-specific hooks; the active set is referenced by the
+ * file-scope dlm_proto_ops pointer. Presumably one instance exists per
+ * supported transport (TCP/SCTP) - confirm against the definitions. */
+struct dlm_proto_ops {
+ bool try_new_addr;
+ const char *name;
+ int proto;
+
+ int (*connect)(struct connection *con, struct socket *sock,
+ struct sockaddr *addr, int addr_len);
+ void (*sockopts)(struct socket *sock);
+ int (*bind)(struct socket *sock);
+ int (*listen_validate)(void);
+ void (*listen_sockopts)(struct socket *sock);
+ int (*listen_bind)(struct socket *sock);
+};
+
+/* Holder for four struct sock callback pointers; presumably the
+ * listening socket's original callbacks are saved here - confirm. */
+static struct listen_sock_callbacks {
+ void (*sk_error_report)(struct sock *);
+ void (*sk_data_ready)(struct sock *);
+ void (*sk_state_change)(struct sock *);
+ void (*sk_write_space)(struct sock *);
+} listen_sock;
+
+/* dlm_lowcomms_is_running() reports true while listen_con.sock is set. */
+static struct listen_connection listen_con;
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
+
+/* Work queues */
+/* io_workqueue runs the per-connection swork/rwork items. */
+static struct workqueue_struct *io_workqueue;
+static struct workqueue_struct *process_workqueue;
+
+/* Connections hashed by nodeid_hash(nodeid); nodeid2con() adds entries
+ * while holding connections_lock and __find_con() walks a bucket with
+ * the RCU list iterator. */
+static struct hlist_head connection_hash[CONN_HASH_SIZE];
+static DEFINE_SPINLOCK(connections_lock);
+DEFINE_STATIC_SRCU(connections_srcu);
+
+static const struct dlm_proto_ops *dlm_proto_ops;
+
+#define DLM_IO_SUCCESS 0
+#define DLM_IO_END 1
+#define DLM_IO_EOF 2
+#define DLM_IO_RESCHED 3
+
+/* Work handlers defined later in the file. */
+static void process_recv_sockets(struct work_struct *work);
+static void process_send_sockets(struct work_struct *work);
+static void process_dlm_messages(struct work_struct *work);
+
+static DECLARE_WORK(process_work, process_dlm_messages);
+static DEFINE_SPINLOCK(processqueue_lock);
+static bool process_dlm_messages_pending;
+static LIST_HEAD(processqueue);
+
+/* Return true while the listen connection has a socket. */
+bool dlm_lowcomms_is_running(void)
+{
+	return listen_con.sock != NULL;
+}
+
+/*
+ * Queue the connection's send work on io_workqueue unless IO is stopped
+ * (CF_IO_STOP), the connection is application limited (CF_APP_LIMITED),
+ * or send work is already pending. CF_SEND_PENDING is set atomically as
+ * part of the last check. The caller must hold writequeue_lock.
+ */
+static void lowcomms_queue_swork(struct connection *con)
+{
+	assert_spin_locked(&con->writequeue_lock);
+
+	if (test_bit(CF_IO_STOP, &con->flags))
+		return;
+
+	if (test_bit(CF_APP_LIMITED, &con->flags))
+		return;
+
+	if (test_and_set_bit(CF_SEND_PENDING, &con->flags))
+		return;
+
+	queue_work(io_workqueue, &con->swork);
+}
+
+/*
+ * Queue the connection's receive work on io_workqueue unless IO is
+ * stopped (CF_IO_STOP) or receive work is already pending.
+ * CF_RECV_PENDING is set atomically as part of the last check. With
+ * lockdep enabled, warns once if the socket lock is not held.
+ */
+static void lowcomms_queue_rwork(struct connection *con)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
+#endif
+
+	if (test_bit(CF_IO_STOP, &con->flags))
+		return;
+
+	if (test_and_set_bit(CF_RECV_PENDING, &con->flags))
+		return;
+
+	queue_work(io_workqueue, &con->rwork);
+}
+
+/* Slab constructor for writequeue entries: initialise the msgs list. */
+static void writequeue_entry_ctor(void *data)
+{
+	struct list_head *msgs = &((struct writequeue_entry *)data)->msgs;
+
+	INIT_LIST_HEAD(msgs);
+}
+
+/* Create the "dlm_writequeue" slab cache for struct writequeue_entry,
+ * using writequeue_entry_ctor() as constructor. Returns whatever
+ * kmem_cache_create() returns. */
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void)
+{
+ return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry),
+ 0, 0, writequeue_entry_ctor);
+}
+
+/* Create the "dlm_msg" slab cache for struct dlm_msg (no constructor).
+ * Returns whatever kmem_cache_create() returns. */
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void)
+{
+ return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL);
+}
+
+/*
+ * Return the first writequeue entry of @con if it is ready to be sent,
+ * otherwise NULL. The caller must hold writequeue_lock.
+ */
+static struct writequeue_entry *con_next_wq(struct connection *con)
+{
+	struct writequeue_entry *head;
+
+	head = list_first_entry_or_null(&con->writequeue,
+					struct writequeue_entry, list);
+	if (!head)
+		return NULL;
+
+	/* users are still filling the buffer; wait until they are done */
+	if (head->users)
+		return NULL;
+
+	/* a zero len means there is nothing to send */
+	if (head->len == 0)
+		return NULL;
+
+	return head;
+}
+
+static struct connection *__find_con(int nodeid, int r)
+{
+ struct connection *con;
+
+ hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
+ if (con->nodeid == nodeid)
+ return con;
+ }
+
+ return NULL;
+}
+
+static void dlm_con_init(struct connection *con, int nodeid)
+{
+ con->nodeid = nodeid;
+ init_rwsem(&con->sock_lock);
+ INIT_LIST_HEAD(&con->writequeue);
+ spin_lock_init(&con->writequeue_lock);
+ INIT_WORK(&con->swork, process_send_sockets);
+ INIT_WORK(&con->rwork, process_recv_sockets);
+ spin_lock_init(&con->addrs_lock);
+ init_waitqueue_head(&con->shutdown_wait);
+}
+
+/*
+ * If 'alloc' is zero then we don't attempt to create a new
+ * connection structure for this node.
+ */
+static struct connection *nodeid2con(int nodeid, gfp_t alloc)
+{
+ struct connection *con, *tmp;
+ int r;
+
+ r = nodeid_hash(nodeid);
+ con = __find_con(nodeid, r);
+ if (con || !alloc)
+ return con;
+
+ con = kzalloc(sizeof(*con), alloc);
+ if (!con)
+ return NULL;
+
+ dlm_con_init(con, nodeid);
+
+ spin_lock(&connections_lock);
+ /* Because multiple workqueues/threads calls this function it can
+ * race on multiple cpu's. Instead of locking hot path __find_con()
+ * we just check in rare cases of recently added nodes again
+ * under protection of connections_lock. If this is the case we
+ * abort our connection creation and return the existing connection.
+ */
+ tmp = __find_con(nodeid, r);
+ if (tmp) {
+ spin_unlock(&connections_lock);
+ kfree(con);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&con->list, &connection_hash[r]);
+ spin_unlock(&connections_lock);
+
+ return con;
+}
+
+static int addr_compare(const struct sockaddr_storage *x,
+ const struct sockaddr_storage *y)
+{
+ switch (x->ss_family) { /* only x's family is inspected, y is read as the same type */
+ case AF_INET: {
+ const struct sockaddr_in *sinx = (const struct sockaddr_in *)x;
+ const struct sockaddr_in *siny = (const struct sockaddr_in *)y;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x;
+ const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ break;
+ }
+ default: /* any other family never matches */
+ return 0;
+ }
+ return 1; /* address and port are equal */
+}
+
+static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
+ struct sockaddr *sa_out, bool try_new_addr,
+ unsigned int *mark)
+{
+ struct sockaddr_storage sas;
+ struct connection *con;
+ int idx;
+
+ if (!dlm_local_count)
+ return -1;
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ memcpy(&sas, &con->addr[con->curr_addr_index],
+ sizeof(struct sockaddr_storage));
+
+ if (try_new_addr) {
+ con->curr_addr_index++;
+ if (con->curr_addr_index == con->addr_count)
+ con->curr_addr_index = 0;
+ }
+
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
+
+ if (sas_out)
+ memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
+
+ if (!sa_out) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
+
+ if (dlm_local_addr[0].ss_family == AF_INET) {
+ struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
+ struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
+ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+ } else {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas;
+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out;
+ ret6->sin6_addr = in6->sin6_addr;
+ }
+
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+}
+
+static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
+ unsigned int *mark)
+{
+ struct connection *con;
+ int i, idx, addr_i;
+
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ WARN_ON_ONCE(!con->addr_count);
+
+ spin_lock(&con->addrs_lock);
+ for (addr_i = 0; addr_i < con->addr_count; addr_i++) {
+ if (addr_compare(&con->addr[addr_i], addr)) {
+ *nodeid = con->nodeid;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
+ }
+ spin_unlock(&con->addrs_lock);
+ }
+ }
+ srcu_read_unlock(&connections_srcu, idx);
+
+ return -ENOENT;
+}
+
+static bool dlm_lowcomms_con_has_addr(const struct connection *con,
+ const struct sockaddr_storage *addr)
+{
+ int i;
+
+ for (i = 0; i < con->addr_count; i++) {
+ if (addr_compare(&con->addr[i], addr))
+ return true;
+ }
+
+ return false;
+}
+
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+{
+ struct connection *con;
+ int idx; /* srcu_read_lock() hands back an int index */
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, GFP_NOFS);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOMEM;
+ }
+
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ memcpy(&con->addr[0], addr, sizeof(*addr));
+ con->addr_count = 1;
+ con->mark = dlm_config.ci_mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
+
+ if (dlm_lowcomms_con_has_addr(con, addr)) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -EEXIST;
+ }
+
+ if (con->addr_count >= DLM_MAX_ADDR_COUNT) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOSPC;
+ }
+
+ memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr));
+ /* drop addrs_lock while the SRCU read section still pins con */
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+}
+
+/* Data available on socket or listen socket received a connect */
+static void lowcomms_data_ready(struct sock *sk)
+{
+ struct connection *con = sock2con(sk);
+
+ trace_sk_data_ready(sk);
+
+ set_bit(CF_RECV_INTR, &con->flags);
+ lowcomms_queue_rwork(con);
+}
+
+static void lowcomms_write_space(struct sock *sk)
+{
+ struct connection *con = sock2con(sk);
+
+ clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+ spin_lock_bh(&con->writequeue_lock);
+ if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+ con->sock->sk->sk_write_pending--;
+ clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
+ }
+
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+}
+
+static void lowcomms_state_change(struct sock *sk)
+{
+ /* SCTP layer is not calling sk_data_ready when the connection
+ * is done, so we catch the signal through here.
+ */
+ if (sk->sk_shutdown == RCV_SHUTDOWN)
+ lowcomms_data_ready(sk);
+}
+
+static void lowcomms_listen_data_ready(struct sock *sk)
+{
+ trace_sk_data_ready(sk);
+
+ queue_work(io_workqueue, &listen_con.rwork);
+}
+
+int dlm_lowcomms_connect_node(int nodeid)
+{
+ struct connection *con;
+ int idx;
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+ up_read(&con->sock_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+
+ cond_resched();
+ return 0;
+}
+
+int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
+{
+ struct connection *con;
+ int idx;
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ spin_lock(&con->addrs_lock);
+ con->mark = mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+}
+
+static void lowcomms_error_report(struct sock *sk)
+{
+ struct connection *con = sock2con(sk);
+ struct inet_sock *inet;
+
+ inet = inet_sk(sk);
+ switch (sk->sk_family) {
+ case AF_INET:
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "sending to node %d at %pI4, dport %d, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, &inet->inet_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
+ READ_ONCE(sk->sk_err_soft));
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "sending to node %d at %pI6c, "
+ "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
+ READ_ONCE(sk->sk_err_soft));
+ break;
+#endif
+ default:
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "invalid socket family %d set, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ sk->sk_family, sk->sk_err,
+ READ_ONCE(sk->sk_err_soft));
+ break;
+ }
+
+ dlm_midcomms_unack_msg_resend(con->nodeid);
+
+ listen_sock.sk_error_report(sk);
+}
+
+static void restore_callbacks(struct sock *sk)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(sk));
+#endif
+
+ sk->sk_user_data = NULL;
+ sk->sk_data_ready = listen_sock.sk_data_ready;
+ sk->sk_state_change = listen_sock.sk_state_change;
+ sk->sk_write_space = listen_sock.sk_write_space;
+ sk->sk_error_report = listen_sock.sk_error_report;
+}
+
+/* Make a socket active */
+static void add_sock(struct socket *sock, struct connection *con)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+ con->sock = sock;
+
+ sk->sk_user_data = con;
+ sk->sk_data_ready = lowcomms_data_ready;
+ sk->sk_write_space = lowcomms_write_space;
+ if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
+ sk->sk_state_change = lowcomms_state_change;
+ sk->sk_allocation = GFP_NOFS;
+ sk->sk_use_task_frag = false;
+ sk->sk_error_report = lowcomms_error_report;
+ release_sock(sk);
+}
+
+/* Set the family and port of an IPv4 or IPv6 sockaddr, zero the unused
+ tail and store the resulting address length in *addr_len */
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+ int *addr_len)
+{
+ saddr->ss_family = dlm_local_addr[0].ss_family;
+ if (saddr->ss_family == AF_INET) {
+ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+ in4_addr->sin_port = cpu_to_be16(port);
+ *addr_len = sizeof(struct sockaddr_in);
+ memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+ } else {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+ in6_addr->sin6_port = cpu_to_be16(port);
+ *addr_len = sizeof(struct sockaddr_in6);
+ }
+ memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
+}
+
+static void dlm_page_release(struct kref *kref)
+{
+ struct writequeue_entry *e = container_of(kref, struct writequeue_entry,
+ ref);
+
+ __free_page(e->page);
+ dlm_free_writequeue(e);
+}
+
+static void dlm_msg_release(struct kref *kref)
+{
+ struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
+
+ kref_put(&msg->entry->ref, dlm_page_release);
+ dlm_free_msg(msg);
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+ struct dlm_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &e->msgs, list) {
+ if (msg->orig_msg) {
+ msg->orig_msg->retransmit = false;
+ kref_put(&msg->orig_msg->ref, dlm_msg_release);
+ }
+
+ list_del(&msg->list);
+ kref_put(&msg->ref, dlm_msg_release);
+ }
+
+ list_del(&e->list);
+ kref_put(&e->ref, dlm_page_release);
+}
+
+static void dlm_close_sock(struct socket **sock)
+{
+ lock_sock((*sock)->sk);
+ restore_callbacks((*sock)->sk);
+ release_sock((*sock)->sk);
+
+ sock_release(*sock);
+ *sock = NULL;
+}
+
+static void allow_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ clear_bit(CF_IO_STOP, &con->othercon->flags);
+ clear_bit(CF_IO_STOP, &con->flags);
+}
+
+static void stop_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ stop_connection_io(con->othercon);
+
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+
+ down_write(&con->sock_lock);
+ if (con->sock) {
+ lock_sock(con->sock->sk);
+ restore_callbacks(con->sock->sk);
+ release_sock(con->sock->sk);
+ }
+ up_write(&con->sock_lock);
+
+ cancel_work_sync(&con->swork);
+ cancel_work_sync(&con->rwork);
+}
+
+/* Close a remote connection and tidy up */
+static void close_connection(struct connection *con, bool and_other)
+{
+ struct writequeue_entry *e;
+
+ if (con->othercon && and_other)
+ close_connection(con->othercon, false);
+
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ up_write(&con->sock_lock);
+ return;
+ }
+
+ dlm_close_sock(&con->sock);
+
+ /* if we send a writequeue entry only a half way, we drop the
+ * whole entry because reconnection and that we not start of the
+ * middle of a msg which will confuse the other end.
+ *
+ * we can always drop messages because retransmits, but what we
+ * cannot allow is to transmit half messages which may be processed
+ * at the other side.
+ *
+ * our policy is to start on a clean state when disconnects, we don't
+ * know what's send/received on transport layer in this case.
+ */
+ spin_lock_bh(&con->writequeue_lock);
+ if (!list_empty(&con->writequeue)) {
+ e = list_first_entry(&con->writequeue, struct writequeue_entry,
+ list);
+ if (e->dirty)
+ free_entry(e);
+ }
+ spin_unlock_bh(&con->writequeue_lock);
+
+ con->rx_leftover = 0;
+ con->retries = 0;
+ clear_bit(CF_APP_LIMITED, &con->flags);
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ up_write(&con->sock_lock);
+}
+
+static void shutdown_connection(struct connection *con, bool and_other)
+{
+ int ret;
+
+ if (con->othercon && and_other)
+ shutdown_connection(con->othercon, false);
+
+ flush_workqueue(io_workqueue);
+ down_read(&con->sock_lock);
+ /* nothing to shutdown */
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ return;
+ }
+
+ ret = kernel_sock_shutdown(con->sock, SHUT_WR);
+ up_read(&con->sock_lock);
+ if (ret) {
+ log_print("Connection %p failed to shutdown: %d will force close",
+ con, ret);
+ goto force_close;
+ } else {
+ ret = wait_event_timeout(con->shutdown_wait, !con->sock,
+ DLM_SHUTDOWN_WAIT_TIMEOUT);
+ if (ret == 0) {
+ log_print("Connection %p shutdown timed out, will force close",
+ con);
+ goto force_close;
+ }
+ }
+
+ return;
+
+force_close:
+ close_connection(con, false);
+}
+
+static struct processqueue_entry *new_processqueue_entry(int nodeid,
+ int buflen)
+{
+ struct processqueue_entry *pentry;
+
+ pentry = kmalloc(sizeof(*pentry), GFP_NOFS);
+ if (!pentry)
+ return NULL;
+
+ pentry->buf = kmalloc(buflen, GFP_NOFS);
+ if (!pentry->buf) {
+ kfree(pentry);
+ return NULL;
+ }
+
+ pentry->nodeid = nodeid;
+ return pentry;
+}
+
+static void free_processqueue_entry(struct processqueue_entry *pentry)
+{
+ kfree(pentry->buf);
+ kfree(pentry);
+}
+
+struct dlm_processed_nodes {
+ int nodeid;
+
+ struct list_head list;
+};
+
+static void process_dlm_messages(struct work_struct *work)
+{
+ struct processqueue_entry *pentry;
+
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (WARN_ON_ONCE(!pentry)) {
+ process_dlm_messages_pending = false;
+ spin_unlock(&processqueue_lock);
+ return;
+ }
+
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
+
+ for (;;) {
+ dlm_process_incoming_buffer(pentry->nodeid, pentry->buf,
+ pentry->buflen);
+ free_processqueue_entry(pentry);
+
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (!pentry) {
+ process_dlm_messages_pending = false;
+ spin_unlock(&processqueue_lock);
+ break;
+ }
+
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
+ }
+}
+
+/* Data received from remote end */
+static int receive_from_sock(struct connection *con, int buflen)
+{
+ struct processqueue_entry *pentry;
+ int ret, buflen_real;
+ struct msghdr msg;
+ struct kvec iov;
+
+ pentry = new_processqueue_entry(con->nodeid, buflen);
+ if (!pentry)
+ return DLM_IO_RESCHED;
+
+ memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover);
+
+ /* receive behind the leftover bytes of the last receive, which
+ * were just copied to the start of the new buffer
+ */
+ iov.iov_base = pentry->buf + con->rx_leftover;
+ iov.iov_len = buflen - con->rx_leftover;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ clear_bit(CF_RECV_INTR, &con->flags);
+again:
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
+ if (ret == -EAGAIN) {
+ lock_sock(con->sock->sk);
+ if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) {
+ release_sock(con->sock->sk);
+ goto again;
+ }
+
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ release_sock(con->sock->sk);
+ free_processqueue_entry(pentry);
+ return DLM_IO_END;
+ } else if (ret == 0) {
+ /* close will clear CF_RECV_PENDING */
+ free_processqueue_entry(pentry);
+ return DLM_IO_EOF;
+ } else if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
+ }
+
+ /* new buflen: bytes just read plus leftover from the last receive */
+ buflen_real = ret + con->rx_leftover;
+ ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
+ buflen_real);
+ if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
+ }
+
+ pentry->buflen = ret;
+
+ /* stash the bytes past the length returned by
+ * dlm_validate_incoming_buffer() in rx_leftover_buf; the next
+ * receive copies them to the start of its buffer.
+ */
+ con->rx_leftover = buflen_real - ret;
+ memmove(con->rx_leftover_buf, pentry->buf + ret,
+ con->rx_leftover);
+
+ spin_lock(&processqueue_lock);
+ list_add_tail(&pentry->list, &processqueue);
+ if (!process_dlm_messages_pending) {
+ process_dlm_messages_pending = true;
+ queue_work(process_workqueue, &process_work);
+ }
+ spin_unlock(&processqueue_lock);
+
+ return DLM_IO_SUCCESS;
+}
+
+/* Listening socket is busy, accept a connection */
+static int accept_from_sock(void)
+{
+ struct sockaddr_storage peeraddr;
+ int len, idx, result, nodeid;
+ struct connection *newcon;
+ struct socket *newsock;
+ unsigned int mark;
+
+ result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK);
+ if (result == -EAGAIN)
+ return DLM_IO_END;
+ else if (result < 0)
+ goto accept_err;
+
+ /* Get the connected socket's peer */
+ memset(&peeraddr, 0, sizeof(peeraddr));
+ len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2);
+ if (len < 0) {
+ result = -ECONNABORTED;
+ goto accept_err;
+ }
+
+ /* Get the new node's NODEID */
+ make_sockaddr(&peeraddr, 0, &len);
+ if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) {
+ switch (peeraddr.ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr;
+
+ log_print("connect from non cluster IPv4 node %pI4",
+ &sin->sin_addr);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr;
+
+ log_print("connect from non cluster IPv6 node %pI6c",
+ &sin6->sin6_addr);
+ break;
+ }
+#endif
+ default:
+ log_print("invalid family from non cluster node");
+ break;
+ }
+
+ sock_release(newsock);
+ return -1;
+ }
+
+ log_print("got connection from %d", nodeid);
+
+ /* Check to see if we already have a connection to this node. This
+ * could happen if the two nodes initiate a connection at roughly
+ * the same time and the connections cross on the wire.
+ * In this case we store the incoming one in "othercon"
+ */
+ idx = srcu_read_lock(&connections_srcu);
+ newcon = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!newcon)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ result = -ENOENT;
+ goto accept_err;
+ }
+
+ sock_set_mark(newsock->sk, mark);
+
+ down_write(&newcon->sock_lock);
+ if (newcon->sock) {
+ struct connection *othercon = newcon->othercon;
+
+ if (!othercon) {
+ othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
+ if (!othercon) {
+ log_print("failed to allocate incoming socket");
+ up_write(&newcon->sock_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ result = -ENOMEM;
+ goto accept_err;
+ }
+
+ dlm_con_init(othercon, nodeid);
+ lockdep_set_subclass(&othercon->sock_lock, 1);
+ newcon->othercon = othercon;
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
+ } else {
+ /* close other sock con if we have something new */
+ close_connection(othercon, false);
+ }
+
+ down_write(&othercon->sock_lock);
+ add_sock(newsock, othercon);
+
+ /* check if we received something while adding */
+ lock_sock(othercon->sock->sk);
+ lowcomms_queue_rwork(othercon);
+ release_sock(othercon->sock->sk);
+ up_write(&othercon->sock_lock);
+ }
+ else {
+ /* accept copies the sk after we've saved the callbacks, so we
+ don't want to save them a second time or comm errors will
+ result in calling sk_error_report recursively. */
+ add_sock(newsock, newcon);
+
+ /* check if we received something while adding */
+ lock_sock(newcon->sock->sk);
+ lowcomms_queue_rwork(newcon);
+ release_sock(newcon->sock->sk);
+ }
+ up_write(&newcon->sock_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+
+ return DLM_IO_SUCCESS;
+
+accept_err:
+ if (newsock)
+ sock_release(newsock);
+
+ return result;
+}
+
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+ /* signal that page was half way transmitted */
+ e->dirty = true;
+
+ if (e->len == 0 && e->users == 0)
+ free_entry(e);
+}
+
+/*
+ * sctp_bind_addrs - bind a SCTP socket to all our addresses
+ */
+static int sctp_bind_addrs(struct socket *sock, uint16_t port)
+{
+ struct sockaddr_storage localaddr;
+ struct sockaddr *addr = (struct sockaddr *)&localaddr;
+ int i, addr_len, result = 0;
+
+ for (i = 0; i < dlm_local_count; i++) {
+ memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
+ make_sockaddr(&localaddr, port, &addr_len);
+
+ if (!i) /* first address binds the socket, the rest are added to it */
+ result = kernel_bind(sock, addr, addr_len);
+ else
+ result = sock_bind_add(sock->sk, addr, addr_len);
+
+ if (result < 0) { /* stop at the first failure and report it */
+ log_print("Can't bind to %d addr number %d, %d.",
+ port, i + 1, result);
+ break;
+ }
+ }
+ return result; /* 0 if dlm_local_count is 0, else the last bind result */
+}
+
+/* Get local addresses */
+static void init_local(void)
+{
+ struct sockaddr_storage sas;
+ int i;
+
+ dlm_local_count = 0;
+ for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {
+ if (dlm_our_addr(&sas, i))
+ break;
+
+ memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
+ }
+}
+
+static struct writequeue_entry *new_writequeue_entry(struct connection *con)
+{
+ struct writequeue_entry *entry;
+
+ entry = dlm_allocate_writequeue();
+ if (!entry)
+ return NULL;
+
+ entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!entry->page) {
+ dlm_free_writequeue(entry);
+ return NULL;
+ }
+
+ entry->offset = 0;
+ entry->len = 0;
+ entry->end = 0;
+ entry->dirty = false;
+ entry->con = con;
+ entry->users = 1;
+ kref_init(&entry->ref);
+ return entry;
+}
+
+static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
+ char **ppc, void (*cb)(void *data),
+ void *data)
+{
+ struct writequeue_entry *e;
+
+ spin_lock_bh(&con->writequeue_lock);
+ if (!list_empty(&con->writequeue)) {
+ e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
+ if (DLM_WQ_REMAIN_BYTES(e) >= len) { /* reuse the tail entry's page */
+ kref_get(&e->ref); /* put again in dlm_msg_release() */
+
+ *ppc = page_address(e->page) + e->end;
+ if (cb)
+ cb(data);
+
+ e->end += len;
+ e->users++; /* dropped in _dlm_lowcomms_commit_msg() */
+ goto out;
+ }
+ }
+
+ e = new_writequeue_entry(con); /* starts with users == 1 */
+ if (!e)
+ goto out; /* returns NULL to the caller */
+
+ kref_get(&e->ref); /* put again in dlm_msg_release() */
+ *ppc = page_address(e->page);
+ e->end += len;
+ if (cb)
+ cb(data);
+
+ list_add_tail(&e->list, &con->writequeue);
+
+out:
+ spin_unlock_bh(&con->writequeue_lock);
+ return e;
+}
+
+static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
+ gfp_t allocation, char **ppc,
+ void (*cb)(void *data),
+ void *data)
+{
+ struct writequeue_entry *e;
+ struct dlm_msg *msg;
+
+ msg = dlm_allocate_msg(allocation);
+ if (!msg)
+ return NULL;
+
+ kref_init(&msg->ref);
+
+ e = new_wq_entry(con, len, ppc, cb, data);
+ if (!e) {
+ dlm_free_msg(msg);
+ return NULL;
+ }
+
+ msg->retransmit = false;
+ msg->orig_msg = NULL;
+ msg->ppc = *ppc;
+ msg->len = len;
+ msg->entry = e;
+
+ return msg;
+}
+
+/* avoid a checker false positive for connections_srcu, the unlock happens
+ * in dlm_lowcomms_commit_msg which must be called on success
+ */
+#ifndef __CHECKER__
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(void *data),
+ void *data)
+{
+ struct connection *con;
+ struct dlm_msg *msg;
+ int idx;
+
+ if (len > DLM_MAX_SOCKET_BUFSIZE ||
+ len < (int)sizeof(struct dlm_header)) { /* signed compare also rejects len < 0 */
+ BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
+ log_print("failed to allocate a buffer of size %d", len);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return NULL;
+ }
+
+ msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data);
+ if (!msg) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return NULL;
+ }
+
+ /* extra reference, dropped in dlm_lowcomms_commit_msg() */
+ kref_get(&msg->ref);
+ /* on success commit must be called, it unlocks SRCU with this idx */
+ msg->idx = idx;
+ return msg;
+}
+#endif
+
+static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
+{
+ struct writequeue_entry *e = msg->entry;
+ struct connection *con = e->con;
+ int users;
+
+ spin_lock_bh(&con->writequeue_lock);
+ kref_get(&msg->ref);
+ list_add(&msg->list, &e->msgs);
+
+ users = --e->users;
+ if (users)
+ goto out;
+
+ e->len = DLM_WQ_LENGTH_BYTES(e);
+
+ lowcomms_queue_swork(con);
+
+out:
+ spin_unlock_bh(&con->writequeue_lock);
+ return;
+}
+
+/* avoid a checker false positive for connections_srcu, the lock was
+ * taken in dlm_lowcomms_new_msg
+ */
+#ifndef __CHECKER__
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
+{
+ _dlm_lowcomms_commit_msg(msg);
+ srcu_read_unlock(&connections_srcu, msg->idx);
+ /* because dlm_lowcomms_new_msg() */
+ kref_put(&msg->ref, dlm_msg_release);
+}
+#endif
+
+void dlm_lowcomms_put_msg(struct dlm_msg *msg)
+{
+ kref_put(&msg->ref, dlm_msg_release);
+}
+
+/* does not hold connections_srcu, for use from lowcomms_error_report only */
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
+{
+ struct dlm_msg *msg_resend;
+ char *ppc;
+
+ if (msg->retransmit)
+ return 1;
+
+ msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len,
+ GFP_ATOMIC, &ppc, NULL, NULL);
+ if (!msg_resend)
+ return -ENOMEM;
+
+ msg->retransmit = true;
+ kref_get(&msg->ref);
+ msg_resend->orig_msg = msg;
+
+ memcpy(ppc, msg->ppc, msg->len);
+ _dlm_lowcomms_commit_msg(msg_resend);
+ dlm_lowcomms_put_msg(msg_resend);
+
+ return 0;
+}
+
+/* Send a message */
+static int send_to_sock(struct connection *con)
+{
+ struct writequeue_entry *e;
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL,
+ };
+ int len, offset, ret;
+
+ spin_lock_bh(&con->writequeue_lock);
+ e = con_next_wq(con);
+ if (!e) {
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ return DLM_IO_END;
+ }
+
+ len = e->len;
+ offset = e->offset;
+ WARN_ON_ONCE(len == 0 && e->users == 0);
+ spin_unlock_bh(&con->writequeue_lock);
+
+ bvec_set_page(&bvec, e->page, len, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+ ret = sock_sendmsg(con->sock, &msg);
+ trace_dlm_send(con->nodeid, ret);
+ if (ret == -EAGAIN || ret == 0) {
+ lock_sock(con->sock->sk);
+ spin_lock_bh(&con->writequeue_lock);
+ if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags);
+ con->sock->sk->sk_write_pending++;
+
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+
+ /* wait for write_space() event */
+ return DLM_IO_END;
+ }
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+
+ return DLM_IO_RESCHED;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+ spin_lock_bh(&con->writequeue_lock);
+ writequeue_entry_complete(e, ret);
+ spin_unlock_bh(&con->writequeue_lock);
+
+ return DLM_IO_SUCCESS;
+}
+
+static void clean_one_writequeue(struct connection *con)
+{
+ struct writequeue_entry *e, *safe;
+
+ spin_lock_bh(&con->writequeue_lock);
+ list_for_each_entry_safe(e, safe, &con->writequeue, list) {
+ free_entry(e);
+ }
+ spin_unlock_bh(&con->writequeue_lock);
+}
+
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ WARN_ON_ONCE(!list_empty(&con->writequeue));
+ WARN_ON_ONCE(con->sock);
+ kfree(con);
+}
+
+/* Called from recovery when it knows that a node has
+ left the cluster */
+int dlm_lowcomms_close(int nodeid)
+{
+ struct connection *con;
+ int idx;
+
+ log_print("closing connection to node %d", nodeid);
+
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ stop_connection_io(con);
+ log_print("io handling for node: %d stopped", nodeid);
+ close_connection(con, true);
+
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ clean_one_writequeue(con);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
+ call_srcu(&connections_srcu, &con->othercon->rcu, connection_release);
+ }
+ srcu_read_unlock(&connections_srcu, idx);
+
+ /* for debugging we print when we are done to compare with other
+ * messages in between. This function need to be correctly synchronized
+ * with io handling
+ */
+ log_print("closing connection to node %d done", nodeid);
+
+ return 0;
+}
+
+/* Receive worker function */
+static void process_recv_sockets(struct work_struct *work)
+{
+ struct connection *con = container_of(work, struct connection, rwork);
+ int ret, buflen;
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ return;
+ }
+
+ buflen = READ_ONCE(dlm_config.ci_buffer_size);
+ do {
+ ret = receive_from_sock(con, buflen);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
+
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_EOF:
+ close_connection(con, false);
+ wake_up(&con->shutdown_wait);
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ cond_resched();
+ queue_work(io_workqueue, &con->rwork);
+ /* CF_RECV_PENDING not cleared */
+ break;
+ default:
+ if (ret < 0) {
+ if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+ close_connection(con, false);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+
+ /* CF_RECV_PENDING cleared for othercon
+ * we trigger send queue if not already done
+ * and process_send_sockets will handle it
+ */
+ break;
+ }
+
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void process_listen_recv_socket(struct work_struct *work)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(!listen_con.sock))
+ return;
+
+ do {
+ ret = accept_from_sock();
+ } while (ret == DLM_IO_SUCCESS);
+
+ if (ret < 0)
+ log_print("critical error accepting connection: %d", ret);
+}
+
+/* Create, configure and bind a socket for @con, attach it with add_sock()
+ * and start connecting to the address looked up for con->nodeid.
+ *
+ * Returns the result of the protocol connect operation (0 and -EINPROGRESS
+ * leave the socket in place; any other negative value closes it), or the
+ * negative error of an earlier step.
+ */
+static int dlm_connect(struct connection *con)
+{
+	struct sockaddr_storage peer;
+	struct socket *sock;
+	unsigned int mark;
+	int peer_len;
+	int rv;
+
+	memset(&peer, 0, sizeof(peer));
+	rv = nodeid_to_addr(con->nodeid, &peer, NULL,
+			    dlm_proto_ops->try_new_addr, &mark);
+	if (rv < 0) {
+		log_print("no address for nodeid %d", con->nodeid);
+		return rv;
+	}
+
+	/* Create a socket to communicate with */
+	rv = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
+			      SOCK_STREAM, dlm_proto_ops->proto, &sock);
+	if (rv < 0)
+		return rv;
+
+	sock_set_mark(sock->sk, mark);
+	dlm_proto_ops->sockopts(sock);
+
+	rv = dlm_proto_ops->bind(sock);
+	if (rv < 0) {
+		sock_release(sock);
+		return rv;
+	}
+
+	add_sock(sock, con);
+
+	log_print_ratelimited("connecting to %d", con->nodeid);
+	make_sockaddr(&peer, dlm_config.ci_tcp_port, &peer_len);
+	rv = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&peer,
+				    peer_len);
+
+	/* -EINPROGRESS is not an error; every other failure drops the socket */
+	if (rv < 0 && rv != -EINPROGRESS)
+		dlm_close_sock(&con->sock);
+
+	return rv;
+}
+
+/* Send worker function
+ *
+ * Runs as con->swork. When the connection has no socket yet, one is set up
+ * through dlm_connect() while holding con->sock_lock for writing; afterwards
+ * send_to_sock() is called for as long as it returns DLM_IO_SUCCESS.
+ */
+static void process_send_sockets(struct work_struct *work)
+{
+ struct connection *con = container_of(work, struct connection, swork);
+ int ret;
+
+ WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags));
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ /* swap the read lock for the write lock; con->sock is checked
+ * again below because the lock is dropped in between
+ */
+ up_read(&con->sock_lock);
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ ret = dlm_connect(con);
+ switch (ret) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+ /* avoid spamming resched on connection
+ * we might be able to switch to a state_change
+ * event based mechanism if established
+ */
+ msleep(100);
+ break;
+ default:
+ /* CF_SEND_PENDING not cleared */
+ up_write(&con->sock_lock);
+ log_print("connect to node %d try %d error %d",
+ con->nodeid, con->retries++, ret);
+ msleep(1000);
+ /* For now we try forever to reconnect. In
+ * future we should send an event to cluster
+ * manager to fence itself after certain amount
+ * of retries.
+ */
+ queue_work(io_workqueue, &con->swork);
+ return;
+ }
+ }
+ /* continue as a reader, matching the up_read() after the send loop */
+ downgrade_write(&con->sock_lock);
+ }
+
+ do {
+ ret = send_to_sock(con);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
+
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_SEND_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ /* CF_SEND_PENDING not cleared */
+ cond_resched();
+ queue_work(io_workqueue, &con->swork);
+ break;
+ default:
+ if (ret < 0) {
+ close_connection(con, false);
+
+ /* CF_SEND_PENDING cleared */
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ break;
+ }
+
+ /* any other non-negative result is unexpected here */
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+/* Destroy the io and process workqueues if they exist and reset both
+ * pointers to NULL.
+ */
+static void work_stop(void)
+{
+	if (io_workqueue)
+		destroy_workqueue(io_workqueue);
+	io_workqueue = NULL;
+
+	if (process_workqueue)
+		destroy_workqueue(process_workqueue);
+	process_workqueue = NULL;
+}
+
+/* Allocate the "dlm_io" and the ordered "dlm_process" workqueues.
+ *
+ * Returns 0 on success or -ENOMEM; on failure no workqueue is left
+ * allocated by this function.
+ */
+static int work_start(void)
+{
+	io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM |
+				       WQ_UNBOUND, 0);
+	if (!io_workqueue) {
+		log_print("can't start dlm_io");
+		return -ENOMEM;
+	}
+
+	/* ordered dlm message process queue,
+	 * should be converted to a tasklet
+	 */
+	process_workqueue = alloc_ordered_workqueue("dlm_process",
+						    WQ_HIGHPRI | WQ_MEM_RECLAIM);
+	if (process_workqueue)
+		return 0;
+
+	/* undo the first allocation before reporting the failure */
+	log_print("can't start dlm_process");
+	destroy_workqueue(io_workqueue);
+	io_workqueue = NULL;
+	return -ENOMEM;
+}
+
+/* Shut down lowcomms I/O: restore the listen socket's saved data_ready
+ * callback, cancel its receive work and close it, then shut down and close
+ * every connection in connection_hash and clean its write queue(s).
+ */
+void dlm_lowcomms_shutdown(void)
+{
+ struct connection *con;
+ int i, idx;
+
+ /* stop lowcomms_listen_data_ready calls */
+ lock_sock(listen_con.sock->sk);
+ listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
+ release_sock(listen_con.sock->sk);
+
+ cancel_work_sync(&listen_con.rwork);
+ dlm_close_sock(&listen_con.sock);
+
+ /* walk all hash buckets inside an SRCU read-side section */
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ shutdown_connection(con, true);
+ stop_connection_io(con);
+ flush_workqueue(process_workqueue);
+ close_connection(con, true);
+
+ clean_one_writequeue(con);
+ if (con->othercon)
+ clean_one_writequeue(con->othercon);
+ allow_connection_io(con);
+ }
+ }
+ srcu_read_unlock(&connections_srcu, idx);
+}
+
+/* Destroy the lowcomms workqueues and clear the selected protocol ops. */
+void dlm_lowcomms_stop(void)
+{
+ work_stop();
+ dlm_proto_ops = NULL;
+}
+
+/* Create the listen socket for the selected protocol, bind it, install
+ * lowcomms_listen_data_ready as its data_ready callback and start listening.
+ *
+ * Returns 0 on success or a negative error from validation, socket
+ * creation, bind or listen.
+ */
+static int dlm_listen_for_all(void)
+{
+ struct socket *sock;
+ int result;
+
+ log_print("Using %s for communications",
+ dlm_proto_ops->name);
+
+ result = dlm_proto_ops->listen_validate();
+ if (result < 0)
+ return result;
+
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
+ SOCK_STREAM, dlm_proto_ops->proto, &sock);
+ if (result < 0) {
+ log_print("Can't create comms socket: %d", result);
+ return result;
+ }
+
+ sock_set_mark(sock->sk, dlm_config.ci_mark);
+ dlm_proto_ops->listen_sockopts(sock);
+
+ result = dlm_proto_ops->listen_bind(sock);
+ if (result < 0)
+ goto out;
+
+ lock_sock(sock->sk);
+ /* remember the socket's original callbacks in listen_sock */
+ listen_sock.sk_data_ready = sock->sk->sk_data_ready;
+ listen_sock.sk_write_space = sock->sk->sk_write_space;
+ listen_sock.sk_error_report = sock->sk->sk_error_report;
+ listen_sock.sk_state_change = sock->sk->sk_state_change;
+
+ listen_con.sock = sock;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
+ sock->sk->sk_data_ready = lowcomms_listen_data_ready;
+ release_sock(sock->sk);
+
+ result = sock->ops->listen(sock, 128);
+ if (result < 0) {
+ /* the socket is already owned by listen_con, close it through that */
+ dlm_close_sock(&listen_con.sock);
+ return result;
+ }
+
+ return 0;
+
+out:
+ sock_release(sock);
+ return result;
+}
+
+/* Bind an outgoing TCP socket to a copy of dlm_local_addr[0].
+ * A bind failure is only logged; the function always returns 0.
+ */
+static int dlm_tcp_bind(struct socket *sock)
+{
+	struct sockaddr_storage local;
+	int local_len;
+	int rv;
+
+	/* Bind to our cluster-known address connecting to avoid
+	 * routing problems.
+	 */
+	memcpy(&local, &dlm_local_addr[0], sizeof(local));
+	make_sockaddr(&local, 0, &local_len);
+
+	rv = kernel_bind(sock, (struct sockaddr *)&local, local_len);
+
+	/* This *may* not indicate a critical error */
+	if (rv < 0)
+		log_print("could not bind for connect: %d", rv);
+
+	return 0;
+}
+
+/* TCP connect op: start a connect with O_NONBLOCK and return
+ * kernel_connect()'s result. @con is not used.
+ */
+static int dlm_tcp_connect(struct connection *con, struct socket *sock,
+ struct sockaddr *addr, int addr_len)
+{
+ return kernel_connect(sock, addr, addr_len, O_NONBLOCK);
+}
+
+/* TCP listen validation: returns -EINVAL when more than one local address
+ * is configured, 0 otherwise.
+ */
+static int dlm_tcp_listen_validate(void)
+{
+	if (dlm_local_count <= 1)
+		return 0;
+
+	/* We don't support multi-homed hosts */
+	log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
+	return -EINVAL;
+}
+
+/* Socket options applied to TCP sockets (also reused for the listen socket). */
+static void dlm_tcp_sockopts(struct socket *sock)
+{
+ /* Turn off Nagle's algorithm */
+ tcp_sock_set_nodelay(sock->sk);
+}
+
+/* Listen socket options for TCP: the regular TCP options plus reuseaddr. */
+static void dlm_tcp_listen_sockopts(struct socket *sock)
+{
+ dlm_tcp_sockopts(sock);
+ sock_set_reuseaddr(sock->sk);
+}
+
+/* Bind the TCP listen socket to dlm_local_addr[0] after make_sockaddr()
+ * has been applied to it with dlm_config.ci_tcp_port.
+ * Returns kernel_bind()'s result.
+ */
+static int dlm_tcp_listen_bind(struct socket *sock)
+{
+	struct sockaddr_storage *local = &dlm_local_addr[0];
+	int local_len;
+
+	/* Bind to our port */
+	make_sockaddr(local, dlm_config.ci_tcp_port, &local_len);
+	return kernel_bind(sock, (struct sockaddr *)local, local_len);
+}
+
+/* Protocol operations selected by dlm_lowcomms_start() for DLM_PROTO_TCP.
+ * .try_new_addr is not set here, so it is zero-initialized.
+ */
+static const struct dlm_proto_ops dlm_tcp_ops = {
+ .name = "TCP",
+ .proto = IPPROTO_TCP,
+ .connect = dlm_tcp_connect,
+ .sockopts = dlm_tcp_sockopts,
+ .bind = dlm_tcp_bind,
+ .listen_validate = dlm_tcp_listen_validate,
+ .listen_sockopts = dlm_tcp_listen_sockopts,
+ .listen_bind = dlm_tcp_listen_bind,
+};
+
+/* SCTP bind op for outgoing sockets: sctp_bind_addrs() with 0 as its
+ * second argument (the listen variant passes the configured port there).
+ */
+static int dlm_sctp_bind(struct socket *sock)
+{
+ return sctp_bind_addrs(sock, 0);
+}
+
+/* SCTP connect op: connect with a temporary send timeout of 5 set via
+ * sock_set_sndtimeo(), reset to 0 afterwards. Returns kernel_connect()'s
+ * result. @con is not used.
+ */
+static int dlm_sctp_connect(struct connection *con, struct socket *sock,
+			    struct sockaddr *addr, int addr_len)
+{
+	int rv;
+
+	/*
+	 * O_NONBLOCK has no effect for connect() here, so bound the time
+	 * kernel_connect() may take with a send timeout and restore the
+	 * default value of that attribute afterwards.
+	 */
+	sock_set_sndtimeo(sock->sk, 5);
+	rv = kernel_connect(sock, addr, addr_len, 0);
+	sock_set_sndtimeo(sock->sk, 0);
+
+	return rv;
+}
+
+/* SCTP listen validation: returns -EOPNOTSUPP when CONFIG_IP_SCTP is not
+ * enabled; otherwise requests the "sctp" module and returns 0.
+ */
+static int dlm_sctp_listen_validate(void)
+{
+	if (IS_ENABLED(CONFIG_IP_SCTP)) {
+		request_module("sctp");
+		return 0;
+	}
+
+	log_print("SCTP is not enabled by this kernel");
+	return -EOPNOTSUPP;
+}
+
+/* SCTP listen bind op: sctp_bind_addrs() with dlm_config.ci_tcp_port. */
+static int dlm_sctp_bind_listen(struct socket *sock)
+{
+ return sctp_bind_addrs(sock, dlm_config.ci_tcp_port);
+}
+
+/* Socket options for SCTP sockets; dlm_sctp_ops uses this for both
+ * .sockopts and .listen_sockopts.
+ */
+static void dlm_sctp_sockopts(struct socket *sock)
+{
+ /* Turn off Nagle's algorithm */
+ sctp_sock_set_nodelay(sock->sk);
+ /* request a receive buffer of NEEDED_RMEM */
+ sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
+}
+
+/* Protocol operations selected by dlm_lowcomms_start() for DLM_PROTO_SCTP.
+ * .try_new_addr is passed on to nodeid_to_addr() by dlm_connect().
+ */
+static const struct dlm_proto_ops dlm_sctp_ops = {
+ .name = "SCTP",
+ .proto = IPPROTO_SCTP,
+ .try_new_addr = true,
+ .connect = dlm_sctp_connect,
+ .sockopts = dlm_sctp_sockopts,
+ .bind = dlm_sctp_bind,
+ .listen_validate = dlm_sctp_listen_validate,
+ .listen_sockopts = dlm_sctp_sockopts,
+ .listen_bind = dlm_sctp_bind_listen,
+};
+
+/* Start lowcomms: initialise local addresses, create the workqueues,
+ * select the protocol ops from dlm_config.ci_protocol and start listening.
+ *
+ * Returns 0 on success, -ENOTCONN when no local address is set, -EINVAL for
+ * an unknown protocol, or the error of work_start()/dlm_listen_for_all().
+ * On failure the workqueues are destroyed again and dlm_proto_ops is NULL
+ * if listening failed.
+ */
+int dlm_lowcomms_start(void)
+{
+	int rv;
+
+	init_local();
+	if (!dlm_local_count) {
+		log_print("no local IP address has been set");
+		return -ENOTCONN;
+	}
+
+	rv = work_start();
+	if (rv)
+		return rv;
+
+	/* Start listening */
+	switch (dlm_config.ci_protocol) {
+	case DLM_PROTO_TCP:
+		dlm_proto_ops = &dlm_tcp_ops;
+		break;
+	case DLM_PROTO_SCTP:
+		dlm_proto_ops = &dlm_sctp_ops;
+		break;
+	default:
+		log_print("Invalid protocol identifier %d set",
+			  dlm_config.ci_protocol);
+		work_stop();
+		return -EINVAL;
+	}
+
+	rv = dlm_listen_for_all();
+	if (!rv)
+		return 0;
+
+	/* listening failed: forget the ops and tear the workqueues down */
+	dlm_proto_ops = NULL;
+	work_stop();
+	return rv;
+}
+
+/* One-time setup: initialise every connection_hash bucket and the listen
+ * connection's receive work item.
+ */
+void dlm_lowcomms_init(void)
+{
+	int bucket = 0;
+
+	while (bucket < CONN_HASH_SIZE) {
+		INIT_HLIST_HEAD(&connection_hash[bucket]);
+		bucket++;
+	}
+
+	INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+}
+
+/* Unlink every connection from connection_hash and schedule it (and its
+ * othercon, if any) for release through call_srcu().
+ */
+void dlm_lowcomms_exit(void)
+{
+ struct connection *con;
+ int i, idx;
+
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ /* list removal is serialised by connections_lock */
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ if (con->othercon)
+ call_srcu(&connections_srcu, &con->othercon->rcu,
+ connection_release);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ }
+ }
+ srcu_read_unlock(&connections_srcu, idx);
+}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 0000000000..3e8dca6618
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+#include "dlm_internal.h"
+
+#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts)
+#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \
+ DLM_MIDCOMMS_OPT_LEN)
+
+#define CONN_HASH_SIZE 32
+
+/* This is deliberately very simple because most clusters have simple
+ * sequential nodeids, so we should be able to go straight to a connection
+ * struct in the array
+ */
+static inline int nodeid_hash(int nodeid)
+{
+	/* keep only the low bits selected by CONN_HASH_SIZE - 1 */
+	const int mask = CONN_HASH_SIZE - 1;
+
+	return nodeid & mask;
+}
+
+/* check if dlm is running */
+bool dlm_lowcomms_is_running(void);
+
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_shutdown(void);
+void dlm_lowcomms_shutdown_node(int nodeid, bool force);
+void dlm_lowcomms_stop(void);
+void dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+int dlm_lowcomms_close(int nodeid);
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(void *data),
+ void *data);
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg);
+void dlm_lowcomms_put_msg(struct dlm_msg *msg);
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
+int dlm_lowcomms_connect_node(int nodeid);
+int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+void dlm_midcomms_receive_done(int nodeid);
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void);
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void);
+
+#endif /* __LOWCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 0000000000..09052d9671
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 0000000000..6ca28299c9
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/module.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "config.h"
+#include "midcomms.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/dlm.h>
+
+/* Module init: bring up the dlm subsystems in order. When a step fails,
+ * the steps already completed are undone in reverse order via the labels
+ * below and the error is returned.
+ */
+static int __init init_dlm(void)
+{
+ int error;
+
+ error = dlm_memory_init();
+ if (error)
+ goto out;
+
+ dlm_midcomms_init();
+
+ error = dlm_lockspace_init();
+ if (error)
+ goto out_mem;
+
+ error = dlm_config_init();
+ if (error)
+ goto out_lockspace;
+
+ dlm_register_debugfs();
+
+ error = dlm_user_init();
+ if (error)
+ goto out_debug;
+
+ error = dlm_plock_init();
+ if (error)
+ goto out_user;
+
+ printk("DLM installed\n");
+
+ return 0;
+
+ /* unwind: each label undoes the steps that succeeded before the failure */
+ out_user:
+ dlm_user_exit();
+ out_debug:
+ dlm_unregister_debugfs();
+ dlm_config_exit();
+ out_lockspace:
+ dlm_lockspace_exit();
+ out_mem:
+ dlm_midcomms_exit();
+ dlm_memory_exit();
+ out:
+ return error;
+}
+
+/* Module exit: tear down the subsystems set up by init_dlm(). */
+static void __exit exit_dlm(void)
+{
+ dlm_plock_exit();
+ dlm_user_exit();
+ dlm_config_exit();
+ dlm_lockspace_exit();
+ dlm_midcomms_exit();
+ dlm_unregister_debugfs();
+ dlm_memory_exit();
+}
+
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 0000000000..be7909ead7
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+#include "midcomms.h"
+#include "lowcomms.h"
+
+/* Returns 1 when the low 16 bits of the header version are at least
+ * DLM_HEADER_SLOTS, 0 otherwise.
+ */
+int dlm_slots_version(const struct dlm_header *h)
+{
+	uint32_t minor = le32_to_cpu(h->h_version) & 0x0000FFFF;
+
+	return minor >= DLM_HEADER_SLOTS;
+}
+
+/* Copy the sender's slot and generation from the rcom_config in @rc into
+ * @memb, provided the message header passes dlm_slots_version().
+ * @ls is not used.
+ */
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+		   struct dlm_member *memb)
+{
+	struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+
+	if (dlm_slots_version(&rc->rc_header)) {
+		memb->slot = le16_to_cpu(rf->rf_our_slot);
+		memb->generation = le32_to_cpu(rf->rf_generation);
+	}
+}
+
+/* Write the used entries of ls->ls_slots into @rc as a dense array of
+ * little-endian rcom_slot records placed right after the rcom_config.
+ */
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_slot *out;
+	struct dlm_slot *cur;
+	int idx;
+
+	out = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	/* ls_slots array is sparse, but not rcom_slots */
+
+	for (idx = 0; idx < ls->ls_slots_size; idx++) {
+		cur = &ls->ls_slots[idx];
+		if (cur->nodeid) {
+			out->ro_nodeid = cpu_to_le32(cur->nodeid);
+			out->ro_slot = cpu_to_le16(cur->slot);
+			out++;
+		}
+	}
+}
+
+#define SLOT_DEBUG_LINE 128
+
+/* Log the generation, the slot count and a list of " slot:nodeid" pairs.
+ *
+ * The pairs come from @array (entries with nodeid 0 are skipped) when it is
+ * non-NULL, otherwise from the first @num_slots entries of @ro0. Building
+ * the list stops once an entry no longer fits into the line buffer.
+ */
+static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+		      struct rcom_slot *ro0, struct dlm_slot *array,
+		      int array_size)
+{
+	char line[SLOT_DEBUG_LINE];
+	int len = SLOT_DEBUG_LINE - 1;
+	int pos = 0;
+	int ret, i;
+
+	memset(line, 0, sizeof(line));
+
+	if (array) {
+		for (i = 0; i < array_size; i++) {
+			if (!array[i].nodeid)
+				continue;
+
+			ret = snprintf(line + pos, len - pos, " %d:%d",
+				       array[i].slot, array[i].nodeid);
+			if (ret >= len - pos)
+				break;
+			pos += ret;
+		}
+	} else if (ro0) {
+		for (i = 0; i < num_slots; i++) {
+			/* rcom_slot fields are little-endian (see
+			 * dlm_slots_copy_out()); convert before printing so
+			 * the log is correct on big-endian hosts too
+			 */
+			ret = snprintf(line + pos, len - pos, " %d:%d",
+				       le16_to_cpu(ro0[i].ro_slot),
+				       le32_to_cpu(ro0[i].ro_nodeid));
+			if (ret >= len - pos)
+				break;
+			pos += ret;
+		}
+	}
+
+	log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
+}
+
+/* Take the slot table found in ls->ls_recover_buf and apply it to the
+ * members in ls->ls_nodes, also updating ls->ls_generation and, if not yet
+ * set, ls->ls_slot.
+ *
+ * Returns 0 on success and -1 when the message has no slot support, carries
+ * no or too many slots, would change our own slot, or leaves a member
+ * without a slot.
+ */
+int dlm_slots_copy_in(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+	struct rcom_slot *ro0, *ro;
+	int our_nodeid = dlm_our_nodeid();
+	int i, num_slots, max_slots;
+	uint32_t gen;
+
+	if (!dlm_slots_version(&rc->rc_header))
+		return -1;
+
+	gen = le32_to_cpu(rf->rf_generation);
+	if (gen <= ls->ls_generation) {
+		log_error(ls, "dlm_slots_copy_in gen %u old %u",
+			  gen, ls->ls_generation);
+	}
+	ls->ls_generation = gen;
+
+	num_slots = le16_to_cpu(rf->rf_num_slots);
+	if (!num_slots)
+		return -1;
+
+	/* num_slots is taken from the message and bounds the loops below;
+	 * reject counts above the limit dlm_slots_assign() enforces when
+	 * the table is built, so we never walk past what can be sent
+	 */
+	max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) -
+		     sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+	if (num_slots > max_slots) {
+		log_error(ls, "dlm_slots_copy_in num_slots %d exceeds max_slots %d",
+			  num_slots, max_slots);
+		return -1;
+	}
+
+	ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	log_slots(ls, gen, num_slots, ro0, NULL, 0);
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		/* look up this member's entry in the received table */
+		for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+			if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid)
+				continue;
+			memb->slot = le16_to_cpu(ro->ro_slot);
+			memb->slot_prev = memb->slot;
+			break;
+		}
+
+		if (memb->nodeid == our_nodeid) {
+			if (ls->ls_slot && ls->ls_slot != memb->slot) {
+				log_error(ls, "dlm_slots_copy_in our slot "
+					  "changed %d %d", ls->ls_slot,
+					  memb->slot);
+				return -1;
+			}
+
+			if (!ls->ls_slot)
+				ls->ls_slot = memb->slot;
+		}
+
+		if (!memb->slot) {
+			log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
+				  memb->nodeid);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* for any nodes that do not support slots, we will not have set memb->slot
+ in wait_status_all(), so memb->slot will remain -1, and we will not
+ assign slots or set ls_num_slots here
+
+ On success returns 0 and hands back, through the out parameters, a newly
+ allocated slot array (the caller then owns it), its size, the number of
+ used slots and the new generation. Returns -ENOMEM if the array cannot
+ be allocated and -1 for every other failure; the array is freed on those
+ paths. */
+
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+ struct dlm_slot **slots_out, uint32_t *gen_out)
+{
+ struct dlm_member *memb;
+ struct dlm_slot *array;
+ int our_nodeid = dlm_our_nodeid();
+ int array_size, max_slots, i;
+ int need = 0;
+ int max = 0;
+ int num = 0;
+ uint32_t gen = 0;
+
+ /* our own memb struct will have slot -1 gen 0 */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->nodeid == our_nodeid) {
+ memb->slot = ls->ls_slot;
+ memb->generation = ls->ls_generation;
+ break;
+ }
+ }
+
+ /* first pass: find the highest generation and slot in use and count
+ * the members that still need a slot
+ */
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->generation > gen)
+ gen = memb->generation;
+
+ /* node doesn't support slots */
+
+ if (memb->slot == -1)
+ return -1;
+
+ /* node needs a slot assigned */
+
+ if (!memb->slot)
+ need++;
+
+ /* counts every member; num is reset before the fill pass below */
+
+ num++;
+
+ if (!max || max < memb->slot)
+ max = memb->slot;
+
+ /* sanity check, once slot is assigned it shouldn't change */
+
+ if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
+ log_error(ls, "nodeid %d slot changed %d %d",
+ memb->nodeid, memb->slot_prev, memb->slot);
+ return -1;
+ }
+ memb->slot_prev = memb->slot;
+ }
+
+ /* room for the highest existing slot plus one entry per new slot */
+ array_size = max + need;
+ array = kcalloc(array_size, sizeof(*array), GFP_NOFS);
+ if (!array)
+ return -ENOMEM;
+
+ num = 0;
+
+ /* fill in slots (offsets) that are used */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (!memb->slot)
+ continue;
+
+ if (memb->slot > array_size) {
+ log_error(ls, "invalid slot number %d", memb->slot);
+ kfree(array);
+ return -1;
+ }
+
+ /* slot numbers are 1-based, array offsets 0-based */
+ array[memb->slot - 1].nodeid = memb->nodeid;
+ array[memb->slot - 1].slot = memb->slot;
+ num++;
+ }
+
+ /* assign new slots from unused offsets */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->slot)
+ continue;
+
+ for (i = 0; i < array_size; i++) {
+ if (array[i].nodeid)
+ continue;
+
+ memb->slot = i + 1;
+ memb->slot_prev = memb->slot;
+ array[i].nodeid = memb->nodeid;
+ array[i].slot = memb->slot;
+ num++;
+
+ if (!ls->ls_slot && memb->nodeid == our_nodeid)
+ ls->ls_slot = memb->slot;
+ break;
+ }
+
+ if (!memb->slot) {
+ log_error(ls, "no free slot found");
+ kfree(array);
+ return -1;
+ }
+ }
+
+ /* the new table gets the next generation number */
+ gen++;
+
+ log_slots(ls, gen, num, NULL, array, array_size);
+
+ /* number of rcom_slot records that fit after the rcom and config headers */
+ max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) -
+ sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+
+ if (num > max_slots) {
+ log_error(ls, "num_slots %d exceeds max_slots %d",
+ num, max_slots);
+ kfree(array);
+ return -1;
+ }
+
+ *gen_out = gen;
+ *slots_out = array;
+ *slots_size = array_size;
+ *num_slots = num;
+ return 0;
+}
+
+/* Insert @new into ls->ls_nodes in front of the first member whose nodeid
+ * is larger than new->nodeid, or at the tail when there is no such member.
+ */
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+	struct list_head *head = &ls->ls_nodes;
+	struct dlm_member *memb;
+	struct list_head *tmp;
+
+	/* stop at the first member with a larger nodeid; when none exists
+	 * (or the list is empty) the cursor ends up back at the list head
+	 */
+	list_for_each(tmp, head) {
+		memb = list_entry(tmp, struct dlm_member, list);
+		if (new->nodeid < memb->nodeid)
+			break;
+	}
+
+	/* list_add_tail() links the new entry directly in front of tmp,
+	 * which is the larger member or, with tmp == head, the list tail
+	 */
+	list_add_tail(&new->list, tmp);
+}
+
+/* For a nodeid other than our own, connect to the node and register it
+ * with midcomms. Returns 0, or the negative error of the connect step.
+ */
+static int add_remote_member(int nodeid)
+{
+	int rv;
+
+	if (nodeid != dlm_our_nodeid()) {
+		rv = dlm_lowcomms_connect_node(nodeid);
+		if (rv < 0)
+			return rv;
+
+		dlm_midcomms_add_member(nodeid);
+	}
+
+	return 0;
+}
+
+/* Allocate a member for @node, set up comms to it and insert it into
+ * ls->ls_nodes, incrementing ls->ls_num_nodes.
+ * Returns 0, -ENOMEM, or the error from add_remote_member().
+ */
+static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
+{
+	struct dlm_member *memb;
+	int rv;
+
+	memb = kzalloc(sizeof(*memb), GFP_NOFS);
+	if (!memb)
+		return -ENOMEM;
+
+	memb->nodeid = node->nodeid;
+	memb->weight = node->weight;
+	memb->comm_seq = node->comm_seq;
+
+	rv = add_remote_member(node->nodeid);
+	if (rv >= 0) {
+		add_ordered_member(ls, memb);
+		ls->ls_num_nodes++;
+		return 0;
+	}
+
+	kfree(memb);
+	return rv;
+}
+
+/* Return the first member on @head with the given nodeid, or NULL. */
+static struct dlm_member *find_memb(struct list_head *head, int nodeid)
+{
+	struct dlm_member *found = NULL;
+	struct dlm_member *memb;
+
+	list_for_each_entry(memb, head, list) {
+		if (memb->nodeid != nodeid)
+			continue;
+
+		found = memb;
+		break;
+	}
+
+	return found;
+}
+
+/* Returns 1 when @nodeid is on ls->ls_nodes, 0 otherwise. */
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+	return find_memb(&ls->ls_nodes, nodeid) ? 1 : 0;
+}
+
+/* Returns 1 when @nodeid is on ls->ls_nodes_gone, 0 otherwise. */
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+	return find_memb(&ls->ls_nodes_gone, nodeid) ? 1 : 0;
+}
+
+/* Unlink and free every member on @head. If @after_del is given it is
+ * called with the member's nodeid after unlinking and before freeing.
+ */
+static void clear_memb_list(struct list_head *head,
+			    void (*after_del)(int nodeid))
+{
+	struct dlm_member *first;
+
+	while (!list_empty(head)) {
+		first = list_first_entry(head, struct dlm_member, list);
+		list_del(&first->list);
+
+		if (after_del)
+			after_del(first->nodeid);
+
+		kfree(first);
+	}
+}
+
+/* Remove @nodeid from midcomms unless it is our own nodeid. */
+static void remove_remote_member(int nodeid)
+{
+	if (nodeid != dlm_our_nodeid())
+		dlm_midcomms_remove_member(nodeid);
+}
+
+/* Free all members on ls->ls_nodes, calling remove_remote_member() for
+ * each one, and reset the node count.
+ */
+void dlm_clear_members(struct dlm_ls *ls)
+{
+ clear_memb_list(&ls->ls_nodes, remove_remote_member);
+ ls->ls_num_nodes = 0;
+}
+
+/* Free all members on ls->ls_nodes_gone; no per-member callback is used. */
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+ clear_memb_list(&ls->ls_nodes_gone, NULL);
+}
+
+/* Rebuild ls->ls_node_array: an array in which every member's nodeid
+ * appears once per unit of its weight (once per member when all weights
+ * are zero). ls->ls_total_weight is set to the array length. If the
+ * allocation fails, ls->ls_node_array is left NULL.
+ */
+static void make_member_array(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int *array;
+	int all_zero = 0;
+	int total = 0;
+	int x = 0;
+	int i, w;
+
+	kfree(ls->ls_node_array);
+	ls->ls_node_array = NULL;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list)
+		total += memb->weight;
+
+	/* all nodes revert to weight of 1 if all have weight 0 */
+
+	if (!total) {
+		total = ls->ls_num_nodes;
+		all_zero = 1;
+	}
+
+	ls->ls_total_weight = total;
+	array = kmalloc_array(total, sizeof(*array), GFP_NOFS);
+	if (!array)
+		return;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		w = all_zero ? 1 : memb->weight;
+
+		/* members with zero weight get no entries */
+		if (!w)
+			continue;
+
+		DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+		for (i = 0; i < w; i++)
+			array[x++] = memb->nodeid;
+	}
+
+	ls->ls_node_array = array;
+}
+
+/* send a status request to all members just to establish comms connections */
+
+/* Call dlm_rcom_status() for each member in turn. Stops at the first
+ * error, or with -EINTR once recovery has been stopped; a non-zero result
+ * is logged. Returns 0 or that error.
+ */
+static int ping_members(struct dlm_ls *ls, uint64_t seq)
+{
+	struct dlm_member *memb;
+	int rv = 0;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (dlm_recovery_stopped(ls))
+			rv = -EINTR;
+		else
+			rv = dlm_rcom_status(ls, memb->nodeid, 0, seq);
+
+		if (rv)
+			break;
+	}
+
+	if (!rv)
+		return 0;
+
+	log_rinfo(ls, "ping_members aborted %d last nodeid %d",
+		  rv, ls->ls_recover_nodeid);
+	return rv;
+}
+
+/* Invoke the lockspace's recover_prep callback, if one is registered. */
+static void dlm_lsop_recover_prep(struct dlm_ls *ls)
+{
+	if (ls->ls_ops && ls->ls_ops->recover_prep)
+		ls->ls_ops->recover_prep(ls->ls_ops_arg);
+}
+
+/* Report @memb through the lockspace's recover_slot callback, unless the
+ * callback is missing or the comms sequence for the node is unchanged.
+ */
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+{
+	struct dlm_slot failed;
+	uint32_t cur_seq;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_slot)
+		return;
+
+	/* if there is no comms connection with this node
+	   or the present comms connection is newer
+	   than the one when this member was added, then
+	   we consider the node to have failed (versus
+	   being removed due to dlm_release_lockspace) */
+
+	if (!dlm_comm_seq(memb->nodeid, &cur_seq) &&
+	    cur_seq == memb->comm_seq)
+		return;
+
+	failed.nodeid = memb->nodeid;
+	failed.slot = memb->slot;
+
+	ls->ls_ops->recover_slot(ls->ls_ops_arg, &failed);
+}
+
+/* Build a temporary array with the nodeid and slot of every current
+ * member and pass it to the lockspace's recover_done callback together
+ * with our slot and the generation. Does nothing when the callback is
+ * missing or the array cannot be allocated.
+ */
+void dlm_lsop_recover_done(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *slots;
+	int filled = 0;
+	int count;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_done)
+		return;
+
+	count = ls->ls_num_nodes;
+	slots = kcalloc(count, sizeof(*slots), GFP_KERNEL);
+	if (!slots)
+		return;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		/* more list entries than ls_num_nodes: give up without calling back */
+		if (filled == count) {
+			log_error(ls, "dlm_lsop_recover_done bad num %d", count);
+			kfree(slots);
+			return;
+		}
+
+		slots[filled].nodeid = memb->nodeid;
+		slots[filled].slot = memb->slot;
+		filled++;
+	}
+
+	ls->ls_ops->recover_done(ls->ls_ops_arg, slots, count,
+				 ls->ls_slot, ls->ls_generation);
+	kfree(slots);
+}
+
+/* Return the entry of rv->nodes with the given nodeid, or NULL. */
+static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
+						 int nodeid)
+{
+	struct dlm_config_node *node = rv->nodes;
+	int left;
+
+	for (left = rv->nodes_count; left > 0; left--, node++) {
+		if (node->nodeid == nodeid)
+			return node;
+	}
+
+	return NULL;
+}
+
+/* Bring the member lists of @ls in line with the node configuration in @rv:
+ * departed (or removed and re-added) members move to ls_nodes_gone, new
+ * nodes are added, ls_low_nodeid and the node array are recomputed and all
+ * members are pinged.
+ *
+ * *neg_out receives the number of removed members, including ones still on
+ * ls_nodes_gone from before. Returns the result of ping_members(), or the
+ * error from dlm_add_member() (in which case *neg_out is not written).
+ */
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+ struct dlm_member *memb, *safe;
+ struct dlm_config_node *node;
+ int i, error, neg = 0, low = -1;
+
+ /* previously removed members that we've not finished removing need to
+ * count as a negative change so the "neg" recovery steps will happen
+ *
+ * This functionality must report all member changes to lsops or
+ * midcomms layer and must never return before.
+ */
+
+ list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+ log_rinfo(ls, "prev removed member %d", memb->nodeid);
+ neg++;
+ }
+
+ /* move departed members from ls_nodes to ls_nodes_gone */
+
+ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+ node = find_config_node(rv, memb->nodeid);
+ /* still configured and not flagged as new: the member stays */
+ if (node && !node->new)
+ continue;
+
+ if (!node) {
+ log_rinfo(ls, "remove member %d", memb->nodeid);
+ } else {
+ /* removed and re-added */
+ log_rinfo(ls, "remove member %d comm_seq %u %u",
+ memb->nodeid, memb->comm_seq, node->comm_seq);
+ }
+
+ neg++;
+ list_move(&memb->list, &ls->ls_nodes_gone);
+ remove_remote_member(memb->nodeid);
+ ls->ls_num_nodes--;
+ dlm_lsop_recover_slot(ls, memb);
+ }
+
+ /* add new members to ls_nodes */
+
+ for (i = 0; i < rv->nodes_count; i++) {
+ node = &rv->nodes[i];
+ if (dlm_is_member(ls, node->nodeid))
+ continue;
+ error = dlm_add_member(ls, node);
+ if (error)
+ return error;
+
+ log_rinfo(ls, "add member %d", node->nodeid);
+ }
+
+ /* find the lowest nodeid among the current members (-1 if none) */
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (low == -1 || memb->nodeid < low)
+ low = memb->nodeid;
+ }
+ ls->ls_low_nodeid = low;
+
+ make_member_array(ls);
+ *neg_out = neg;
+
+ error = ping_members(ls, rv->seq);
+ log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
+ return error;
+}
+
+/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
+ dlm_ls_start() is called on any of them to start the new recovery. */
+
+/* Stop the lockspace: set LSFL_RECOVER_STOP, clear LSFL_RUNNING, bump the
+ * recovery sequence and reset the slot state. When this call is the one
+ * that cleared LSFL_RUNNING it also waits for LSFL_RECOVER_LOCK and runs
+ * the recover_prep callback. Always returns 0.
+ */
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+ int new;
+
+ /*
+ * Prevent dlm_recv from being in the middle of something when we do
+ * the stop. This includes ensuring dlm_recv isn't processing a
+ * recovery message (rcom), while dlm_recoverd is aborting and
+ * resetting things from an in-progress recovery. i.e. we want
+ * dlm_recoverd to abort its recovery without worrying about dlm_recv
+ * processing an rcom at the same time. Stopping dlm_recv also makes
+ * it easy for dlm_receive_message() to check locking stopped and add a
+ * message to the requestqueue without races.
+ */
+
+ down_write(&ls->ls_recv_active);
+
+ /*
+ * Abort any recovery that's in progress (see RECOVER_STOP,
+ * dlm_recovery_stopped()) and tell any other threads running in the
+ * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
+ */
+
+ spin_lock(&ls->ls_recover_lock);
+ set_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+ /* new is non-zero only when LSFL_RUNNING was set before this call */
+ new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+ ls->ls_recover_seq++;
+ spin_unlock(&ls->ls_recover_lock);
+
+ /*
+ * Let dlm_recv run again, now any normal messages will be saved on the
+ * requestqueue for later.
+ */
+
+ up_write(&ls->ls_recv_active);
+
+ /*
+ * This in_recovery lock does two things:
+ * 1) Keeps this function from returning until all threads are out
+ * of locking routines and locking is truly stopped.
+ * 2) Keeps any new requests from being processed until it's unlocked
+ * when recovery is complete.
+ */
+
+ if (new) {
+ set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags);
+ wake_up_process(ls->ls_recoverd_task);
+ wait_event(ls->ls_recover_lock_wait,
+ test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+ }
+
+ /*
+ * The recoverd suspend/resume makes sure that dlm_recoverd (if
+ * running) has noticed RECOVER_STOP above and quit processing the
+ * previous recovery.
+ */
+
+ dlm_recoverd_suspend(ls);
+
+ /* drop the slot table and recovery status of the previous recovery */
+ spin_lock(&ls->ls_recover_lock);
+ kfree(ls->ls_slots);
+ ls->ls_slots = NULL;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
+ ls->ls_recover_status = 0;
+ spin_unlock(&ls->ls_recover_lock);
+
+ dlm_recoverd_resume(ls);
+
+ if (!ls->ls_recover_begin)
+ ls->ls_recover_begin = jiffies;
+
+ /* call recover_prep ops only once and not multiple times
+ * for each possible dlm_ls_stop() when recovery is already
+ * stopped.
+ *
+ * If we successfully cleared the LSFL_RUNNING bit and it was
+ * set, we know this is the first dlm_ls_stop() call.
+ */
+ if (new)
+ dlm_lsop_recover_prep(ls);
+
+ return 0;
+}
+
+/* Queue a new recovery for a stopped lockspace: fetch the configured node
+ * list, publish it with a new sequence number in ls->ls_recover_args and
+ * wake the recoverd task.
+ *
+ * Returns 0 on success, -ENOMEM, the error from dlm_config_nodes(), or
+ * -EINVAL when the lockspace has not been stopped.
+ */
+int dlm_ls_start(struct dlm_ls *ls)
+{
+ struct dlm_recover *rv, *rv_old;
+ struct dlm_config_node *nodes = NULL;
+ int error, count;
+
+ rv = kzalloc(sizeof(*rv), GFP_NOFS);
+ if (!rv)
+ return -ENOMEM;
+
+ error = dlm_config_nodes(ls->ls_name, &nodes, &count);
+ if (error < 0)
+ goto fail_rv;
+
+ spin_lock(&ls->ls_recover_lock);
+
+ /* the lockspace needs to be stopped before it can be started */
+
+ if (!dlm_locking_stopped(ls)) {
+ spin_unlock(&ls->ls_recover_lock);
+ log_error(ls, "start ignored: lockspace running");
+ error = -EINVAL;
+ goto fail;
+ }
+
+ rv->nodes = nodes;
+ rv->nodes_count = count;
+ rv->seq = ++ls->ls_recover_seq;
+ /* swap in the new args; a previous set that is still there is freed below */
+ rv_old = ls->ls_recover_args;
+ ls->ls_recover_args = rv;
+ spin_unlock(&ls->ls_recover_lock);
+
+ if (rv_old) {
+ log_error(ls, "unused recovery %llx %d",
+ (unsigned long long)rv_old->seq, rv_old->nodes_count);
+ kfree(rv_old->nodes);
+ kfree(rv_old);
+ }
+
+ set_bit(LSFL_RECOVER_WORK, &ls->ls_flags);
+ wake_up_process(ls->ls_recoverd_task);
+ return 0;
+
+ fail:
+ kfree(nodes);
+ fail_rv:
+ kfree(rv);
+ return error;
+}
+
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 0000000000..f61cfde463
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+int dlm_ls_stop(struct dlm_ls *ls);
+/* returns -EINVAL unless the lockspace has been stopped first */
+int dlm_ls_start(struct dlm_ls *ls);
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
+int dlm_slots_version(const struct dlm_header *h);
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+ struct dlm_member *memb);
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_slots_copy_in(struct dlm_ls *ls);
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+ struct dlm_slot **slots_out, uint32_t *gen_out);
+void dlm_lsop_recover_done(struct dlm_ls *ls);
+
+#endif /* __MEMBER_DOT_H__ */
+
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 0000000000..64f212a066
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "midcomms.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "ast.h"
+
+/* slab caches behind the dlm_allocate_*()/dlm_free_*() helpers in this
+ * file; all of them are created in dlm_memory_init() and destroyed in
+ * dlm_memory_exit()
+ */
+static struct kmem_cache *writequeue_cache;
+static struct kmem_cache *mhandle_cache;
+static struct kmem_cache *msg_cache;
+static struct kmem_cache *lkb_cache;
+static struct kmem_cache *rsb_cache;
+static struct kmem_cache *cb_cache;
+
+
+/*
+ * dlm_memory_init() - create all slab caches used by this file.
+ *
+ * Returns 0 on success or -ENOMEM as soon as one cache cannot be created;
+ * in that case every cache created before the failing one is destroyed
+ * again.
+ *
+ * Each error label is named after the cache whose creation FAILED, so the
+ * statement directly below a label destroys the cache created just before
+ * it and then falls through the rest of the teardown.
+ */
+int __init dlm_memory_init(void)
+{
+ writequeue_cache = dlm_lowcomms_writequeue_cache_create();
+ if (!writequeue_cache)
+ goto out;
+
+ mhandle_cache = dlm_midcomms_cache_create();
+ if (!mhandle_cache)
+ goto mhandle;
+
+ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+ __alignof__(struct dlm_lkb), 0, NULL);
+ if (!lkb_cache)
+ goto lkb;
+
+ msg_cache = dlm_lowcomms_msg_cache_create();
+ if (!msg_cache)
+ goto msg;
+
+ rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
+ __alignof__(struct dlm_rsb), 0, NULL);
+ if (!rsb_cache)
+ goto rsb;
+
+ cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+ __alignof__(struct dlm_callback), 0,
+ NULL);
+ if (!cb_cache)
+ goto cb;
+
+ return 0;
+
+ /* teardown in reverse order of creation */
+cb:
+ kmem_cache_destroy(rsb_cache);
+rsb:
+ kmem_cache_destroy(msg_cache);
+msg:
+ kmem_cache_destroy(lkb_cache);
+lkb:
+ kmem_cache_destroy(mhandle_cache);
+mhandle:
+ kmem_cache_destroy(writequeue_cache);
+out:
+ return -ENOMEM;
+}
+
+/* Destroy every slab cache created by dlm_memory_init(). */
+void dlm_memory_exit(void)
+{
+ kmem_cache_destroy(writequeue_cache);
+ kmem_cache_destroy(mhandle_cache);
+ kmem_cache_destroy(msg_cache);
+ kmem_cache_destroy(lkb_cache);
+ kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(cb_cache);
+}
+
+/*
+ * Allocate a zero-filled buffer of ls->ls_lvblen bytes with GFP_NOFS.
+ * The result of kzalloc() is returned as is; release it with
+ * dlm_free_lvb().
+ */
+char *dlm_allocate_lvb(struct dlm_ls *ls)
+{
+	return kzalloc(ls->ls_lvblen, GFP_NOFS);
+}
+
+/* Release a buffer obtained from dlm_allocate_lvb(); plain kfree(). */
+void dlm_free_lvb(char *p)
+{
+ kfree(p);
+}
+
+/*
+ * Take a zero-filled struct dlm_rsb from rsb_cache (GFP_NOFS).
+ * @ls is not used by the allocation itself.
+ */
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls)
+{
+	return kmem_cache_zalloc(rsb_cache, GFP_NOFS);
+}
+
+/*
+ * Return @r to rsb_cache. A buffer still attached as r->res_lvbptr is
+ * released first.
+ */
+void dlm_free_rsb(struct dlm_rsb *r)
+{
+ if (r->res_lvbptr)
+ dlm_free_lvb(r->res_lvbptr);
+ kmem_cache_free(rsb_cache, r);
+}
+
+/*
+ * Take a zero-filled struct dlm_lkb from lkb_cache (GFP_NOFS).
+ * @ls is not used by the allocation itself.
+ */
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
+{
+	return kmem_cache_zalloc(lkb_cache, GFP_NOFS);
+}
+
+/*
+ * Return @lkb to lkb_cache.
+ *
+ * For an lkb that has DLM_DFL_USER_BIT set, the attached dlm_user_args
+ * (if any) and the sb_lvbptr buffer it points to are freed as well. The
+ * last-cast/last-callback pointers are cleared so that references held
+ * through them are dropped before the lkb goes away.
+ */
+void dlm_free_lkb(struct dlm_lkb *lkb)
+{
+	struct dlm_user_args *user_args = NULL;
+
+	/* only user locks own a dlm_user_args */
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags))
+		user_args = lkb->lkb_ua;
+
+	if (user_args) {
+		kfree(user_args->lksb.sb_lvbptr);
+		kfree(user_args);
+	}
+
+	/* drop references if they are set */
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
+	kmem_cache_free(lkb_cache, lkb);
+}
+
+/* Take a struct dlm_mhandle from mhandle_cache using the caller's gfp
+ * flags; the object is NOT zeroed.
+ */
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
+{
+ return kmem_cache_alloc(mhandle_cache, allocation);
+}
+
+/* Return @mhandle to mhandle_cache. */
+void dlm_free_mhandle(struct dlm_mhandle *mhandle)
+{
+ kmem_cache_free(mhandle_cache, mhandle);
+}
+
+/* Take a struct writequeue_entry from writequeue_cache with GFP_ATOMIC;
+ * the object is NOT zeroed.
+ */
+struct writequeue_entry *dlm_allocate_writequeue(void)
+{
+ return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC);
+}
+
+/* Return @writequeue to writequeue_cache. */
+void dlm_free_writequeue(struct writequeue_entry *writequeue)
+{
+ kmem_cache_free(writequeue_cache, writequeue);
+}
+
+/* Take a struct dlm_msg from msg_cache using the caller's gfp flags; the
+ * object is NOT zeroed.
+ */
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation)
+{
+ return kmem_cache_alloc(msg_cache, allocation);
+}
+
+/* Return @msg to msg_cache. */
+void dlm_free_msg(struct dlm_msg *msg)
+{
+ kmem_cache_free(msg_cache, msg);
+}
+
+/* Take a struct dlm_callback from cb_cache with GFP_ATOMIC; the object is
+ * NOT zeroed.
+ */
+struct dlm_callback *dlm_allocate_cb(void)
+{
+ return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+/* Return @cb to cb_cache. */
+void dlm_free_cb(struct dlm_callback *cb)
+{
+ kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 0000000000..6b29563d24
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+/* create/destroy all slab caches; dlm_memory_init() returns 0 or -ENOMEM */
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+/* rsb and lkb objects are returned zero-filled (GFP_NOFS); the free
+ * helpers also release res_lvbptr resp. the user args of a user lkb
+ */
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls);
+void dlm_free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
+void dlm_free_lkb(struct dlm_lkb *l);
+/* zero-filled buffer of ls->ls_lvblen bytes */
+char *dlm_allocate_lvb(struct dlm_ls *ls);
+void dlm_free_lvb(char *l);
+/* the objects below are NOT zeroed on allocation */
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
+void dlm_free_mhandle(struct dlm_mhandle *mhandle);
+struct writequeue_entry *dlm_allocate_writequeue(void);
+void dlm_free_writequeue(struct writequeue_entry *writequeue);
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
+void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
+
+#endif /* __MEMORY_DOT_H__ */
+
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 0000000000..2247ebb61b
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,1514 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer. It takes care of
+ * delivering "reliable" communication at the application layer on top of
+ * the lowcomms transport layer in use.
+ *
+ * How it works:
+ *
+ * Each node keeps track of all sent DLM messages in send_queue with a sequence
+ * number. The receiver will send a DLM_ACK message back for every DLM message
+ * received at the other side. If a reconnect happens in lowcomms we will send
+ * all unacknowledged dlm messages again. The receiving side might drop any already
+ * received message by comparing sequence numbers.
+ *
+ * How version detection works:
+ *
+ * Due to the fact that dlm has pre-configured node addresses on every side,
+ * it is in its nature that every side connects at start to transmit
+ * dlm messages, which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS
+ * and their replies are the first messages which are exchanged. Due to backwards
+ * compatibility these messages are not covered by the midcomms re-transmission
+ * layer. These messages have their own re-transmission handling in the dlm
+ * application layer. The version field of every node will be set on these RCOM
+ * messages as soon as they arrive and the node isn't yet part of the nodes
+ * hash. There also exists logic to detect version mismatches if something weird
+ * is going on or the first message isn't an expected one.
+ *
+ * Termination:
+ *
+ * The midcomms layer does a 4 way handshake for termination on DLM protocol
+ * like TCP supports it with half-closed socket support. SCTP doesn't support
+ * half-closed sockets, so we do it on the DLM layer. Also socket shutdown() can
+ * be interrupted by e.g. a tcp reset itself. Additionally there exists the
+ * othercon paradigm in lowcomms which cannot be easily removed without breaking
+ * backwards compatibility. A node cannot send anything to another node when a
+ * DLM_FIN message was sent. There exists additional logic to print a warning if
+ * DLM wants to do it. There exists a state handling like RFC 793 but reduced
+ * to termination only. The event "member removal event" describes that the
+ * cluster manager removed the node from internal lists; at this point DLM does
+ * not send any message to the other node. There exist two cases:
+ *
+ * 1. The cluster member was removed and we received a FIN
+ * OR
+ * 2. We received a FIN but the member was not removed yet
+ *
+ * One of these cases will do the CLOSE_WAIT to LAST_ACK change.
+ *
+ *
+ * +---------+
+ * | CLOSED |
+ * +---------+
+ * | add member/receive RCOM version
+ * | detection msg
+ * V
+ * +---------+
+ * | ESTAB |
+ * +---------+
+ * CLOSE | | rcv FIN
+ * ------- | | -------
+ * +---------+ snd FIN / \ snd ACK +---------+
+ * | FIN |<----------------- ------------------>| CLOSE |
+ * | WAIT-1 |------------------ | WAIT |
+ * +---------+ rcv FIN \ +---------+
+ * | rcv ACK of FIN ------- | CLOSE | member
+ * | -------------- snd ACK | ------- | removal
+ * V x V snd FIN V event
+ * +---------+ +---------+ +---------+
+ * |FINWAIT-2| | CLOSING | | LAST-ACK|
+ * +---------+ +---------+ +---------+
+ * | rcv ACK of FIN | rcv ACK of FIN |
+ * | rcv FIN -------------- | -------------- |
+ * | ------- x V x V
+ * \ snd ACK +---------+ +---------+
+ * ------------------------>| CLOSED | | CLOSED |
+ * +---------+ +---------+
+ *
+ * NOTE: any state can be interrupted by midcomms_close() and the state will be
+ * switched to CLOSED in case of fencing. There also exists some timeout
+ * handling when we receive the version detection RCOM messages, which is
+ * based on observation.
+ *
+ * Future improvements:
+ *
+ * There exists some known issues/improvements of the dlm handling. Some
+ * of them should be done in a next major dlm version bump which makes
+ * it incompatible with previous versions.
+ *
+ * Unaligned memory access:
+ *
+ * There exist cases when the dlm message buffer length is not aligned
+ * to 8 bytes. However it seems nobody has detected any problem with it. This
+ * can be fixed in the next major version bump of dlm.
+ *
+ * Version detection:
+ *
+ * The version detection and how it's done is related to backwards
+ * compatibility. There exists better ways to make a better handling.
+ * However this should be changed in the next major version bump of dlm.
+ *
+ * Tail Size checking:
+ *
+ * There exists a message tail payload in e.g. DLM_MSG, however we don't
+ * check it against the message length yet with regard to the receive buffer
+ * length. That needs to be validated.
+ *
+ * Fencing bad nodes:
+ *
+ * At timeout places or weird sequence number behaviours we should send
+ * a fencing request to the cluster manager.
+ */
+
+/* Debug switch to enable a 5 seconds sleep waiting of a termination.
+ * This can be useful to test fencing while termination is running.
+ * This requires a setup with only gfs2 as dlm user, so that the
+ * last umount will terminate the connection.
+ *
+ * However it became useful for testing: during the 5 seconds block in umount
+ * just press the reset button. When a lot is being dropped the termination
+ * process could take several seconds.
+ */
+#define DLM_DEBUG_FENCE_TERMINATION 0
+
+#include <trace/events/dlm.h>
+#include <net/tcp.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+#include "midcomms.h"
+
+/* init value for sequence numbers for testing purpose only e.g. overflows */
+#define DLM_SEQ_INIT 0
+/* 5 seconds wait to sync ending of dlm */
+#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000)
+/* node->version value while the protocol version of the peer is unknown */
+#define DLM_VERSION_NOT_SET 0
+#define DLM_SEND_ACK_BACK_MSG_THRESHOLD 32
+/* once more than this many messages were delivered without the counter
+ * being reset, the receive path sends an ack back on its own
+ */
+#define DLM_RECV_ACK_BACK_MSG_THRESHOLD (DLM_SEND_ACK_BACK_MSG_THRESHOLD * 8)
+
+/* Per-peer midcomms state; one instance per nodeid lives in node_hash. */
+struct midcomms_node {
+ int nodeid;
+ /* DLM_VERSION_NOT_SET until the first message from the peer is seen */
+ uint32_t version;
+ /* sequence number handed to the next queued outgoing message */
+ atomic_t seq_send;
+ /* sequence number expected in the next incoming message */
+ atomic_t seq_next;
+ /* These queues are unbound because we cannot drop any message in dlm.
+ * We could send a fence signal for a specific node to the cluster
+ * manager if queues hits some maximum value, however this handling
+ * not supported yet.
+ */
+ struct list_head send_queue;
+ /* taken (with _bh) around every modification of send_queue */
+ spinlock_t send_queue_lock;
+ /* number of entries accounted to send_queue */
+ atomic_t send_queue_cnt;
+ /* bit numbers for the flags field below */
+#define DLM_NODE_FLAG_CLOSE 1
+#define DLM_NODE_FLAG_STOP_TX 2
+#define DLM_NODE_FLAG_STOP_RX 3
+ /* messages passed up via dlm_receive_buffer() since the counter was
+ * last reset to 0
+ */
+ atomic_t ulp_delivered;
+ unsigned long flags;
+ /* woken on node reset and when the peer version gets detected */
+ wait_queue_head_t shutdown_wait;
+
+ /* dlm tcp termination state */
+#define DLM_CLOSED 1
+#define DLM_ESTABLISHED 2
+#define DLM_FIN_WAIT1 3
+#define DLM_FIN_WAIT2 4
+#define DLM_CLOSE_WAIT 5
+#define DLM_LAST_ACK 6
+#define DLM_CLOSING 7
+ int state;
+ /* held around the state transitions in the receive paths */
+ spinlock_t state_lock;
+
+ /* counts how many lockspaces are using this node
+ * this refcount is necessary to determine if the
+ * node wants to disconnect.
+ */
+ int users;
+
+ /* not protected by srcu, node_hash lifetime */
+ void *debugfs;
+
+ /* link in node_hash[] */
+ struct hlist_node hlist;
+ struct rcu_head rcu;
+};
+
+/* One outgoing, sequenced message tracked in a node's send_queue until
+ * the peer acknowledges it.
+ */
+struct dlm_mhandle {
+ const union dlm_packet *inner_p;
+ struct midcomms_node *node;
+ struct dlm_opts *opts;
+ /* lowcomms message; its reference is put when the mhandle is released */
+ struct dlm_msg *msg;
+ /* entries that are not committed yet are skipped on retransmit */
+ bool committed;
+ /* sequence number taken from node->seq_send when queued */
+ uint32_t seq;
+
+ /* optional hook, invoked when an ack covering this message arrives */
+ void (*ack_rcv)(struct midcomms_node *node);
+
+ /* get_mhandle/commit srcu idx exchange */
+ int idx;
+
+ /* link in node->send_queue */
+ struct list_head list;
+ /* used to defer the release until after an RCU grace period */
+ struct rcu_head rcu;
+};
+
+/* all known midcomms_node objects, bucket chosen by nodeid_hash(nodeid) */
+static struct hlist_head node_hash[CONN_HASH_SIZE];
+/* held while a node is inserted into node_hash */
+static DEFINE_SPINLOCK(nodes_lock);
+/* read side protection for lookups in node_hash */
+DEFINE_STATIC_SRCU(nodes_srcu);
+
+/* This mutex prevents midcomms_close() from running at the same time as
+ * stop() or remove(). Invalid memory accesses were observed when
+ * DLM_DEBUG_FENCE_TERMINATION was enabled and machines were reset,
+ * which ended in some double deletion in the nodes
+ * datastructure.
+ */
+static DEFINE_MUTEX(close_lock);
+
+/* Create the "dlm_mhandle" slab cache sized for struct dlm_mhandle and
+ * return the result of kmem_cache_create() unchanged.
+ */
+struct kmem_cache *dlm_midcomms_cache_create(void)
+{
+ return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle),
+ 0, 0, NULL);
+}
+
+/*
+ * Translate a DLM_* termination state into its name for log output.
+ * Every value that is not one of the seven known states yields "UNKNOWN".
+ */
+static inline const char *dlm_state_str(int state)
+{
+	static const char * const state_names[] = {
+		[DLM_CLOSED]		= "CLOSED",
+		[DLM_ESTABLISHED]	= "ESTABLISHED",
+		[DLM_FIN_WAIT1]		= "FIN_WAIT1",
+		[DLM_FIN_WAIT2]		= "FIN_WAIT2",
+		[DLM_CLOSE_WAIT]	= "CLOSE_WAIT",
+		[DLM_LAST_ACK]		= "LAST_ACK",
+		[DLM_CLOSING]		= "CLOSING",
+	};
+	const int nr_names = sizeof(state_names) / sizeof(state_names[0]);
+
+	/* out of range, or a hole in the table (e.g. 0) */
+	if (state < 0 || state >= nr_names || !state_names[state])
+		return "UNKNOWN";
+
+	return state_names[state];
+}
+
+/* Name of @node's current termination state; node->state is read without
+ * taking state_lock.
+ */
+const char *dlm_midcomms_state(struct midcomms_node *node)
+{
+ return dlm_state_str(node->state);
+}
+
+/* Raw DLM_NODE_FLAG_* bit mask of @node. */
+unsigned long dlm_midcomms_flags(struct midcomms_node *node)
+{
+ return node->flags;
+}
+
+/* Current value of @node's send_queue_cnt counter. */
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node)
+{
+ return atomic_read(&node->send_queue_cnt);
+}
+
+/* Protocol version recorded for @node (DLM_VERSION_NOT_SET if none yet). */
+uint32_t dlm_midcomms_version(struct midcomms_node *node)
+{
+ return node->version;
+}
+
+/*
+ * Walk hash bucket @r of node_hash and return the entry whose nodeid
+ * equals @nodeid, or NULL if the bucket holds no such entry. The bucket
+ * is traversed with the RCU list iterator.
+ */
+static struct midcomms_node *__find_node(int nodeid, int r)
+{
+	struct midcomms_node *cur;
+
+	hlist_for_each_entry_rcu(cur, &node_hash[r], hlist) {
+		if (cur->nodeid != nodeid)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* RCU callback queued by dlm_mhandle_delete(): puts the lowcomms message
+ * reference held by the mhandle and frees the mhandle itself.
+ */
+static void dlm_mhandle_release(struct rcu_head *rcu)
+{
+ struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu);
+
+ dlm_lowcomms_put_msg(mh->msg);
+ dlm_free_mhandle(mh);
+}
+
+/* Unlink @mh from @node's send queue, account for it in send_queue_cnt
+ * and defer the actual release via call_rcu(). Both callers in this file
+ * hold node->send_queue_lock.
+ */
+static void dlm_mhandle_delete(struct midcomms_node *node,
+ struct dlm_mhandle *mh)
+{
+ list_del_rcu(&mh->list);
+ atomic_dec(&node->send_queue_cnt);
+ call_rcu(&mh->rcu, dlm_mhandle_release);
+}
+
+/* Remove every entry from @node's send queue, acknowledged or not. */
+static void dlm_send_queue_flush(struct midcomms_node *node)
+{
+ struct dlm_mhandle *mh;
+
+ pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
+
+ rcu_read_lock();
+ spin_lock_bh(&node->send_queue_lock);
+ /* entries are unlinked with list_del_rcu() and freed via call_rcu(),
+ * so the iterator can still step from an entry that was just deleted
+ */
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ dlm_mhandle_delete(node, mh);
+ }
+ spin_unlock_bh(&node->send_queue_lock);
+ rcu_read_unlock();
+}
+
+/*
+ * Put @node back into its initial state: both sequence counters restart
+ * at DLM_SEQ_INIT, the delivered counter, version and flags are cleared,
+ * the send queue is flushed and the state becomes DLM_CLOSED. Waiters on
+ * shutdown_wait are woken afterwards.
+ *
+ * No lock is taken here for node->state; that is left to the caller.
+ */
+static void midcomms_node_reset(struct midcomms_node *node)
+{
+ pr_debug("reset node %d\n", node->nodeid);
+
+ atomic_set(&node->seq_next, DLM_SEQ_INIT);
+ atomic_set(&node->seq_send, DLM_SEQ_INIT);
+ atomic_set(&node->ulp_delivered, 0);
+ node->version = DLM_VERSION_NOT_SET;
+ node->flags = 0;
+
+ dlm_send_queue_flush(node);
+ node->state = DLM_CLOSED;
+ wake_up(&node->shutdown_wait);
+}
+
+/* Look up the midcomms_node for @nodeid in its hash bucket; NULL if there
+ * is none.
+ */
+static struct midcomms_node *nodeid2node(int nodeid)
+{
+ return __find_node(nodeid, nodeid_hash(nodeid));
+}
+
+/*
+ * dlm_midcomms_addr() - pass an address for @nodeid on to lowcomms and
+ * make sure a midcomms_node exists for that nodeid.
+ *
+ * @addr and @len are forwarded to dlm_lowcomms_addr(); a non-zero result
+ * of that call is returned unchanged. If node_hash has no entry for
+ * @nodeid yet, a new node is allocated, initialised via
+ * midcomms_node_reset() (state DLM_CLOSED) and inserted.
+ *
+ * Returns 0 on success (also when the node already existed) or -ENOMEM
+ * if the node cannot be allocated.
+ */
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+{
+ int ret, idx, r = nodeid_hash(nodeid);
+ struct midcomms_node *node;
+
+ ret = dlm_lowcomms_addr(nodeid, addr, len);
+ if (ret)
+ return ret;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = __find_node(nodeid, r);
+ if (node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return 0;
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ /* NOTE(review): the lookup above and the insertion below are not
+ * covered by one lock - presumably callers never race on the same
+ * nodeid; confirm
+ */
+
+ /* kmalloc() does not zero: the fields are set one by one below and
+ * the remaining ones in midcomms_node_reset()
+ */
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return -ENOMEM;
+
+ node->nodeid = nodeid;
+ spin_lock_init(&node->state_lock);
+ spin_lock_init(&node->send_queue_lock);
+ atomic_set(&node->send_queue_cnt, 0);
+ INIT_LIST_HEAD(&node->send_queue);
+ init_waitqueue_head(&node->shutdown_wait);
+ node->users = 0;
+ midcomms_node_reset(node);
+
+ spin_lock(&nodes_lock);
+ hlist_add_head_rcu(&node->hlist, &node_hash[r]);
+ spin_unlock(&nodes_lock);
+
+ node->debugfs = dlm_create_debug_comms_file(nodeid, node);
+ return 0;
+}
+
+/*
+ * dlm_send_ack() - send a header-only DLM_ACK carrying @seq to @nodeid.
+ *
+ * The message is allocated with GFP_ATOMIC directly from lowcomms, i.e.
+ * it is not tracked in a send queue of this layer.
+ *
+ * Returns 0 on success or -ENOMEM if no message could be allocated.
+ */
+static int dlm_send_ack(int nodeid, uint32_t seq)
+{
+ int mb_len = sizeof(struct dlm_header);
+ struct dlm_header *m_header;
+ struct dlm_msg *msg;
+ char *ppc;
+
+ msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc,
+ NULL, NULL);
+ if (!msg)
+ return -ENOMEM;
+
+ m_header = (struct dlm_header *)ppc;
+
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
+ m_header->h_cmd = DLM_ACK;
+ m_header->u.h_seq = cpu_to_le32(seq);
+
+ dlm_lowcomms_commit_msg(msg);
+ dlm_lowcomms_put_msg(msg);
+
+ return 0;
+}
+
+/*
+ * dlm_send_ack_threshold() - ack back once enough messages were delivered.
+ *
+ * If node->ulp_delivered is greater than @threshold, the counter is reset
+ * to 0 and an ack carrying the current node->seq_next is sent. The return
+ * value of dlm_send_ack() is ignored.
+ */
+static void dlm_send_ack_threshold(struct midcomms_node *node,
+ uint32_t threshold)
+{
+ uint32_t oval, nval;
+ bool send_ack;
+
+ /* let only send one user trigger threshold to send ack back */
+ do {
+ oval = atomic_read(&node->ulp_delivered);
+ send_ack = (oval > threshold);
+ /* abort if threshold is not reached */
+ if (!send_ack)
+ break;
+
+ nval = 0;
+ /* try to reset ulp_delivered counter */
+ /* a failed cmpxchg means the counter changed meanwhile: re-read
+ * and re-evaluate the threshold
+ */
+ } while (atomic_cmpxchg(&node->ulp_delivered, oval, nval) != oval);
+
+ if (send_ack)
+ dlm_send_ack(node->nodeid, atomic_read(&node->seq_next));
+}
+
+/*
+ * dlm_send_fin() - queue a header-only DLM_FIN for @node.
+ *
+ * The message goes through the mhandle API (GFP_ATOMIC). @ack_rcv is
+ * stored in the mhandle and is invoked by dlm_receive_ack() once an ack
+ * covering the FIN arrives. DLM_NODE_FLAG_STOP_TX is set on the node
+ * before the FIN is committed.
+ *
+ * Returns 0 on success or -ENOMEM if no mhandle could be obtained.
+ */
+static int dlm_send_fin(struct midcomms_node *node,
+ void (*ack_rcv)(struct midcomms_node *node))
+{
+ int mb_len = sizeof(struct dlm_header);
+ struct dlm_header *m_header;
+ struct dlm_mhandle *mh;
+ char *ppc;
+
+ mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc);
+ if (!mh)
+ return -ENOMEM;
+
+ set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
+ mh->ack_rcv = ack_rcv;
+
+ m_header = (struct dlm_header *)ppc;
+
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
+ m_header->h_cmd = DLM_FIN;
+
+ pr_debug("sending fin msg to node %d\n", node->nodeid);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * dlm_receive_ack() - process an ack that covers everything before @seq.
+ *
+ * Works in two passes over the send queue, both stopping at the first
+ * entry that is not before @seq:
+ *  1. invoke the ack_rcv hook of every acknowledged entry, without
+ *     send_queue_lock held - a hook such as dlm_pas_fin_ack_rcv() may
+ *     reset the node, which flushes the queue and takes that lock itself;
+ *  2. remove the acknowledged entries under send_queue_lock.
+ */
+static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
+{
+ struct dlm_mhandle *mh;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ if (before(mh->seq, seq)) {
+ if (mh->ack_rcv)
+ mh->ack_rcv(node);
+ } else {
+ /* send queue should be ordered */
+ break;
+ }
+ }
+
+ spin_lock_bh(&node->send_queue_lock);
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ if (before(mh->seq, seq)) {
+ dlm_mhandle_delete(node, mh);
+ } else {
+ /* send queue should be ordered */
+ break;
+ }
+ }
+ spin_unlock_bh(&node->send_queue_lock);
+ rcu_read_unlock();
+}
+
+/*
+ * ack_rcv hook for the FIN sent during a passive close: the peer has
+ * acknowledged our FIN. In DLM_LAST_ACK the node is reset (-> DLM_CLOSED);
+ * if it is already DLM_CLOSED only the shutdown waiters are woken. Any
+ * other state is reported and triggers a one-time warning.
+ */
+static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
+{
+ spin_lock(&node->state_lock);
+ pr_debug("receive passive fin ack from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_LAST_ACK:
+ /* DLM_CLOSED */
+ midcomms_node_reset(node);
+ break;
+ case DLM_CLOSED:
+ /* not valid but somehow we got what we want */
+ wake_up(&node->shutdown_wait);
+ break;
+ default:
+ /* the lock is dropped first, so the state printed below is read
+ * without state_lock held
+ */
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ WARN_ON_ONCE(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+}
+
+/*
+ * Emit the receive tracepoint that matches the command of @p: DLM_MSG
+ * and DLM_RCOM packets are traced together with our own nodeid and @seq,
+ * every other command is not traced.
+ */
+static void dlm_receive_buffer_3_2_trace(uint32_t seq,
+					 const union dlm_packet *p)
+{
+	if (p->header.h_cmd == DLM_MSG)
+		trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message);
+	else if (p->header.h_cmd == DLM_RCOM)
+		trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom);
+}
+
+/*
+ * dlm_midcomms_receive_buffer() - process one sequenced inner message.
+ *
+ * @p: the inner packet (DLM_FIN, or a message for the upper layer)
+ * @node: the sending node
+ * @seq: sequence number the caller took from the enclosing header
+ *
+ * If @seq is the expected node->seq_next, the counter is advanced with a
+ * cmpxchg loop and the message is handled: DLM_FIN drives the termination
+ * state machine under state_lock, anything else is traced, handed to
+ * dlm_receive_buffer() and counted in ulp_delivered.
+ *
+ * A message with any other sequence number is ignored. If it lies before
+ * the expected one it is a duplicate, so the current seq_next is acked
+ * again to let the sender drop it.
+ */
+static void dlm_midcomms_receive_buffer(const union dlm_packet *p,
+ struct midcomms_node *node,
+ uint32_t seq)
+{
+ bool is_expected_seq;
+ uint32_t oval, nval;
+
+ do {
+ oval = atomic_read(&node->seq_next);
+ is_expected_seq = (oval == seq);
+ if (!is_expected_seq)
+ break;
+
+ nval = oval + 1;
+ } while (atomic_cmpxchg(&node->seq_next, oval, nval) != oval);
+
+ if (is_expected_seq) {
+ switch (p->header.h_cmd) {
+ case DLM_FIN:
+ spin_lock(&node->state_lock);
+ pr_debug("receive fin msg from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ dlm_send_ack(node->nodeid, nval);
+
+ /* passive shutdown DLM_LAST_ACK case 1
+ * additional we check if the node is used by
+ * cluster manager events at all.
+ */
+ if (node->users == 0) {
+ node->state = DLM_LAST_ACK;
+ pr_debug("switch node %d to state %s case 1\n",
+ node->nodeid, dlm_state_str(node->state));
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+ } else {
+ node->state = DLM_CLOSE_WAIT;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ }
+ break;
+ case DLM_FIN_WAIT1:
+ dlm_send_ack(node->nodeid, nval);
+ node->state = DLM_CLOSING;
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_FIN_WAIT2:
+ dlm_send_ack(node->nodeid, nval);
+ midcomms_node_reset(node);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_LAST_ACK:
+ /* probably remove_member caught it, do nothing */
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ WARN_ON_ONCE(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+ break;
+ default:
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer_3_2_trace(seq, p);
+ dlm_receive_buffer(p, node->nodeid);
+ atomic_inc(&node->ulp_delivered);
+ /* unlikely case to send ack back when we don't transmit */
+ dlm_send_ack_threshold(node, DLM_RECV_ACK_BACK_MSG_THRESHOLD);
+ break;
+ }
+ } else {
+ /* retry to ack message which we already have by sending back
+ * current node->seq_next number as ack.
+ *
+ * Sequence numbers are 32 bit counters that wrap around, so
+ * the "older than" test has to be wraparound aware, exactly
+ * like the before() comparison used in dlm_receive_ack().
+ */
+ if (before(seq, oval))
+ dlm_send_ack(node->nodeid, oval);
+
+ log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d",
+ seq, oval, node->nodeid);
+ }
+}
+
+/*
+ * dlm_opts_check_msglen() - check that a DLM_OPTS packet is long enough.
+ *
+ * @p: the received packet, starting with the opts header
+ * @msglen: length from the outer header
+ * @nodeid: sender, only used for log output
+ *
+ * Starting from @msglen, the size of struct dlm_opts and the announced
+ * option length are subtracted; what remains must be able to hold the
+ * fixed part of the inner message named by o_nextcmd (DLM_FIN, DLM_MSG
+ * or DLM_RCOM).
+ *
+ * Returns 0 if the lengths are consistent, -1 otherwise (including an
+ * unsupported o_nextcmd).
+ */
+static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen,
+ int nodeid)
+{
+ int len = msglen;
+
+ /* we only trust outer header msglen because
+ * it's checked against receive buffer length.
+ */
+ if (len < sizeof(struct dlm_opts))
+ return -1;
+ len -= sizeof(struct dlm_opts);
+
+ if (len < le16_to_cpu(p->opts.o_optlen))
+ return -1;
+ len -= le16_to_cpu(p->opts.o_optlen);
+
+ /* len is now the space left for the inner message */
+ switch (p->opts.o_nextcmd) {
+ case DLM_FIN:
+ if (len < sizeof(struct dlm_header)) {
+ log_print("fin too small: %d, will skip this message from node %d",
+ len, nodeid);
+ return -1;
+ }
+
+ break;
+ case DLM_MSG:
+ if (len < sizeof(struct dlm_message)) {
+ /* report the value that was actually tested, like the
+ * other branches do
+ */
+ log_print("msg too small: %d, will skip this message from node %d",
+ len, nodeid);
+ return -1;
+ }
+
+ break;
+ case DLM_RCOM:
+ if (len < sizeof(struct dlm_rcom)) {
+ log_print("rcom msg too small: %d, will skip this message from node %d",
+ len, nodeid);
+ return -1;
+ }
+
+ break;
+ default:
+ log_print("unsupported o_nextcmd received: %u, will skip this message from node %d",
+ p->opts.o_nextcmd, nodeid);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * dlm_midcomms_receive_buffer_3_2() - handle one message whose header
+ * announces protocol version 3.2.
+ *
+ * @p: the complete message
+ * @nodeid: sending node; a midcomms_node must already exist for it
+ *
+ * The first message from a node fixes node->version to 3.2 and moves a
+ * CLOSED node to ESTABLISHED; a node already recorded with a different
+ * version gets its message dropped. Afterwards the command is dispatched:
+ *  - DLM_RCOM: only the NAMES/STATUS requests and replies are accepted
+ *    here and passed up directly (they are not sequenced);
+ *  - DLM_OPTS: lengths are validated, the inner message is located behind
+ *    the options and handed to dlm_midcomms_receive_buffer() with the
+ *    sequence number of the outer header;
+ *  - DLM_ACK: the acknowledged sequence number is processed;
+ *  - anything else is logged and skipped.
+ *
+ * The whole function runs inside an nodes_srcu read section.
+ */
+static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid)
+{
+ uint16_t msglen = le16_to_cpu(p->header.h_length);
+ struct midcomms_node *node;
+ uint32_t seq;
+ int ret, idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node))
+ goto out;
+
+ switch (node->version) {
+ case DLM_VERSION_NOT_SET:
+ node->version = DLM_VERSION_3_2;
+ wake_up(&node->shutdown_wait);
+ log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2,
+ node->nodeid);
+
+ spin_lock(&node->state_lock);
+ switch (node->state) {
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&node->state_lock);
+
+ break;
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+ DLM_VERSION_3_2, node->nodeid, node->version);
+ goto out;
+ }
+
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ /* these rcom message we use to determine version.
+ * they have their own retransmission handling and
+ * are the first messages of dlm.
+ *
+ * length already checked.
+ */
+ switch (p->rcom.rc_type) {
+ case cpu_to_le32(DLM_RCOM_NAMES):
+ fallthrough;
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
+ fallthrough;
+ case cpu_to_le32(DLM_RCOM_STATUS):
+ fallthrough;
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
+ break;
+ default:
+ log_print("unsupported rcom type received: %u, will skip this message from node %d",
+ le32_to_cpu(p->rcom.rc_type), nodeid);
+ goto out;
+ }
+
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer(p, nodeid);
+ break;
+ case DLM_OPTS:
+ seq = le32_to_cpu(p->header.u.h_seq);
+
+ ret = dlm_opts_check_msglen(p, msglen, nodeid);
+ if (ret < 0) {
+ log_print("opts msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ /* from here on p is the inner message behind the options */
+ p = (union dlm_packet *)((unsigned char *)p->opts.o_opts +
+ le16_to_cpu(p->opts.o_optlen));
+
+ /* recheck inner msglen just if it's not garbage */
+ msglen = le16_to_cpu(p->header.h_length);
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("inner rcom msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("inner msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ case DLM_FIN:
+ if (msglen < sizeof(struct dlm_header)) {
+ log_print("inner fin too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ default:
+ /* log the offending command, not the message length */
+ log_print("unsupported inner h_cmd received: %u, will skip this message from node %d",
+ p->header.h_cmd, nodeid);
+ goto out;
+ }
+
+ dlm_midcomms_receive_buffer(p, node, seq);
+ break;
+ case DLM_ACK:
+ seq = le32_to_cpu(p->header.u.h_seq);
+ dlm_receive_ack(node, seq);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+ p->header.h_cmd, nodeid);
+ break;
+ }
+
+out:
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/*
+ * dlm_midcomms_receive_buffer_3_1() - handle one message whose header
+ * announces protocol version 3.1.
+ *
+ * The first message from a node fixes node->version to 3.1; a node already
+ * recorded with a different version gets its message dropped. Only
+ * DLM_RCOM and DLM_MSG (with a length check) are accepted and passed to
+ * dlm_receive_buffer(); there is no sequence number or ack handling on
+ * this path.
+ */
+static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid)
+{
+ uint16_t msglen = le16_to_cpu(p->header.h_length);
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ switch (node->version) {
+ case DLM_VERSION_NOT_SET:
+ node->version = DLM_VERSION_3_1;
+ wake_up(&node->shutdown_wait);
+ log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1,
+ node->nodeid);
+ break;
+ case DLM_VERSION_3_1:
+ break;
+ default:
+ log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+ DLM_VERSION_3_1, node->nodeid, node->version);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+ /* node is not touched any more below, so the read section ends here */
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ /* length already checked */
+ break;
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ return;
+ }
+
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+ p->header.h_cmd, nodeid);
+ return;
+ }
+
+ dlm_receive_buffer(p, nodeid);
+}
+
+/*
+ * dlm_validate_incoming_buffer() - check the message framing in @buf.
+ *
+ * @nodeid: sender, only used for log output
+ * @buf: received bytes
+ * @len: number of valid bytes in @buf
+ *
+ * Walks the buffer header by header without processing any message.
+ *
+ * Returns the number of bytes occupied by complete messages at the start
+ * of @buf (a trailing partial message is not counted), or -EBADMSG if a
+ * header announces a length above DLM_MAX_SOCKET_BUFSIZE or below the
+ * size of struct dlm_header.
+ */
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len)
+{
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
+ uint16_t msglen;
+ int ret = 0;
+
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or
+ * less than dlm_header size.
+ *
+ * Some messages does not have a 8 byte length boundary yet
+ * which can occur in a unaligned memory access of some dlm
+ * messages. However this problem need to be fixed at the
+ * sending side, for now it seems nobody run into architecture
+ * related issues yet but it slows down some processing.
+ * Fixing this issue should be scheduled in future by doing
+ * the next major version bump.
+ */
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > DLM_MAX_SOCKET_BUFSIZE ||
+ msglen < sizeof(struct dlm_header)) {
+ log_print("received invalid length header: %u from node %d, will abort message parsing",
+ msglen, nodeid);
+ return -EBADMSG;
+ }
+
+ /* caller will take care that leftover
+ * will be parsed next call with more data
+ */
+ if (msglen > len)
+ break;
+
+ ret += msglen;
+ len -= msglen;
+ ptr += msglen;
+ }
+
+ return ret;
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Every complete message in @buf is dispatched by the protocol version
+ * in its header; messages with an unknown version are logged and
+ * skipped. Returns the number of bytes consumed; a trailing partial
+ * message is left for the caller.
+ *
+ * NOTE(review): the lower bound of h_length is not re-checked here (a
+ * length of 0 would never advance the loop) - presumably the caller has
+ * run dlm_validate_incoming_buffer() on the same data first; confirm.
+ */
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+{
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
+ uint16_t msglen;
+ int ret = 0;
+
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > len)
+ break;
+
+ switch (hd->h_version) {
+ case cpu_to_le32(DLM_VERSION_3_1):
+ dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid);
+ break;
+ case cpu_to_le32(DLM_VERSION_3_2):
+ dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid);
+ break;
+ default:
+ log_print("received invalid version header: %u from node %d, will skip this message",
+ le32_to_cpu(hd->h_version), nodeid);
+ break;
+ }
+
+ ret += msglen;
+ len -= msglen;
+ ptr += msglen;
+ }
+
+ return ret;
+}
+
+/*
+ * Ask lowcomms to resend every committed message that is still sitting on
+ * the send queue of @nodeid. Only done for DLM_VERSION_3_2 nodes; any
+ * other version returns without resending anything.
+ */
+void dlm_midcomms_unack_msg_resend(int nodeid)
+{
+	struct midcomms_node *node;
+	struct dlm_mhandle *mh;
+	int idx;
+
+	idx = srcu_read_lock(&nodes_srcu);
+	node = nodeid2node(nodeid);
+	if (WARN_ON_ONCE(!node))
+		goto unlock;
+
+	/* old protocol, we don't support to retransmit on failure */
+	if (node->version != DLM_VERSION_3_2)
+		goto unlock;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(mh, &node->send_queue, list) {
+		/* uncommitted handles are skipped */
+		if (!mh->committed)
+			continue;
+
+		/* a zero return from the resend is what gets logged */
+		if (!dlm_lowcomms_resend_msg(mh->msg)) {
+			log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d",
+					      mh->seq, node->nodeid);
+		}
+	}
+	rcu_read_unlock();
+
+unlock:
+	srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Fill in the header of a DLM_OPTS message that wraps an inner message of
+ * @inner_len bytes and carries sequence number @seq. All multi-byte fields
+ * are stored little endian.
+ */
+static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
+				 uint32_t seq)
+{
+	struct dlm_header *hdr = &opts->o_header;
+
+	hdr->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	hdr->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+	/* total length is the opts part plus the wrapped message */
+	hdr->h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len);
+	hdr->h_cmd = DLM_OPTS;
+	hdr->u.h_seq = cpu_to_le32(seq);
+}
+
+/* Callback handed to dlm_lowcomms_new_msg(): accounts the mhandle passed as
+ * @data on its node's send queue and assigns it the next send sequence
+ * number of that node.
+ */
+static void midcomms_new_msg_cb(void *data)
+{
+	struct dlm_mhandle *mh = data;
+	struct midcomms_node *node = mh->node;
+
+	atomic_inc(&node->send_queue_cnt);
+
+	/* queue writers are serialized by send_queue_lock */
+	spin_lock_bh(&node->send_queue_lock);
+	list_add_tail_rcu(&mh->list, &node->send_queue);
+	spin_unlock_bh(&node->send_queue_lock);
+
+	mh->seq = atomic_fetch_inc(&node->seq_send);
+}
+
+/* Allocate a lowcomms message for a DLM_VERSION_3_2 node, sized for the
+ * @len byte payload plus DLM_MIDCOMMS_OPT_LEN for the opts header.
+ *
+ * On success the opts header is filled in, mh->opts points at it, and both
+ * mh->inner_p and *ppc point just behind it where the caller writes the
+ * payload. Returns the message, or NULL if dlm_lowcomms_new_msg() failed.
+ */
+static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid,
+ int len, gfp_t allocation, char **ppc)
+{
+ struct dlm_opts *opts;
+ struct dlm_msg *msg;
+
+ msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN,
+ allocation, ppc, midcomms_new_msg_cb, mh);
+ if (!msg)
+ return NULL;
+
+ opts = (struct dlm_opts *)*ppc;
+ mh->opts = opts;
+
+ /* add possible options here */
+ /* NOTE(review): mh->seq is assigned in midcomms_new_msg_cb(); this
+ * presumably relies on dlm_lowcomms_new_msg() having invoked the
+ * callback before returning - confirm against lowcomms.
+ */
+ dlm_fill_opts_header(opts, len, mh->seq);
+
+ /* step over the opts header so the caller sees only the payload area */
+ *ppc += sizeof(*opts);
+ mh->inner_p = (const union dlm_packet *)*ppc;
+ return msg;
+}
+
+/* Allocate a message handle plus the underlying lowcomms message for
+ * sending @len bytes to @nodeid. *ppc is set to where the caller has to
+ * write the message.
+ *
+ * Returns the handle with nodes_srcu still read-locked (index stored in
+ * mh->idx), or NULL on failure with nodes_srcu released again.
+ *
+ * avoid false positive for nodes_srcu, unlock happens in
+ * dlm_midcomms_commit_mhandle which is a must call if success
+ */
+#ifndef __CHECKER__
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc)
+{
+ struct midcomms_node *node;
+ struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node))
+ goto err;
+
+ /* this is a bug, however we going on and hope it will be resolved */
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+
+ mh = dlm_allocate_mhandle(allocation);
+ if (!mh)
+ goto err;
+
+ mh->committed = false;
+ mh->ack_rcv = NULL;
+ mh->idx = idx;
+ mh->node = node;
+
+ switch (node->version) {
+ case DLM_VERSION_3_1:
+ /* plain message: no opts header and no new-message callback */
+ msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc,
+ NULL, NULL);
+ if (!msg) {
+ dlm_free_mhandle(mh);
+ goto err;
+ }
+
+ break;
+ case DLM_VERSION_3_2:
+ /* send ack back if necessary */
+ dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD);
+
+ msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
+ ppc);
+ if (!msg) {
+ dlm_free_mhandle(mh);
+ goto err;
+ }
+ break;
+ default:
+ /* any other version value is not handled here */
+ dlm_free_mhandle(mh);
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+
+ mh->msg = msg;
+
+ /* keep in mind that is a must to call
+ * dlm_midcomms_commit_msg() which releases
+ * nodes_srcu using mh->idx which is assumed
+ * here that the application will call it.
+ */
+ return mh;
+
+err:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return NULL;
+}
+#endif
+
+/* Emit the send tracepoint that matches the inner message of @mh:
+ * DLM_MSG and DLM_RCOM are traced, every other command is not.
+ */
+static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh,
+					      const void *name, int namelen)
+{
+	const union dlm_packet *inner = mh->inner_p;
+
+	if (inner->header.h_cmd == DLM_MSG) {
+		trace_dlm_send_message(mh->node->nodeid, mh->seq,
+				       &inner->message, name, namelen);
+	} else if (inner->header.h_cmd == DLM_RCOM) {
+		trace_dlm_send_rcom(mh->node->nodeid, mh->seq,
+				    &inner->rcom);
+	}
+	/* anything else: nothing to trace */
+}
+
+/* Finalize a DLM_VERSION_3_2 message: record the inner command in the opts
+ * header, mark the handle committed, trace it and pass the message to
+ * lowcomms.
+ */
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh,
+ const void *name, int namelen)
+{
+ /* nexthdr chain for fast lookup */
+ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd;
+ /* dlm_midcomms_unack_msg_resend() only resends committed handles */
+ mh->committed = true;
+ dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen);
+ dlm_lowcomms_commit_msg(mh->msg);
+}
+
+/* Commit a handle obtained from dlm_midcomms_get_mhandle() and release the
+ * nodes_srcu read lock that was taken there (index kept in mh->idx).
+ *
+ * avoid false positive for nodes_srcu, the lock was taken in
+ * dlm_midcomms_get_mhandle
+ */
+#ifndef __CHECKER__
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
+ const void *name, int namelen)
+{
+
+ switch (mh->node->version) {
+ case DLM_VERSION_3_1:
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+
+ dlm_lowcomms_commit_msg(mh->msg);
+ dlm_lowcomms_put_msg(mh->msg);
+ /* mh is not part of rcu list in this case */
+ dlm_free_mhandle(mh);
+ break;
+ case DLM_VERSION_3_2:
+ /* held rcu read lock here, because we sending the
+ * dlm message out, when we do that we could receive
+ * an ack back which releases the mhandle and we
+ * get a use after free.
+ */
+ rcu_read_lock();
+ dlm_midcomms_commit_msg_3_2(mh, name, namelen);
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ rcu_read_unlock();
+ break;
+ default:
+ /* unknown version: only drop the srcu read lock, mh is not freed here */
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+#endif
+
+/* Thin wrapper: returns whatever dlm_lowcomms_start() returns. */
+int dlm_midcomms_start(void)
+{
+ return dlm_lowcomms_start();
+}
+
+/* Thin wrapper around dlm_lowcomms_stop(). */
+void dlm_midcomms_stop(void)
+{
+ dlm_lowcomms_stop();
+}
+
+/* Initialize every bucket of the node hash table, then initialize lowcomms. */
+void dlm_midcomms_init(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&node_hash[i]);
+
+ dlm_lowcomms_init();
+}
+
+/* call_srcu() callback: flush what is left on the node's send queue and
+ * free the node. Warns once if send_queue_cnt is still non-zero.
+ */
+static void midcomms_node_release(struct rcu_head *rcu)
+{
+ struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
+
+ WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
+ dlm_send_queue_flush(node);
+ kfree(node);
+}
+
+/* Tear down midcomms: remove every node from the hash table, drop its
+ * debugfs file and schedule it for release via call_srcu(), then shut
+ * down lowcomms with dlm_lowcomms_exit().
+ */
+void dlm_midcomms_exit(void)
+{
+ struct midcomms_node *node;
+ int i, idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ /* unlinking from the hash is done under nodes_lock */
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
+
+ /* the node is freed in midcomms_node_release() */
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ dlm_lowcomms_exit();
+}
+
+/* Ack callback for the fin sent from midcomms_shutdown() (active shutdown).
+ * Advances the node state under state_lock:
+ * DLM_FIN_WAIT1 -> DLM_FIN_WAIT2, DLM_CLOSING -> midcomms_node_reset(),
+ * DLM_CLOSED -> only wake up shutdown waiters; any other state is logged
+ * and triggers a one-time warning.
+ */
+static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
+{
+ spin_lock(&node->state_lock);
+ pr_debug("receive active fin ack from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_FIN_WAIT1:
+ node->state = DLM_FIN_WAIT2;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSING:
+ midcomms_node_reset(node);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSED:
+ /* not valid but somehow we got what we want */
+ wake_up(&node->shutdown_wait);
+ break;
+ default:
+ /* the lock is dropped before logging; node->state is read unlocked below */
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ WARN_ON_ONCE(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+}
+
+/* Take a user reference on the node for @nodeid. When this is the first
+ * user, the node is moved to DLM_ESTABLISHED (resetting it first if it is
+ * in neither DLM_ESTABLISHED nor DLM_CLOSED). Warns once and returns if no
+ * node exists for @nodeid.
+ */
+void dlm_midcomms_add_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ if (!node->users) {
+ pr_debug("receive add member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ default:
+ /* some invalid state passive shutdown
+ * was failed, we try to reset and
+ * hope it will go on.
+ */
+ log_print("reset node %d because shutdown stuck",
+ node->nodeid);
+
+ midcomms_node_reset(node);
+ node->state = DLM_ESTABLISHED;
+ break;
+ }
+ }
+
+ /* users is modified only while holding state_lock */
+ node->users++;
+ pr_debug("node %d users inc count %d\n", nodeid, node->users);
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Drop a user reference on the node for @nodeid. When the count reaches
+ * zero and the node is in DLM_CLOSE_WAIT, the passive shutdown continues:
+ * the node moves to DLM_LAST_ACK, DLM_NODE_FLAG_STOP_RX is set and a fin
+ * is sent. Returns silently if the node does not exist or has no users.
+ */
+void dlm_midcomms_remove_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ /* in case of dlm_midcomms_close() removes node */
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ /* case of dlm_midcomms_addr() created node but
+ * was not added before because dlm_midcomms_close()
+ * removed the node
+ */
+ if (!node->users) {
+ spin_unlock(&node->state_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ node->users--;
+ pr_debug("node %d users dec count %d\n", nodeid, node->users);
+
+ /* hitting users count to zero means the
+ * other side is running dlm_midcomms_stop()
+ * we meet us to have a clean disconnect.
+ */
+ if (node->users == 0) {
+ pr_debug("receive remove member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSE_WAIT:
+ /* passive shutdown DLM_LAST_ACK case 2 */
+ node->state = DLM_LAST_ACK;
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+ break;
+ case DLM_LAST_ACK:
+ /* probably receive fin caught it, do nothing */
+ break;
+ case DLM_CLOSED:
+ /* already gone, do nothing */
+ break;
+ default:
+ /* unlike dlm_act_fin_ack_rcv(), this is logged with state_lock held */
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ break;
+ }
+ }
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* For every known node, wait up to DLM_SHUTDOWN_TIMEOUT until its version
+ * is set, it is in DLM_CLOSED, or DLM_NODE_FLAG_CLOSE is set.
+ * Nodes are waited for one after another, so the timeout applies per node.
+ */
+void dlm_midcomms_version_wait(void)
+{
+ struct midcomms_node *node;
+ int i, idx, ret;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ ret = wait_event_timeout(node->shutdown_wait,
+ node->version != DLM_VERSION_NOT_SET ||
+ node->state == DLM_CLOSED ||
+ test_bit(DLM_NODE_FLAG_CLOSE, &node->flags),
+ DLM_SHUTDOWN_TIMEOUT);
+ /* the "timed out" message is also printed when the wait
+ * ended because DLM_NODE_FLAG_CLOSE is set
+ */
+ if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags))
+ pr_debug("version wait timed out for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Active shutdown of one node. Only DLM_VERSION_3_2 nodes take part: an
+ * established node is moved to DLM_FIN_WAIT1 and sent a fin, then the
+ * caller sleeps up to DLM_SHUTDOWN_TIMEOUT until the node reaches
+ * DLM_CLOSED or DLM_NODE_FLAG_CLOSE is set. The outcome is only logged.
+ */
+static void midcomms_shutdown(struct midcomms_node *node)
+{
+ int ret;
+
+ /* old protocol, we don't wait for pending operations */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ pr_debug("receive active shutdown for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ node->state = DLM_FIN_WAIT1;
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ dlm_send_fin(node, dlm_act_fin_ack_rcv);
+ break;
+ case DLM_CLOSED:
+ /* we have what we want */
+ break;
+ default:
+ /* busy to enter DLM_FIN_WAIT1, wait until passive
+ * done in shutdown_wait to enter DLM_CLOSED.
+ */
+ break;
+ }
+ spin_unlock(&node->state_lock);
+
+ /* debug knob: delays the wait below by five seconds when enabled */
+ if (DLM_DEBUG_FENCE_TERMINATION)
+ msleep(5000);
+
+ /* wait for other side dlm + fin */
+ ret = wait_event_timeout(node->shutdown_wait,
+ node->state == DLM_CLOSED ||
+ test_bit(DLM_NODE_FLAG_CLOSE, &node->flags),
+ DLM_SHUTDOWN_TIMEOUT);
+ if (!ret)
+ pr_debug("active shutdown timed out for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ else
+ pr_debug("active shutdown done for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+}
+
+/* Shut down all nodes: run midcomms_shutdown() on each node, shut down
+ * lowcomms, then reset every node. The whole sequence runs under
+ * close_lock, the mutex dlm_midcomms_close() also takes.
+ */
+void dlm_midcomms_shutdown(void)
+{
+ struct midcomms_node *node;
+ int i, idx;
+
+ mutex_lock(&close_lock);
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_shutdown(node);
+ }
+ }
+
+ dlm_lowcomms_shutdown();
+
+ /* second pass: reset all nodes after lowcomms has been shut down */
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_node_reset(node);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+}
+
+/* Close the connection to @nodeid and remove its midcomms node.
+ *
+ * Shutdown waiters on the node are released first via DLM_NODE_FLAG_CLOSE.
+ * Then, under close_lock, the lowcomms connection is closed, the node is
+ * unlinked from the hash, its send queue is flushed and the node is
+ * scheduled for release. Returns the result of dlm_lowcomms_close().
+ */
+int dlm_midcomms_close(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx, ret;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ /* Abort pending close/remove operation */
+ node = nodeid2node(nodeid);
+ if (node) {
+ /* let shutdown waiters leave */
+ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags);
+ wake_up(&node->shutdown_wait);
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ synchronize_srcu(&nodes_srcu);
+
+ mutex_lock(&close_lock);
+ idx = srcu_read_lock(&nodes_srcu);
+ /* look the node up again, the first lookup was done without close_lock */
+ node = nodeid2node(nodeid);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+ return dlm_lowcomms_close(nodeid);
+ }
+
+ ret = dlm_lowcomms_close(nodeid);
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ /* wait that all readers left until flush send queue */
+ synchronize_srcu(&nodes_srcu);
+
+ /* drop all pending dlm messages, this is fine as
+ * this function get called when the node is fenced
+ */
+ dlm_send_queue_flush(node);
+
+ /* the node itself is freed later by midcomms_node_release() */
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
+ mutex_unlock(&close_lock);
+
+ return ret;
+}
+
+/* debug functionality to send raw dlm msg from user space */
+/* Context passed to midcomms_new_rawmsg_cb() by dlm_midcomms_rawmsg_send(). */
+struct dlm_rawmsg_data {
+ struct midcomms_node *node; /* destination node, source of the sequence number */
+ void *buf; /* raw message; the callback reads it as a struct dlm_header */
+};
+
+/* New-message callback for raw debug messages. For a DLM_OPTS message that
+ * is not DLM_VERSION_3_1 and whose h_seq is still zero, stamp the node's
+ * next send sequence number into the raw buffer. Everything else is left
+ * untouched.
+ */
+static void midcomms_new_rawmsg_cb(void *data)
+{
+	struct dlm_rawmsg_data *raw = data;
+	struct dlm_header *hdr = raw->buf;
+
+	/* 3.1 headers are never touched */
+	if (hdr->h_version == cpu_to_le32(DLM_VERSION_3_1))
+		return;
+
+	if (hdr->h_cmd != DLM_OPTS)
+		return;
+
+	/* a non-zero sequence number already in the buffer is kept */
+	if (!hdr->u.h_seq)
+		hdr->u.h_seq = cpu_to_le32(atomic_fetch_inc(&raw->node->seq_send));
+}
+
+/* Send @buflen bytes from @buf to @node as one raw lowcomms message.
+ * Returns 0 on success or -ENOMEM if no message could be allocated.
+ */
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen)
+{
+ struct dlm_rawmsg_data rd;
+ struct dlm_msg *msg;
+ char *msgbuf;
+
+ rd.node = node;
+ rd.buf = buf;
+
+ msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS,
+ &msgbuf, midcomms_new_rawmsg_cb, &rd);
+ if (!msg)
+ return -ENOMEM;
+
+ /* midcomms_new_rawmsg_cb() may have written a sequence number into
+ * @buf; it is copied along with the rest of the message here
+ */
+ memcpy(msgbuf, buf, buflen);
+ /* NOTE(review): unlike the DLM_VERSION_3_1 path in
+ * dlm_midcomms_commit_mhandle(), no dlm_lowcomms_put_msg() follows
+ * the commit - confirm the message reference is dropped elsewhere.
+ */
+ dlm_lowcomms_commit_msg(msg);
+ return 0;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 0000000000..e7246fb3ef
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+/* only forward declared here; users of this header pass pointers around */
+struct midcomms_node;
+
+/* receive path: both return the number of bytes covered by complete messages */
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len);
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
+/* send path: a successful dlm_midcomms_get_mhandle() must be followed by
+ * dlm_midcomms_commit_mhandle()
+ */
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
+ int namelen);
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+void dlm_midcomms_version_wait(void);
+int dlm_midcomms_close(int nodeid);
+int dlm_midcomms_start(void);
+void dlm_midcomms_stop(void);
+void dlm_midcomms_init(void);
+void dlm_midcomms_exit(void);
+void dlm_midcomms_shutdown(void);
+void dlm_midcomms_add_member(int nodeid);
+void dlm_midcomms_remove_member(int nodeid);
+void dlm_midcomms_unack_msg_resend(int nodeid);
+const char *dlm_midcomms_state(struct midcomms_node *node);
+unsigned long dlm_midcomms_flags(struct midcomms_node *node);
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node);
+uint32_t dlm_midcomms_version(struct midcomms_node *node);
+/* debug helper: sends a raw, caller-built message to @node */
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen);
+struct kmem_cache *dlm_midcomms_cache_create(void);
+
+#endif /* __MIDCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
new file mode 100644
index 0000000000..e6b4c1a214
--- /dev/null
+++ b/fs/dlm/plock.c
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/filelock.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+#include <linux/slab.h>
+
+#include <trace/events/dlm.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+
+/* ops_lock is held around all send_list/recv_list manipulation below */
+static DEFINE_SPINLOCK(ops_lock);
+/* ops queued by send_op(), waiting to be picked up through dev_read() */
+static LIST_HEAD(send_list);
+/* ops already read by userspace, waiting for a result through dev_write() */
+static LIST_HEAD(recv_list);
+/* woken by send_op(); dev_poll() waits on it */
+static DECLARE_WAIT_QUEUE_HEAD(send_wq);
+/* woken by dev_write(); synchronous requesters wait here for op->done */
+static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
+
+/* Extra state kept for an asynchronous lock request, see dlm_posix_lock(). */
+struct plock_async_data {
+ void *fl; /* the requester's file_lock, passed to callback */
+ void *file; /* file the lock applies to */
+ struct file_lock flc; /* private copy of the lock used for the local vfs lock */
+ int (*callback)(struct file_lock *fl, int result); /* the lock's lm_grant */
+};
+
+/* One plock request exchanged with userspace through the misc device. */
+struct plock_op {
+ struct list_head list; /* linkage on send_list or recv_list */
+ int done; /* set by dev_write() once a synchronous op has its result */
+ struct dlm_plock_info info; /* request sent to, and result copied back from, userspace */
+ /* if set indicates async handling */
+ struct plock_async_data *data;
+};
+
+/* Stamp the kernel's plock version (major, minor, patch) into @info. */
+static inline void set_version(struct dlm_plock_info *info)
+{
+ info->version[0] = DLM_PLOCK_VERSION_MAJOR;
+ info->version[1] = DLM_PLOCK_VERSION_MINOR;
+ info->version[2] = DLM_PLOCK_VERSION_PATCH;
+}
+
+/* Find the first waiting op on recv_list whose fsid, number, owner, pid,
+ * start, end and ex all match @info. Returns NULL if there is none.
+ * The callers in this file hold ops_lock around the lookup.
+ */
+static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info)
+{
+	struct plock_op *cur;
+
+	list_for_each_entry(cur, &recv_list, list) {
+		const struct dlm_plock_info *cand = &cur->info;
+
+		if (cand->fsid != info->fsid ||
+		    cand->number != info->number ||
+		    cand->owner != info->owner ||
+		    cand->pid != info->pid ||
+		    cand->start != info->start ||
+		    cand->end != info->end ||
+		    cand->ex != info->ex)
+			continue;
+
+		/* only ops submitted as waiters are candidates */
+		if (cand->wait)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Check the version userspace wrote into @info against the kernel's.
+ * The major numbers must be equal and the user minor must not be newer
+ * than the kernel minor. Returns 0 if compatible, -EINVAL otherwise.
+ */
+static int check_version(struct dlm_plock_info *info)
+{
+	if (info->version[0] == DLM_PLOCK_VERSION_MAJOR &&
+	    info->version[1] <= DLM_PLOCK_VERSION_MINOR)
+		return 0;
+
+	log_print("plock device version mismatch: "
+		  "kernel (%u.%u.%u), user (%u.%u.%u)",
+		  DLM_PLOCK_VERSION_MAJOR,
+		  DLM_PLOCK_VERSION_MINOR,
+		  DLM_PLOCK_VERSION_PATCH,
+		  info->version[0],
+		  info->version[1],
+		  info->version[2]);
+	return -EINVAL;
+}
+
+/* Free @op together with its async data; op->data may be NULL. */
+static void dlm_release_plock_op(struct plock_op *op)
+{
+ kfree(op->data);
+ kfree(op);
+}
+
+/* Stamp the version into @op, queue it on send_list and wake up readers
+ * of the plock device.
+ */
+static void send_op(struct plock_op *op)
+{
+ set_version(&op->info);
+ spin_lock(&ops_lock);
+ list_add_tail(&op->list, &send_list);
+ spin_unlock(&ops_lock);
+ wake_up(&send_wq);
+}
+
+/* Send a DLM_PLOCK_OP_CANCEL for the request described by @orig_info and
+ * wait (uninterruptibly) for the answer from userspace.
+ * Returns the rv reported by userspace, or -ENOMEM if allocation fails.
+ */
+static int do_lock_cancel(const struct dlm_plock_info *orig_info)
+{
+ struct plock_op *op;
+ int rv;
+
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op)
+ return -ENOMEM;
+
+ /* same identification as the original request, but non-waiting */
+ op->info = *orig_info;
+ op->info.optype = DLM_PLOCK_OP_CANCEL;
+ op->info.wait = 0;
+
+ send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ rv = op->info.rv;
+
+ dlm_release_plock_op(op);
+ return rv;
+}
+
+/* Request a posix lock on resource @number of @lockspace from userspace.
+ *
+ * If @fl has an lm_grant callback the request is handled asynchronously:
+ * it is queued and FILE_LOCK_DEFERRED is returned, the result is delivered
+ * later through dlm_plock_callback(). Otherwise the caller sleeps until
+ * userspace answers; a waiting (SETLKW) request can be interrupted, in
+ * which case a cancel is sent and -EINTR is returned if it succeeded.
+ *
+ * Returns the rv reported by userspace, -EINVAL if the lockspace is not
+ * found, or -ENOMEM on allocation failure.
+ */
+int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ int cmd, struct file_lock *fl)
+{
+ struct plock_async_data *op_data;
+ struct dlm_ls *ls;
+ struct plock_op *op;
+ int rv;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ op->info.optype = DLM_PLOCK_OP_LOCK;
+ op->info.pid = fl->fl_pid;
+ op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.wait = IS_SETLKW(cmd);
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
+ op->info.start = fl->fl_start;
+ op->info.end = fl->fl_end;
+ /* async handling */
+ if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ dlm_release_plock_op(op);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* fl_owner is lockd which doesn't distinguish
+ processes on the nfs client */
+ op->info.owner = (__u64) fl->fl_pid;
+ op_data->callback = fl->fl_lmops->lm_grant;
+ locks_init_lock(&op_data->flc);
+ locks_copy_lock(&op_data->flc, fl);
+ op_data->fl = fl;
+ op_data->file = file;
+
+ op->data = op_data;
+
+ /* op is not touched after this: dev_write() runs
+ * dlm_plock_callback(), which releases it
+ */
+ send_op(op);
+ rv = FILE_LOCK_DEFERRED;
+ goto out;
+ } else {
+ op->info.owner = (__u64)(long) fl->fl_owner;
+ }
+
+ send_op(op);
+
+ if (op->info.wait) {
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ spin_lock(&ops_lock);
+ /* recheck under ops_lock if we got a done != 0,
+ * if so this interrupt case should be ignored
+ */
+ if (op->done != 0) {
+ spin_unlock(&ops_lock);
+ goto do_lock_wait;
+ }
+ spin_unlock(&ops_lock);
+
+ rv = do_lock_cancel(&op->info);
+ switch (rv) {
+ case 0:
+ /* waiter was deleted in user space, answer will never come
+ * remove original request. The original request must be
+ * on recv_list because the answer of do_lock_cancel()
+ * synchronized it.
+ */
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ rv = -EINTR;
+ break;
+ case -ENOENT:
+ /* cancellation wasn't successful but op should be done */
+ fallthrough;
+ default:
+ /* internal error doing cancel we need to wait */
+ goto wait;
+ }
+
+ /* only reached when the cancel succeeded (rv is -EINTR) */
+ log_debug(ls, "%s: wait interrupted %x %llx pid %d",
+ __func__, ls->ls_global_id,
+ (unsigned long long)number, op->info.pid);
+ dlm_release_plock_op(op);
+ goto out;
+ }
+ } else {
+ /* also the target of "goto wait" from the failed-cancel path above */
+wait:
+ wait_event(recv_wq, (op->done != 0));
+ }
+
+do_lock_wait:
+
+ /* dev_write() takes the op off recv_list before setting done */
+ WARN_ON(!list_empty(&op->list));
+
+ rv = op->info.rv;
+
+ /* lock granted by userspace: record it in the local vfs as well */
+ if (!rv) {
+ if (locks_lock_file_wait(file, fl) < 0)
+ log_error(ls, "dlm_posix_lock: vfs lock error %llx",
+ (unsigned long long)number);
+ }
+
+ dlm_release_plock_op(op);
+out:
+ dlm_put_lockspace(ls);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_lock);
+
+/* Complete an asynchronous lock request once userspace has answered it:
+ * report an error result to the requester's callback, or record the lock
+ * in the local vfs and report success. @op is always released.
+ *
+ * Returns failure iff a successful lock operation should be canceled
+ */
+static int dlm_plock_callback(struct plock_op *op)
+{
+ struct plock_async_data *op_data = op->data;
+ struct file *file;
+ struct file_lock *fl;
+ struct file_lock *flc;
+ int (*notify)(struct file_lock *fl, int result) = NULL;
+ int rv = 0;
+
+ WARN_ON(!list_empty(&op->list));
+
+ /* check if the following 2 are still valid or make a copy */
+ file = op_data->file;
+ flc = &op_data->flc;
+ fl = op_data->fl;
+ notify = op_data->callback;
+
+ /* userspace reported an error: pass it on, rv stays 0 */
+ if (op->info.rv) {
+ notify(fl, op->info.rv);
+ goto out;
+ }
+
+ /* got fs lock; bookkeep locally as well: */
+ flc->fl_flags &= ~FL_SLEEP;
+ if (posix_lock_file(file, flc, NULL)) {
+ /*
+ * This can only happen in the case of kmalloc() failure.
+ * The filesystem's own lock is the authoritative lock,
+ * so a failure to get the lock locally is not a disaster.
+ * As long as the fs cannot reliably cancel locks (especially
+ * in a low-memory situation), we're better off ignoring
+ * this failure than trying to recover.
+ */
+ log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p",
+ (unsigned long long)op->info.number, file, fl);
+ }
+
+ rv = notify(fl, 0);
+ if (rv) {
+ /* XXX: We need to cancel the fs lock here: */
+ log_print("%s: lock granted after lock request failed; dangling lock!",
+ __func__);
+ goto out;
+ }
+
+out:
+ dlm_release_plock_op(op);
+ return rv;
+}
+
+/*
+ * Release a posix lock on resource @number of @lockspace.
+ *
+ * The lock is first dropped in the local vfs; if the vfs does not know the
+ * lock (-ENOENT) nothing is sent and 0 is returned. Otherwise a
+ * DLM_PLOCK_OP_UNLOCK is sent to userspace. For unlocks caused by a close
+ * (FL_CLOSE) no answer is awaited and 0 is returned right away; in every
+ * other case the caller sleeps until userspace has answered.
+ *
+ * fl->fl_flags is temporarily modified (FL_EXISTS) and restored before
+ * returning.
+ *
+ * Returns 0 on success (-ENOENT from userspace counts as success), the rv
+ * reported by userspace, -EINVAL if the lockspace is not found, or
+ * -ENOMEM on allocation failure.
+ */
+int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		     struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	int rv;
+	/* must be as wide as fl->fl_flags: a narrower copy drops the upper
+	 * flag bits when the saved value is written back at "out"
+	 */
+	unsigned int fl_flags = fl->fl_flags;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	op = kzalloc(sizeof(*op), GFP_NOFS);
+	if (!op) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	/* cause the vfs unlock to return ENOENT if lock is not found */
+	fl->fl_flags |= FL_EXISTS;
+
+	rv = locks_lock_file_wait(file, fl);
+	if (rv == -ENOENT) {
+		rv = 0;
+		goto out_free;
+	}
+	if (rv < 0) {
+		/* logged only, the unlock is still sent to userspace */
+		log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx",
+			  rv, (unsigned long long)number);
+	}
+
+	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->lm_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	if (fl->fl_flags & FL_CLOSE) {
+		/* fire and forget: dev_read() releases ops flagged with
+		 * DLM_PLOCK_FL_CLOSE, so op must not be freed here
+		 */
+		op->info.flags |= DLM_PLOCK_FL_CLOSE;
+		send_op(op);
+		rv = 0;
+		goto out;
+	}
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	WARN_ON(!list_empty(&op->list));
+
+	rv = op->info.rv;
+
+	if (rv == -ENOENT)
+		rv = 0;
+
+out_free:
+	dlm_release_plock_op(op);
+out:
+	dlm_put_lockspace(ls);
+	fl->fl_flags = fl_flags;
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_unlock);
+
+/*
+ * Cancel a pending asynchronous lock request.
+ *
+ * If the cancel succeeds in userspace, the original request is removed
+ * from recv_list, its callback is invoked with -EINTR and -EINTR is
+ * returned. If userspace answers -ENOENT, the lock is unlocked through
+ * dlm_posix_unlock() instead. Any other result is returned as is.
+ *
+ * NOTE: This implementation can only handle async lock requests the way
+ * nfs issues them. It cannot cancel a pending lock request that is
+ * sitting in wait_event(); for now nfs is the only in-kernel user.
+ */
+int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ struct file_lock *fl)
+{
+ struct dlm_plock_info info;
+ struct plock_op *op;
+ struct dlm_ls *ls;
+ int rv;
+
+ /* this only works for async request for now and nfs is the only
+ * kernel user right now.
+ */
+ if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant))
+ return -EOPNOTSUPP;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ /* rebuild the identification the async path of dlm_posix_lock() used */
+ memset(&info, 0, sizeof(info));
+ info.pid = fl->fl_pid;
+ info.ex = (fl->fl_type == F_WRLCK);
+ info.fsid = ls->ls_global_id;
+ /* ls is only needed for its global id, drop the reference early */
+ dlm_put_lockspace(ls);
+ info.number = number;
+ info.start = fl->fl_start;
+ info.end = fl->fl_end;
+ info.owner = (__u64)fl->fl_pid;
+
+ rv = do_lock_cancel(&info);
+ switch (rv) {
+ case 0:
+ spin_lock(&ops_lock);
+ /* lock request to cancel must be on recv_list because
+ * do_lock_cancel() synchronizes it.
+ */
+ op = plock_lookup_waiter(&info);
+ if (WARN_ON_ONCE(!op)) {
+ spin_unlock(&ops_lock);
+ rv = -ENOLCK;
+ break;
+ }
+
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK);
+ op->data->callback(op->data->fl, -EINTR);
+ dlm_release_plock_op(op);
+ rv = -EINTR;
+ break;
+ case -ENOENT:
+ /* if cancel wasn't successful we were probably too late
+ * or it was a non-blocking lock request, so just unlock it.
+ */
+ rv = dlm_posix_unlock(lockspace, number, file, fl);
+ break;
+ default:
+ break;
+ }
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_cancel);
+
+/* Ask userspace whether a lock conflicting with @fl exists on resource
+ * @number of @lockspace.
+ *
+ * On a conflict @fl is rewritten to describe the conflicting lock; when
+ * there is none fl->fl_type is set to F_UNLCK. Both cases return 0.
+ * Returns a negative rv from userspace (other than -ENOENT), -EINVAL if
+ * the lockspace is not found, or -ENOMEM on allocation failure.
+ */
+int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ struct file_lock *fl)
+{
+ struct dlm_ls *ls;
+ struct plock_op *op;
+ int rv;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ op->info.optype = DLM_PLOCK_OP_GET;
+ op->info.pid = fl->fl_pid;
+ op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
+ op->info.start = fl->fl_start;
+ op->info.end = fl->fl_end;
+ if (fl->fl_lmops && fl->fl_lmops->lm_grant)
+ op->info.owner = (__u64) fl->fl_pid;
+ else
+ op->info.owner = (__u64)(long) fl->fl_owner;
+
+ send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ WARN_ON(!list_empty(&op->list));
+
+ /* info.rv from userspace is 1 for conflict, 0 for no-conflict,
+ -ENOENT if there are no locks on the file */
+
+ rv = op->info.rv;
+
+ fl->fl_type = F_UNLCK;
+ if (rv == -ENOENT)
+ rv = 0;
+ else if (rv > 0) {
+ locks_init_lock(fl);
+ fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+ fl->fl_flags = FL_POSIX;
+ fl->fl_pid = op->info.pid;
+ /* a holder on another node is reported with a negated pid */
+ if (op->info.nodeid != dlm_our_nodeid())
+ fl->fl_pid = -fl->fl_pid;
+ fl->fl_start = op->info.start;
+ fl->fl_end = op->info.end;
+ rv = 0;
+ }
+
+ dlm_release_plock_op(op);
+out:
+ dlm_put_lockspace(ls);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_get);
+
+/*
+ * a read copies out one plock request from the send list
+ *
+ * The request at the head of send_list is copied to userspace. Requests
+ * flagged DLM_PLOCK_FL_CLOSE need no answer and are released here; all
+ * others move to recv_list until dev_write() delivers their result.
+ *
+ * Returns sizeof(struct dlm_plock_info) on success, -EINVAL if @count is
+ * too small, -EAGAIN if nothing is queued, or -EFAULT if the copy to
+ * userspace fails.
+ */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+			loff_t *ppos)
+{
+	struct dlm_plock_info info;
+	struct plock_op *op = NULL;
+
+	if (count < sizeof(info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list)) {
+		op = list_first_entry(&send_list, struct plock_op, list);
+		if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+			list_del(&op->list);
+		else
+			list_move_tail(&op->list, &recv_list);
+		memcpy(&info, &op->info, sizeof(info));
+	}
+	spin_unlock(&ops_lock);
+
+	if (!op)
+		return -EAGAIN;
+
+	trace_dlm_plock_read(&info);
+
+	/* there is no need to get a reply from userspace for unlocks
+	   that were generated by the vfs cleaning up for a close
+	   (the process did not make an unlock call). */
+
+	/* Test the snapshot taken under ops_lock rather than op->info: an
+	 * op that went to recv_list may already have been completed by
+	 * dev_write() and freed by its waiter, so op must not be
+	 * dereferenced here unless it is a CLOSE op that only we own.
+	 */
+	if (info.flags & DLM_PLOCK_FL_CLOSE)
+		dlm_release_plock_op(op);
+
+	if (copy_to_user(u, &info, sizeof(info)))
+		return -EFAULT;
+	return sizeof(info);
+}
+
+/* a write copies in one plock result that should match a plock_op
+ on the recv list
+
+ The matched op is taken off recv_list and receives the result. An async
+ op is completed through dlm_plock_callback(); a synchronous op is marked
+ done and its waiter is woken. A result that matches no op is only logged.
+
+ Returns @count on success (also when no op matched), -EINVAL for a wrong
+ size or version, -EFAULT if the copy from userspace fails. */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+ loff_t *ppos)
+{
+ struct plock_op *op = NULL, *iter;
+ struct dlm_plock_info info;
+ int do_callback = 0;
+
+ if (count != sizeof(info))
+ return -EINVAL;
+
+ if (copy_from_user(&info, u, sizeof(info)))
+ return -EFAULT;
+
+ trace_dlm_plock_write(&info);
+
+ if (check_version(&info))
+ return -EINVAL;
+
+ /*
+ * The results for waiting ops (SETLKW) can be returned in any
+ * order, so match all fields to find the op. The results for
+ * non-waiting ops are returned in the order that they were sent
+ * to userspace, so match the result with the first non-waiting op.
+ */
+ spin_lock(&ops_lock);
+ if (info.wait) {
+ op = plock_lookup_waiter(&info);
+ } else {
+ list_for_each_entry(iter, &recv_list, list) {
+ if (!iter->info.wait &&
+ iter->info.fsid == info.fsid) {
+ op = iter;
+ break;
+ }
+ }
+ }
+
+ if (op) {
+ /* Sanity check that op and info match. */
+ if (info.wait)
+ WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK);
+ else
+ WARN_ON(op->info.number != info.number ||
+ op->info.owner != info.owner ||
+ op->info.optype != info.optype);
+
+ list_del_init(&op->list);
+ memcpy(&op->info, &info, sizeof(info));
+ /* done is set under ops_lock; dlm_posix_lock() rechecks it there */
+ if (op->data)
+ do_callback = 1;
+ else
+ op->done = 1;
+ }
+ spin_unlock(&ops_lock);
+
+ if (op) {
+ /* a synchronous op is not dereferenced past this point, its
+ * waiter frees it once it has seen done
+ */
+ if (do_callback)
+ dlm_plock_callback(op);
+ else
+ wake_up(&recv_wq);
+ } else
+ pr_debug("%s: no op %x %llx", __func__,
+ info.fsid, (unsigned long long)info.number);
+ return count;
+}
+
+/* Poll handler: the device is readable whenever send_list is not empty. */
+static __poll_t dev_poll(struct file *file, poll_table *wait)
+{
+ __poll_t mask = 0;
+
+ poll_wait(file, &send_wq, wait);
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&send_list))
+ mask = EPOLLIN | EPOLLRDNORM;
+ spin_unlock(&ops_lock);
+
+ return mask;
+}
+
+/* file operations of the plock misc device */
+static const struct file_operations dev_fops = {
+ .read = dev_read,
+ .write = dev_write,
+ .poll = dev_poll,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+/* misc device registered by dlm_plock_init(); the minor is assigned dynamically */
+static struct miscdevice plock_dev_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = DLM_PLOCK_MISC_NAME,
+ .fops = &dev_fops
+};
+
+/* Register the plock misc device. Returns the result of misc_register(),
+ * a failure is logged as well.
+ */
+int dlm_plock_init(void)
+{
+ int rv;
+
+ rv = misc_register(&plock_dev_misc);
+ if (rv)
+ log_print("dlm_plock_init: misc_register failed %d", rv);
+ return rv;
+}
+
+/* Deregister the plock misc device; warns if requests are still queued. */
+void dlm_plock_exit(void)
+{
+ misc_deregister(&plock_dev_misc);
+ WARN_ON(!list_empty(&send_list));
+ WARN_ON(!list_empty(&recv_list));
+}
+
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 0000000000..3b734aed26
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+/*
+ * Test function handed to dlm_wait_function() by dlm_rcom_status() and
+ * dlm_rcom_names(): non-zero once LSFL_RCOM_READY is set in ls->ls_flags,
+ * which receive_sync_reply() does after copying a reply into ls_recover_buf.
+ */
+static int rcom_response(struct dlm_ls *ls)
+{
+ return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+/*
+ * Initialise the dlm_rcom that starts at the message buffer mb and return it
+ * through rc_ret.
+ *
+ * The header gets the DLM version, the lockspace global id, our nodeid,
+ * mb_len as the message length and DLM_RCOM as the command; rc_type and
+ * rc_seq are taken from type and seq. All multi-byte fields are stored
+ * little-endian. to_nodeid and len are accepted but not used here.
+ */
+static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, char *mb, int mb_len,
+ uint64_t seq)
+{
+ struct dlm_rcom *rc;
+
+ rc = (struct dlm_rcom *) mb;
+
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
+ rc->rc_header.h_cmd = DLM_RCOM;
+
+ rc->rc_type = cpu_to_le32(type);
+ rc->rc_seq = cpu_to_le64(seq);
+
+ *rc_ret = rc;
+}
+/*
+ * Get a midcomms message handle for an rcom with len payload bytes addressed
+ * to to_nodeid and fill in its header with _create_rcom().
+ *
+ * Returns 0 with *rc_ret and *mh_ret set, or -ENOBUFS (after logging) when
+ * dlm_midcomms_get_mhandle() returns NULL. The message is sent later with
+ * send_rcom().
+ */
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret,
+ uint64_t seq)
+{
+ int mb_len = sizeof(struct dlm_rcom) + len; /* header plus payload */
+ struct dlm_mhandle *mh;
+ char *mb;
+
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ if (!mh) {
+ log_print("%s to %d type %d len %d ENOBUFS",
+ __func__, to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq);
+ *mh_ret = mh;
+ return 0;
+}
+
+/*
+ * Allocate a lowcomms message for an rcom carrying len payload bytes to
+ * to_nodeid and fill in its header via _create_rcom().
+ *
+ * On success returns 0 and hands back the rcom through rc_ret and the
+ * message through msg_ret; the caller sends it with send_rcom_stateless().
+ * Returns -ENOBUFS if dlm_lowcomms_new_msg() cannot provide a message.
+ */
+static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
+				 int len, struct dlm_rcom **rc_ret,
+				 struct dlm_msg **msg_ret, uint64_t seq)
+{
+	int mb_len = sizeof(struct dlm_rcom) + len;
+	struct dlm_msg *msg;
+	char *mb;
+
+	msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb,
+				   NULL, NULL);
+	if (!msg) {
+		/* name this function, as create_rcom() does, so the two
+		 * allocation failures can be told apart in the log
+		 */
+		log_print("%s to %d type %d len %d ENOBUFS",
+			  __func__, to_nodeid, type, len);
+		return -ENOBUFS;
+	}
+
+	_create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq);
+	*msg_ret = msg;
+	return 0;
+}
+/*
+ * Send a message prepared by create_rcom() by committing its midcomms
+ * handle. rc is not used.
+ */
+static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
+{
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
+}
+/*
+ * Send a message prepared by create_rcom_stateless(): commit it to lowcomms
+ * and then call dlm_lowcomms_put_msg() on it. rc is not used.
+ */
+static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
+{
+ dlm_lowcomms_commit_msg(msg);
+ dlm_lowcomms_put_msg(msg);
+}
+/*
+ * Store the status request flags (e.g. DLM_RSF_NEED_SLOTS) little-endian in
+ * rs->rs_flags. ls is not used.
+ */
+static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
+ uint32_t flags)
+{
+ rs->rs_flags = cpu_to_le32(flags);
+}
+
+/* When replying to a status request, a node also sends back its
+ configuration values. The requesting node then checks that the remote
+ node is configured the same way as itself.
+
+ set_rcom_config() fills rf from the lockspace: lvblen, exflags, our slot
+ and generation come from ls, num_slots is supplied by the caller. All
+ fields are stored little-endian. check_rcom_config() is the receiving
+ side's comparison of lvblen and exflags. */
+
+static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
+ uint32_t num_slots)
+{
+ rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
+ rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
+
+ rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
+ rf->rf_num_slots = cpu_to_le16(num_slots);
+ rf->rf_generation = cpu_to_le32(ls->ls_generation);
+}
+
+/*
+ * Validate a status reply from nodeid against our own lockspace settings.
+ *
+ * The major part of the header version must equal DLM_HEADER_MAJOR, and the
+ * rcom_config in rc->rc_buf must carry the same lvblen and exflags as ls.
+ * Returns 0 when everything matches, -EPROTO (after logging) otherwise.
+ */
+static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+{
+	struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
+	uint32_t remote_version = le32_to_cpu(rc->rc_header.h_version);
+	uint32_t remote_lvblen, remote_lsflags;
+
+	if ((remote_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+		log_error(ls, "version mismatch: %x nodeid %d: %x",
+			  DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
+			  remote_version);
+		return -EPROTO;
+	}
+
+	remote_lvblen = le32_to_cpu(rf->rf_lvblen);
+	remote_lsflags = le32_to_cpu(rf->rf_lsflags);
+
+	if (remote_lvblen == ls->ls_lvblen && remote_lsflags == ls->ls_exflags)
+		return 0;
+
+	log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+		  ls->ls_lvblen, ls->ls_exflags, nodeid,
+		  remote_lvblen, remote_lsflags);
+	return -EPROTO;
+}
+/*
+ * Arm the lockspace for one synchronous reply. Under ls_rcom_spin, bump
+ * ls_rcom_seq, store the new value little-endian in *new_seq (callers pass
+ * &rc->rc_id of the outgoing request) and set LSFL_RCOM_WAIT.
+ * receive_sync_reply() only accepts a reply whose rc_id equals ls_rcom_seq
+ * while LSFL_RCOM_WAIT is set.
+ */
+static void allow_sync_reply(struct dlm_ls *ls, __le64 *new_seq)
+{
+ spin_lock(&ls->ls_rcom_spin);
+ *new_seq = cpu_to_le64(++ls->ls_rcom_seq);
+ set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+ spin_unlock(&ls->ls_rcom_spin);
+}
+/*
+ * End a synchronous request: clear both LSFL_RCOM_WAIT and LSFL_RCOM_READY
+ * under ls_rcom_spin, so later replies are rejected by receive_sync_reply()
+ * and rcom_response() reads false again.
+ */
+static void disallow_sync_reply(struct dlm_ls *ls)
+{
+ spin_lock(&ls->ls_rcom_spin);
+ clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+ clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ spin_unlock(&ls->ls_rcom_spin);
+}
+
+/*
+ * low nodeid gathers one slot value at a time from each node.
+ * it sets need_slots=0, and saves rf_our_slot returned from each
+ * rcom_config.
+ *
+ * other nodes gather all slot values at once from the low nodeid.
+ * they set need_slots=1, and ignore the rf_our_slot returned from each
+ * rcom_config. they use the rf_num_slots returned from the low
+ * node's rcom_config.
+ */
+
+/*
+ * Fetch the recovery status of nodeid into ls->ls_recover_buf.
+ *
+ * For our own nodeid the local status is written to rc_result directly.
+ * Otherwise a DLM_RCOM_STATUS request carrying status_flags is sent and the
+ * reply awaited; the request is re-sent whenever the wait times out. A reply
+ * of -ESRCH is rewritten to status 0, any other reply is validated with
+ * check_rcom_config().
+ *
+ * Returns 0 on success (the caller then reads rc_result from
+ * ls->ls_recover_buf) or a negative error.
+ */
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags,
+		    uint64_t seq)
+{
+	struct dlm_rcom *rc;
+	struct dlm_msg *msg;
+	int error;
+
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		rc = ls->ls_recover_buf;
+		rc->rc_result = cpu_to_le32(dlm_recover_status(ls));
+		return 0;
+	}
+
+	do {
+		error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS,
+					      sizeof(struct rcom_status),
+					      &rc, &msg, seq);
+		if (error)
+			return error;
+
+		set_rcom_status(ls, (struct rcom_status *)rc->rc_buf,
+				status_flags);
+
+		allow_sync_reply(ls, &rc->rc_id);
+		memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
+
+		send_rcom_stateless(msg, rc);
+
+		error = dlm_wait_function(ls, &rcom_response);
+		disallow_sync_reply(ls);
+	} while (error == -ETIMEDOUT);
+
+	if (error)
+		return error;
+
+	/* the caller looks at rc_result for the remote recovery status */
+	rc = ls->ls_recover_buf;
+
+	if (rc->rc_result != cpu_to_le32(-ESRCH))
+		return check_rcom_config(ls, rc, nodeid);
+
+	/* we pretend the remote lockspace exists with 0 status */
+	log_debug(ls, "remote node %d not ready", nodeid);
+	rc->rc_result = 0;
+	return 0;
+}
+/*
+ * Answer a DLM_RCOM_STATUS request with a DLM_RCOM_STATUS_REPLY carrying our
+ * recover status in rc_result and our rcom_config in rc_buf.
+ *
+ * Slot data is only added when the sender's header passes
+ * dlm_slots_version() and the request has DLM_RSF_NEED_SLOTS set. In that
+ * case the reply is sized for ls_num_slots entries; if ls_num_slots has
+ * changed by the time the reply is filled in, a reply with result 0 and
+ * zero slots is sent instead. Nothing is sent if the reply message cannot
+ * be allocated.
+ */
+static void receive_rcom_status(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in,
+ uint64_t seq)
+{
+ struct dlm_rcom *rc;
+ struct rcom_status *rs;
+ uint32_t status;
+ int nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ int len = sizeof(struct rcom_config);
+ struct dlm_msg *msg;
+ int num_slots = 0;
+ int error;
+
+ if (!dlm_slots_version(&rc_in->rc_header)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ rs = (struct rcom_status *)rc_in->rc_buf;
+
+ if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ spin_lock(&ls->ls_recover_lock); /* read status and slot count together */
+ status = ls->ls_recover_status;
+ num_slots = ls->ls_num_slots;
+ spin_unlock(&ls->ls_recover_lock);
+ len += num_slots * sizeof(struct rcom_slot);
+
+ do_create:
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+ len, &rc, &msg, seq);
+ if (error)
+ return;
+
+ rc->rc_id = rc_in->rc_id; /* echoed so the requester can match the reply */
+ rc->rc_seq_reply = rc_in->rc_seq;
+ rc->rc_result = cpu_to_le32(status);
+
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
+
+ if (!num_slots)
+ goto do_send;
+
+ spin_lock(&ls->ls_recover_lock);
+ if (ls->ls_num_slots != num_slots) { /* changed since the reply was sized */
+ spin_unlock(&ls->ls_recover_lock);
+ log_debug(ls, "receive_rcom_status num_slots %d to %d",
+ num_slots, ls->ls_num_slots);
+ rc->rc_result = 0;
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
+ goto do_send;
+ }
+
+ dlm_slots_copy_out(ls, rc); /* called with ls_recover_lock held */
+ spin_unlock(&ls->ls_recover_lock);
+
+ do_send:
+ send_rcom_stateless(msg, rc);
+}
+/*
+ * Deliver a STATUS_REPLY or NAMES_REPLY to the thread waiting in
+ * dlm_wait_function().
+ *
+ * Under ls_rcom_spin the reply is accepted only while LSFL_RCOM_WAIT is set
+ * and its rc_id equals ls_rcom_seq (the value handed out by
+ * allow_sync_reply()); otherwise it is logged and dropped. An accepted reply
+ * is copied into ls_recover_buf, LSFL_RCOM_READY is set, LSFL_RCOM_WAIT is
+ * cleared and ls_wait_general is woken.
+ *
+ * NOTE(review): the copy length is taken from the message header; this
+ * assumes h_length was already checked against the size of ls_recover_buf
+ * before the message got here - TODO confirm.
+ */
+static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in)
+{
+ spin_lock(&ls->ls_rcom_spin);
+ if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
+ le64_to_cpu(rc_in->rc_id) != ls->ls_rcom_seq) {
+ log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
+ le32_to_cpu(rc_in->rc_type),
+ le32_to_cpu(rc_in->rc_header.h_nodeid),
+ (unsigned long long)le64_to_cpu(rc_in->rc_id),
+ (unsigned long long)ls->ls_rcom_seq);
+ goto out;
+ }
+ memcpy(ls->ls_recover_buf, rc_in,
+ le16_to_cpu(rc_in->rc_header.h_length));
+ set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+ wake_up(&ls->ls_wait_general);
+ out:
+ spin_unlock(&ls->ls_rcom_spin);
+}
+
+/*
+ * Send a DLM_RCOM_NAMES request to nodeid with last_name (last_len bytes) as
+ * payload and wait for the reply, which receive_sync_reply() places in
+ * ls->ls_recover_buf. The request is re-sent whenever the wait times out.
+ *
+ * Returns 0 on success, the create_rcom() error, or the dlm_wait_function()
+ * error other than -ETIMEDOUT.
+ */
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,
+		   int last_len, uint64_t seq)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_rcom *rc;
+	int error;
+
+	ls->ls_recover_nodeid = nodeid;
+
+	do {
+		error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len,
+				    &rc, &mh, seq);
+		if (error)
+			return error;
+
+		memcpy(rc->rc_buf, last_name, last_len);
+
+		allow_sync_reply(ls, &rc->rc_id);
+		memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
+
+		send_rcom(mh, rc);
+
+		error = dlm_wait_function(ls, &rcom_response);
+		disallow_sync_reply(ls);
+	} while (error == -ETIMEDOUT);
+
+	return error;
+}
+/*
+ * Answer a DLM_RCOM_NAMES request with a DLM_RCOM_NAMES_REPLY.
+ *
+ * inlen is the request payload length (message length minus the dlm_rcom
+ * header); outlen is the reply payload space, sized so the whole reply is
+ * DLM_MAX_APP_BUFSIZE bytes. dlm_copy_master_names() fills the reply
+ * payload from the request payload. Nothing is sent if the reply cannot be
+ * allocated.
+ */
+static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in,
+ uint64_t seq)
+{
+ struct dlm_mhandle *mh;
+ struct dlm_rcom *rc;
+ int error, inlen, outlen, nodeid;
+
+ nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ inlen = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
+ outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
+ &rc, &mh, seq);
+ if (error)
+ return;
+ rc->rc_id = rc_in->rc_id; /* echoed for receive_sync_reply() matching */
+ rc->rc_seq_reply = rc_in->rc_seq;
+
+ dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+ nodeid);
+ send_rcom(mh, rc);
+}
+
+/*
+ * Send a DLM_RCOM_LOOKUP for resource r to dir_nodeid. The payload is the
+ * resource name and rc_id carries r->res_id. No reply is awaited here.
+ *
+ * Returns 0 once the message has been handed to send_rcom(), or the
+ * create_rcom() error.
+ */
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_mhandle *mh;
+	struct dlm_rcom *rc;
+	int error;
+
+	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+			    &rc, &mh, seq);
+	if (error)
+		return error;
+
+	memcpy(rc->rc_buf, r->res_name, r->res_length);
+	rc->rc_id = cpu_to_le64(r->res_id);
+
+	send_rcom(mh, rc);
+	return 0;
+}
+/*
+ * Answer a DLM_RCOM_LOOKUP with a DLM_RCOM_LOOKUP_REPLY.
+ *
+ * The request payload (len bytes after the dlm_rcom header) is the resource
+ * name. It is passed to dlm_master_lookup() with DLM_LU_RECOVER_MASTER, and
+ * rc_result of the reply is the nodeid it returns, or its error code if the
+ * lookup failed. rc_id and rc_seq are echoed back. A request whose rc_id is
+ * 0xFFFFFFFF only dumps the named rsb and gets no reply.
+ */
+static void receive_rcom_lookup(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in, uint64_t seq)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, ret_nodeid, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ int len = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
+
+ /* Old code would send this special id to trigger a debug dump. */
+ if (rc_in->rc_id == cpu_to_le64(0xFFFFFFFF)) {
+ log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
+ dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
+ return;
+ }
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh,
+ seq);
+ if (error)
+ return;
+
+ error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len,
+ DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
+ if (error)
+ ret_nodeid = error; /* the reply carries the error in place of a nodeid */
+ rc->rc_result = cpu_to_le32(ret_nodeid);
+ rc->rc_id = rc_in->rc_id;
+ rc->rc_seq_reply = rc_in->rc_seq;
+
+ send_rcom(mh, rc);
+}
+/*
+ * Handle a DLM_RCOM_LOOKUP_REPLY by passing it to
+ * dlm_recover_master_reply().
+ */
+static void receive_rcom_lookup_reply(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in)
+{
+ dlm_recover_master_reply(ls, rc_in);
+}
+/*
+ * Fill rl with the state of lkb on resource r for a DLM_RCOM_LOCK message.
+ *
+ * rl is zeroed first. The 16- and 32-bit fields are stored little-endian;
+ * rqmode, grmode and status are copied as they are. rl_asts records which
+ * of the bast/cast callbacks the lkb has. The resource name is copied into
+ * rl_name and, when the lkb has an lvb, ls_lvblen bytes of it into rl_lvb
+ * (dlm_send_rcom_lock() adds that space to the message length).
+ */
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct rcom_lock *rl)
+{
+ memset(rl, 0, sizeof(*rl));
+
+ rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid);
+ rl->rl_lkid = cpu_to_le32(lkb->lkb_id);
+ rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags);
+ rl->rl_flags = cpu_to_le32(dlm_dflags_val(lkb));
+ rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
+ rl->rl_rqmode = lkb->lkb_rqmode;
+ rl->rl_grmode = lkb->lkb_grmode;
+ rl->rl_status = lkb->lkb_status;
+ rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
+
+ if (lkb->lkb_bastfn)
+ rl->rl_asts |= DLM_CB_BAST;
+ if (lkb->lkb_astfn)
+ rl->rl_asts |= DLM_CB_CAST;
+
+ rl->rl_namelen = cpu_to_le16(r->res_length);
+ memcpy(rl->rl_name, r->res_name, r->res_length);
+
+ /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+ If so, receive_rcom_lock_args() won't take this copy. */
+
+ if (lkb->lkb_lvbptr)
+ memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+/*
+ * Send lkb, held on resource r, to r->res_nodeid as a DLM_RCOM_LOCK message.
+ * The payload is an rcom_lock built by pack_rcom_lock(), with room for the
+ * lvb when the lkb has one. rc_id carries the address of r.
+ *
+ * Returns 0 once the message has been handed to send_rcom(), or the
+ * create_rcom() error.
+ */
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int lvb_space = lkb->lkb_lvbptr ? ls->ls_lvblen : 0;
+	int len = sizeof(struct rcom_lock);
+	struct dlm_mhandle *mh;
+	struct dlm_rcom *rc;
+	int error;
+
+	len += lvb_space;
+
+	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh,
+			    seq);
+	if (error)
+		return error;
+
+	pack_rcom_lock(r, lkb, (struct rcom_lock *) rc->rc_buf);
+	rc->rc_id = cpu_to_le64((uintptr_t)r);
+
+	send_rcom(mh, rc);
+	return 0;
+}
+
+/* needs at least dlm_rcom + rcom_lock (dlm_receive_rcom() checks the
+ length before calling this).
+
+ Handle a DLM_RCOM_LOCK: dlm_recover_master_copy() processes the lock and
+ yields rl_remid and rl_result. The DLM_RCOM_LOCK_REPLY is a copy of the
+ received rcom_lock with those two fields filled in, and rc_id/rc_seq
+ echoed back. The reply is skipped if it cannot be allocated. */
+static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in,
+ uint64_t seq)
+{
+ __le32 rl_remid, rl_result;
+ struct rcom_lock *rl;
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+
+ dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result);
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
+ sizeof(struct rcom_lock), &rc, &mh, seq);
+ if (error)
+ return;
+
+ memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+ rl = (struct rcom_lock *)rc->rc_buf;
+ /* set rl_remid and rl_result from dlm_recover_master_copy() */
+ rl->rl_remid = rl_remid;
+ rl->rl_result = rl_result;
+
+ rc->rc_id = rc_in->rc_id;
+ rc->rc_seq_reply = rc_in->rc_seq;
+
+ send_rcom(mh, rc);
+}
+
+/* If the lockspace doesn't exist then still send a status message
+ back; it's possible that it just doesn't have its global_id yet.
+
+ The reply is a DLM_RCOM_STATUS_REPLY built by hand (there is no ls to
+ pass to create_rcom()): the lockspace id, rc_id and rc_seq are echoed
+ from rc_in, rc_result is -ESRCH and rf_lvblen is ~0U. dlm_rcom_status()
+ on the requesting node turns a -ESRCH result into status 0.
+
+ Returns 0, or -ENOBUFS if no message handle could be obtained. */
+
+int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct rcom_config *rf;
+ struct dlm_mhandle *mh;
+ char *mb;
+ int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
+
+ mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb);
+ if (!mh)
+ return -ENOBUFS;
+
+ rc = (struct dlm_rcom *) mb;
+
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace;
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
+ rc->rc_header.h_cmd = DLM_RCOM;
+
+ rc->rc_type = cpu_to_le32(DLM_RCOM_STATUS_REPLY);
+ rc->rc_id = rc_in->rc_id;
+ rc->rc_seq_reply = rc_in->rc_seq;
+ rc->rc_result = cpu_to_le32(-ESRCH);
+
+ rf = (struct rcom_config *) rc->rc_buf;
+ rf->rf_lvblen = cpu_to_le32(~0U);
+
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * Ignore messages for stage Y before we set
+ * recover_status bit for stage X:
+ *
+ * recover_status = 0
+ *
+ * dlm_recover_members()
+ * - send nothing
+ * - recv nothing
+ * - ignore NAMES, NAMES_REPLY
+ * - ignore LOOKUP, LOOKUP_REPLY
+ * - ignore LOCK, LOCK_REPLY
+ *
+ * recover_status |= NODES
+ *
+ * dlm_recover_members_wait()
+ *
+ * dlm_recover_directory()
+ * - send NAMES
+ * - recv NAMES_REPLY
+ * - ignore LOOKUP, LOOKUP_REPLY
+ * - ignore LOCK, LOCK_REPLY
+ *
+ * recover_status |= DIR
+ *
+ * dlm_recover_directory_wait()
+ *
+ * dlm_recover_masters()
+ * - send LOOKUP
+ * - recv LOOKUP_REPLY
+ *
+ * dlm_recover_locks()
+ * - send LOCK
+ * - recv LOCK_REPLY
+ *
+ * recover_status |= LOCKS
+ *
+ * dlm_recover_locks_wait()
+ *
+ * recover_status |= DONE
+ */
+
+/* Called by dlm_recv; corresponds to dlm_receive_message() but special
+ recovery-only comms are sent through here.
+
+ The message type is first classified (reply / names / lookup / lock).
+ Recover status, the stopped state and the recover seq are then read
+ together under ls_recover_lock, and the message is ignored (with a
+ rate-limited log line) when:
+ - recovery is stopped and the message is not DLM_RCOM_STATUS,
+ - it is a reply whose rc_seq_reply differs from the current recover seq,
+ - DLM_RS_NODES is not set yet and it is a names, lookup or lock message,
+ - DLM_RS_DIR is not set yet and it is a lookup or lock message.
+ Anything else is dispatched by type. LOCK and LOCK_REPLY are rejected as
+ too short unless they hold at least a dlm_rcom plus an rcom_lock. */
+
+void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid)
+{
+ int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
+ int stop, reply = 0, names = 0, lookup = 0, lock = 0;
+ uint32_t status;
+ uint64_t seq;
+
+ switch (rc->rc_type) { /* DLM_RCOM_STATUS sets none of the flags */
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
+ reply = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_NAMES):
+ names = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
+ names = 1;
+ reply = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
+ lookup = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
+ lookup = 1;
+ reply = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_LOCK):
+ lock = 1;
+ break;
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
+ lock = 1;
+ reply = 1;
+ break;
+ }
+
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ stop = dlm_recovery_stopped(ls);
+ seq = ls->ls_recover_seq;
+ spin_unlock(&ls->ls_recover_lock);
+
+ if (stop && (rc->rc_type != cpu_to_le32(DLM_RCOM_STATUS)))
+ goto ignore;
+
+ if (reply && (le64_to_cpu(rc->rc_seq_reply) != seq))
+ goto ignore;
+
+ if (!(status & DLM_RS_NODES) && (names || lookup || lock))
+ goto ignore;
+
+ if (!(status & DLM_RS_DIR) && (lookup || lock))
+ goto ignore;
+
+ switch (rc->rc_type) {
+ case cpu_to_le32(DLM_RCOM_STATUS):
+ receive_rcom_status(ls, rc, seq);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_NAMES):
+ receive_rcom_names(ls, rc, seq);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
+ receive_rcom_lookup(ls, rc, seq);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_LOCK):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
+ goto Eshort;
+ receive_rcom_lock(ls, rc, seq);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
+ receive_sync_reply(ls, rc);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
+ receive_sync_reply(ls, rc);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
+ receive_rcom_lookup_reply(ls, rc);
+ break;
+
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
+ goto Eshort;
+ dlm_recover_process_copy(ls, rc, seq);
+ break;
+
+ default:
+ log_error(ls, "receive_rcom bad type %d",
+ le32_to_cpu(rc->rc_type));
+ }
+ return;
+
+ignore:
+ log_limit(ls, "dlm_receive_rcom ignore msg %d "
+ "from %d %llu %llu recover seq %llu sts %x gen %u",
+ le32_to_cpu(rc->rc_type),
+ nodeid,
+ (unsigned long long)le64_to_cpu(rc->rc_seq),
+ (unsigned long long)le64_to_cpu(rc->rc_seq_reply),
+ (unsigned long long)seq,
+ status, ls->ls_generation);
+ return;
+Eshort:
+ log_error(ls, "recovery message %d from %d is too short",
+ le32_to_cpu(rc->rc_type), nodeid);
+}
+
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 0000000000..765926ae00
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags,
+ uint64_t seq);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,
+ int last_len, uint64_t seq);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq);
+void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ int nodeid);
+int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in);
+
+#endif
+
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 0000000000..53917c0aa3
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,958 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status. They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timeout. This uses a timeout so it can check periodically if the wait
+ * should abort due to node failure (which doesn't cause a wake_up).
+ * This should only be called by the dlm_recoverd thread.
+ *
+ * Each wait lasts dlm_config.ci_recover_timer seconds. When one expires
+ * with the condition still false, -ETIMEDOUT is returned if LSFL_RCOM_WAIT
+ * is set (a synchronous rcom reply is outstanding); otherwise the wait is
+ * simply restarted.
+ *
+ * Returns 0 when the wait ended and recovery is not stopped, -EINTR when
+ * recovery is stopped, or -ETIMEDOUT as described above.
+ */
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+ int error = 0;
+ int rv;
+
+ while (1) {
+ rv = wait_event_timeout(ls->ls_wait_general,
+ testfn(ls) || dlm_recovery_stopped(ls),
+ dlm_config.ci_recover_timer * HZ);
+ if (rv) /* condition became true */
+ break;
+ if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) {
+ log_debug(ls, "dlm_wait_function timed out");
+ return -ETIMEDOUT;
+ }
+ }
+
+ if (dlm_recovery_stopped(ls)) {
+ log_debug(ls, "dlm_wait_function aborted");
+ error = -EINTR;
+ }
+ return error;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status. The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low). When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+/*
+ * Return a snapshot of ls->ls_recover_status, read under ls_recover_lock.
+ */
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+ uint32_t status;
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ spin_unlock(&ls->ls_recover_lock);
+ return status;
+}
+/*
+ * OR status into ls->ls_recover_status without taking any lock; the callers
+ * in this file hold ls_recover_lock around it.
+ */
+static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+ ls->ls_recover_status |= status;
+}
+/*
+ * Set the given bits in ls->ls_recover_status under ls_recover_lock.
+ * Bits are only ever added here, never cleared.
+ */
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+ spin_lock(&ls->ls_recover_lock);
+ _set_recover_status(ls, status);
+ spin_unlock(&ls->ls_recover_lock);
+}
+
+/*
+ * Poll every member of ls->ls_nodes, one after another, until each has
+ * reported a status containing any bit of wait_status.
+ *
+ * Between polls of the same node the sleep grows by 20ms per attempt up to
+ * 1000ms. When save_slots is set, every reply is also passed to
+ * dlm_slot_save() for that member.
+ *
+ * Returns 0 when all members reached the status, -EINTR if recovery was
+ * stopped, or the error from dlm_rcom_status().
+ */
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
+			   int save_slots, uint64_t seq)
+{
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	struct dlm_member *memb;
+	int error;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		int delay = 0;
+
+		while (1) {
+			if (dlm_recovery_stopped(ls))
+				return -EINTR;
+
+			error = dlm_rcom_status(ls, memb->nodeid, 0, seq);
+			if (error)
+				return error;
+
+			if (save_slots)
+				dlm_slot_save(ls, rc, memb);
+
+			/* this member is done, move on to the next one */
+			if (le32_to_cpu(rc->rc_result) & wait_status)
+				break;
+
+			if (delay < 1000)
+				delay += 20;
+			msleep(delay);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Poll the node with the low nodeid (ls->ls_low_nodeid) until it reports a
+ * status containing any bit of wait_status. status_flags is sent along with
+ * each status request. The sleep between polls grows by 20ms per attempt up
+ * to 1000ms.
+ *
+ * Returns 0 once the status was seen, -EINTR if recovery was stopped, or
+ * the error from dlm_rcom_status().
+ */
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
+			   uint32_t status_flags, uint64_t seq)
+{
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	int low_nodeid = ls->ls_low_nodeid;
+	int delay = 0;
+	int error;
+
+	while (!dlm_recovery_stopped(ls)) {
+		error = dlm_rcom_status(ls, low_nodeid, status_flags, seq);
+
+		/* stop on failure, or (with error == 0) on the wanted status */
+		if (error || (le32_to_cpu(rc->rc_result) & wait_status))
+			return error;
+
+		if (delay < 1000)
+			delay += 20;
+		msleep(delay);
+	}
+
+	return -EINTR;
+}
+/*
+ * Wait for all nodes to reach recovery status flag 'status'.
+ *
+ * The matching "all nodes" flag is computed as status << 1. The node with
+ * the low nodeid polls every member for 'status' and then sets the "all"
+ * flag on itself; every other node polls the low node for the "all" flag.
+ * Returns 0 or the error from the polling helper.
+ */
+static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq)
+{
+ uint32_t status_all = status << 1;
+ int error;
+
+ if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+ error = wait_status_all(ls, status, 0, seq);
+ if (!error)
+ dlm_set_recover_status(ls, status_all);
+ } else
+ error = wait_status_low(ls, status_all, 0, seq);
+
+ return error;
+}
+/*
+ * Wait for all members to reach DLM_RS_NODES, exchanging slot information
+ * along the way.
+ *
+ * Every member's slot is first reset to -1 and its generation to 0. The low
+ * nodeid polls all members with save_slots enabled, runs dlm_slots_assign()
+ * and sets DLM_RS_NODES_ALL; when the assignment succeeds the new slot
+ * values and generation are stored in ls under the same ls_recover_lock
+ * hold that sets the flag. Other nodes poll the low nodeid for
+ * DLM_RS_NODES_ALL with DLM_RSF_NEED_SLOTS and then call
+ * dlm_slots_copy_in().
+ *
+ * Returns 0 or the error from the polling helper.
+ */
+int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq)
+{
+ struct dlm_member *memb;
+ struct dlm_slot *slots;
+ int num_slots, slots_size;
+ int error, rv;
+ uint32_t gen;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ memb->slot = -1;
+ memb->generation = 0;
+ }
+
+ if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+ error = wait_status_all(ls, DLM_RS_NODES, 1, seq);
+ if (error)
+ goto out;
+
+ /* slots array is sparse, slots_size may be > num_slots */
+
+ rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
+ if (!rv) {
+ spin_lock(&ls->ls_recover_lock);
+ _set_recover_status(ls, DLM_RS_NODES_ALL);
+ ls->ls_num_slots = num_slots;
+ ls->ls_slots_size = slots_size;
+ ls->ls_slots = slots;
+ ls->ls_generation = gen;
+ spin_unlock(&ls->ls_recover_lock);
+ } else { /* no slot data to publish, only set the flag */
+ dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
+ }
+ } else {
+ error = wait_status_low(ls, DLM_RS_NODES_ALL,
+ DLM_RSF_NEED_SLOTS, seq);
+ if (error)
+ goto out;
+
+ dlm_slots_copy_in(ls);
+ }
+ out:
+ return error;
+}
+/*
+ * Wait, via wait_status(), for all nodes to reach DLM_RS_DIR.
+ */
+int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq)
+{
+ return wait_status(ls, DLM_RS_DIR, seq);
+}
+/*
+ * Wait, via wait_status(), for all nodes to reach DLM_RS_LOCKS.
+ */
+int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq)
+{
+ return wait_status(ls, DLM_RS_LOCKS, seq);
+}
+/*
+ * Wait, via wait_status(), for all nodes to reach DLM_RS_DONE.
+ */
+int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq)
+{
+ return wait_status(ls, DLM_RS_DONE, seq);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid. As replies are returned from the resource directories the
+ * rsb's are removed from the list. When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+/*
+ * Return non-zero when ls_recover_list is empty, checked under
+ * ls_recover_list_lock.
+ */
+static int recover_list_empty(struct dlm_ls *ls)
+{
+ int empty;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ empty = list_empty(&ls->ls_recover_list);
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ return empty;
+}
+/*
+ * Put r on its lockspace's ls_recover_list unless it is already on a list.
+ * Adding takes a reference on r (dlm_hold_rsb) and bumps
+ * ls_recover_list_count; all of it happens under ls_recover_list_lock.
+ */
+static void recover_list_add(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ if (list_empty(&r->res_recover_list)) {
+ list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+ ls->ls_recover_list_count++;
+ dlm_hold_rsb(r);
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+/*
+ * Take r off ls_recover_list and decrement ls_recover_list_count under
+ * ls_recover_list_lock, then drop the reference with dlm_put_rsb() after
+ * the lock has been released.
+ */
+static void recover_list_del(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_del_init(&r->res_recover_list);
+ ls->ls_recover_list_count--;
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ dlm_put_rsb(r);
+}
+/*
+ * Empty ls_recover_list: every rsb is unlinked, has its
+ * res_recover_locks_count reset to 0 and its reference dropped. If
+ * ls_recover_list_count is not zero afterwards, that is logged and the
+ * counter is forced back to 0.
+ */
+static void recover_list_clear(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r, *s;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+ list_del_init(&r->res_recover_list);
+ r->res_recover_locks_count = 0;
+ dlm_put_rsb(r);
+ ls->ls_recover_list_count--;
+ }
+
+ if (ls->ls_recover_list_count != 0) {
+ log_error(ls, "warning: recover_list_count %d",
+ ls->ls_recover_list_count);
+ ls->ls_recover_list_count = 0;
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * Test function for dlm_wait_function(): returns 1 when
+ * ls_recover_list_count is zero, 0 otherwise. The counter is read under
+ * ls_recover_idr_lock.
+ */
+static int recover_idr_empty(struct dlm_ls *ls)
+{
+	int empty;
+
+	spin_lock(&ls->ls_recover_idr_lock);
+	empty = !ls->ls_recover_list_count;
+	spin_unlock(&ls->ls_recover_idr_lock);
+
+	return empty;
+}
+/*
+ * Register r in ls_recover_idr and store the allocated id in r->res_id.
+ *
+ * Ids are allocated from 1 upwards, so a res_id of 0 means "not in the
+ * idr". On success a reference is taken on r and ls_recover_list_count is
+ * incremented.
+ *
+ * Returns 0 on success, -1 if r already has a res_id, or the negative
+ * value returned by idr_alloc().
+ */
+static int recover_idr_add(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int rv;
+
+ idr_preload(GFP_NOFS); /* preallocate before taking the spinlock */
+ spin_lock(&ls->ls_recover_idr_lock);
+ if (r->res_id) {
+ rv = -1;
+ goto out_unlock;
+ }
+ rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
+ if (rv < 0)
+ goto out_unlock;
+
+ r->res_id = rv;
+ ls->ls_recover_list_count++;
+ dlm_hold_rsb(r);
+ rv = 0;
+out_unlock:
+ spin_unlock(&ls->ls_recover_idr_lock);
+ idr_preload_end();
+ return rv;
+}
+/*
+ * Remove r from ls_recover_idr, reset r->res_id to 0 and decrement
+ * ls_recover_list_count under ls_recover_idr_lock, then drop the reference
+ * with dlm_put_rsb() after the lock has been released.
+ */
+static void recover_idr_del(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_idr_lock);
+ idr_remove(&ls->ls_recover_idr, r->res_id);
+ r->res_id = 0;
+ ls->ls_recover_list_count--;
+ spin_unlock(&ls->ls_recover_idr_lock);
+
+ dlm_put_rsb(r);
+}
+/*
+ * Look up the rsb registered under id in ls_recover_idr. Returns the value
+ * from idr_find(), which is NULL when nothing is registered under that id.
+ * The 64-bit id is narrowed to int for the lookup. No extra reference is
+ * taken on the returned rsb.
+ */
+static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
+{
+ struct dlm_rsb *r;
+
+ spin_lock(&ls->ls_recover_idr_lock);
+ r = idr_find(&ls->ls_recover_idr, (int)id);
+ spin_unlock(&ls->ls_recover_idr_lock);
+ return r;
+}
+/*
+ * Empty ls_recover_idr: every registered rsb is removed, has res_id and
+ * res_recover_locks_count reset to 0 and its reference dropped. If
+ * ls_recover_list_count is not zero afterwards, that is logged and the
+ * counter is forced back to 0.
+ */
+static void recover_idr_clear(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int id;
+
+ spin_lock(&ls->ls_recover_idr_lock);
+
+ idr_for_each_entry(&ls->ls_recover_idr, r, id) {
+ idr_remove(&ls->ls_recover_idr, id);
+ r->res_id = 0;
+ r->res_recover_locks_count = 0;
+ ls->ls_recover_list_count--;
+
+ dlm_put_rsb(r);
+ }
+
+ if (ls->ls_recover_list_count != 0) {
+ log_error(ls, "warning: recover_list_count %d",
+ ls->ls_recover_list_count);
+ ls->ls_recover_list_count = 0;
+ }
+ spin_unlock(&ls->ls_recover_idr_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+ mastered on nodes that have been removed.
+
+ dlm_recover_masters
+ recover_master
+ dlm_send_rcom_lookup -> receive_rcom_lookup
+ dlm_master_lookup
+ receive_rcom_lookup_reply <-
+ dlm_recover_master_reply
+ set_new_master
+ set_master_lkbs
+ set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ *
+ * For every lkb on the queue without DLM_IFL_MSTCPY_BIT, lkb_nodeid is set
+ * to nodeid and lkb_remid is reset to 0.
+ */
+
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, queue, lkb_statequeue) {
+ if (!test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) {
+ lkb->lkb_nodeid = nodeid;
+ lkb->lkb_remid = 0;
+ }
+ }
+}
+/*
+ * Apply r->res_nodeid as the master nodeid to the lkbs on all three queues
+ * of r (grant, convert and wait) via set_lock_master().
+ */
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+ set_lock_master(&r->res_grantqueue, r->res_nodeid);
+ set_lock_master(&r->res_convertqueue, r->res_nodeid);
+ set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+
+/*
+ * Propagate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which
+ * rsb's to consider.
+ *
+ * r->res_nodeid must already hold the new master when this is called,
+ * because set_master_lkbs() copies it into the lkbs.
+ */
+
+static void set_new_master(struct dlm_rsb *r)
+{
+ set_master_lkbs(r);
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ rsb_set_flag(r, RSB_NEW_MASTER2);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters. The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ *
+ * Another node recovering the master may have sent us a rcom lookup,
+ * and our dlm_master_lookup() set it as the new master, along with
+ * NEW_MASTER so that we'll recover it here (this implies dir_nodeid
+ * equals our_nodeid below).
+ *
+ * Nothing is done, and *count is left alone, when we are the master of r,
+ * or when its master has not been removed and RSB_NEW_MASTER is not set.
+ * Otherwise *count is incremented and either the new master is set locally
+ * (we are the directory node) or the rsb is registered with
+ * recover_idr_add() and a lookup is sent to the directory node.
+ *
+ * Returns 0, or the error from dlm_send_rcom_lookup().
+ *
+ * NOTE(review): the return value of recover_idr_add() is not checked here.
+ */
+
+static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int our_nodeid, dir_nodeid;
+ int is_removed = 0;
+ int error;
+
+ if (is_master(r))
+ return 0;
+
+ is_removed = dlm_is_removed(ls, r->res_nodeid);
+
+ if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER))
+ return 0;
+
+ our_nodeid = dlm_our_nodeid();
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid == our_nodeid) {
+ if (is_removed) {
+ r->res_master_nodeid = our_nodeid;
+ r->res_nodeid = 0;
+ }
+
+ /* set master of lkbs to ourself when is_removed, or to
+ another new master which we set along with NEW_MASTER
+ in dlm_master_lookup */
+ set_new_master(r);
+ error = 0;
+ } else {
+ recover_idr_add(r);
+ error = dlm_send_rcom_lookup(r, dir_nodeid, seq);
+ }
+
+ (*count)++;
+ return error;
+}
+
+/*
+ * All MSTCPY locks are purged and rebuilt, even if the master stayed the same.
+ * This is necessary because recovery can be started, aborted and restarted,
+ * causing the master nodeid to briefly change during the aborted recovery, and
+ * change back to the original value in the second recovery. The MSTCPY locks
+ * may or may not have been purged during the aborted recovery. Another node
+ * with an outstanding request in waiters list and a request reply saved in the
+ * requestqueue, cannot know whether it should ignore the reply and resend the
+ * request, or accept the reply and complete the request. It must do the
+ * former if the remote node purged MSTCPY locks, and it must do the latter if
+ * the remote node did not. This is solved by always purging MSTCPY locks, in
+ * which case, the request reply would always be ignored and the request
+ * resent.
+ */
+
+/*
+ * No-directory variant of master recovery: the directory node of r is always
+ * its master.  MSTCPY locks are purged unconditionally, the master fields are
+ * rewritten (res_nodeid is 0 when the master is us) and *count is bumped.
+ * Always returns 0.
+ */
+static int recover_master_static(struct dlm_rsb *r, unsigned int *count)
+{
+	int dir = dlm_dir_nodeid(r);
+	int nodeid = (dir == dlm_our_nodeid()) ? 0 : dir;
+
+	dlm_purge_mstcpy_locks(r);
+
+	r->res_master_nodeid = dir;
+	r->res_nodeid = nodeid;
+	set_new_master(r);
+
+	(*count)++;
+	return 0;
+}
+
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory. The dir will
+ * assign mastery to the first node to look up the new master. That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ */
+
+/*
+ * Walk the root list and recover the master of every rsb, then wait until
+ * all outstanding lookups have been answered (recover idr empty).
+ *
+ * Returns 0 on success, -EINTR if recovery was stopped during the walk, or
+ * the first error from the per-rsb helper / dlm_wait_function().  On any
+ * error the recover idr is cleared.
+ */
+int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq)
+{
+	struct dlm_rsb *r;
+	unsigned int total = 0;
+	unsigned int count = 0;
+	int nodir = dlm_no_directory(ls);
+	int error = 0;
+
+	log_rinfo(ls, "dlm_recover_masters");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			break;
+		}
+
+		lock_rsb(r);
+		if (nodir)
+			error = recover_master_static(r, &count);
+		else
+			error = recover_master(r, &count, seq);
+		unlock_rsb(r);
+		cond_resched();
+		total++;
+
+		if (error)
+			break;
+	}
+	up_read(&ls->ls_root_sem);
+
+	if (!error) {
+		log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
+
+		error = dlm_wait_function(ls, &recover_idr_empty);
+	}
+
+	if (error)
+		recover_idr_clear(ls);
+	return error;
+}
+
+/*
+ * Handle the reply to a master lookup sent by recover_master().  rc_id
+ * identifies the rsb in the recover idr and rc_result carries the new master
+ * nodeid.  The rsb is updated, removed from the idr, and the waiter on
+ * ls_wait_general is woken once the idr is empty.
+ *
+ * An unknown id is logged and otherwise ignored.  Always returns 0.
+ */
+int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc)
+{
+	struct dlm_rsb *r;
+	int master, nodeid;
+
+	r = recover_idr_find(ls, le64_to_cpu(rc->rc_id));
+	if (!r) {
+		log_error(ls, "dlm_recover_master_reply no id %llx",
+			  (unsigned long long)le64_to_cpu(rc->rc_id));
+		return 0;
+	}
+
+	master = le32_to_cpu(rc->rc_result);
+
+	/* res_nodeid is 0 when we are the master ourselves */
+	nodeid = (master == dlm_our_nodeid()) ? 0 : master;
+
+	lock_rsb(r);
+	r->res_master_nodeid = master;
+	r->res_nodeid = nodeid;
+	set_new_master(r);
+	unlock_rsb(r);
+	recover_idr_del(r);
+
+	if (recover_idr_empty(ls))
+		wake_up(&ls->ls_wait_general);
+
+	return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+ remastered rsb on the new rsb master.
+
+ dlm_recover_locks
+ recover_locks
+ recover_locks_queue
+ dlm_send_rcom_lock -> receive_rcom_lock
+ dlm_recover_master_copy
+ receive_rcom_lock_reply <-
+ dlm_recover_process_copy
+*/
+
+
+/*
+ * keep a count of the number of lkb's we send to the new master; when we get
+ * an equal number of replies then recovery for the rsb is done
+ */
+
+/*
+ * Send every lkb on one rsb queue to the new master, counting each
+ * successfully sent lkb in res_recover_locks_count.  Stops at, and returns,
+ * the first send error; returns 0 when the whole queue was sent.
+ */
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head,
+			       uint64_t seq)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+		int rv = dlm_send_rcom_lock(r, lkb, seq);
+
+		if (rv)
+			return rv;
+		r->res_recover_locks_count++;
+	}
+
+	return 0;
+}
+
+/*
+ * Send all locks we hold on r (grant, convert and wait queues, in that
+ * order) to its new master.  If anything was sent, r goes on the recover
+ * list to wait for the replies; if nothing needed sending, RSB_NEW_MASTER is
+ * cleared right away.  Runs under the rsb lock.
+ *
+ * Returns 0 or the first send error (in which case neither of the above
+ * happens).
+ */
+static int recover_locks(struct dlm_rsb *r, uint64_t seq)
+{
+	int error;
+
+	lock_rsb(r);
+
+	DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+
+	error = recover_locks_queue(r, &r->res_grantqueue, seq);
+	if (!error)
+		error = recover_locks_queue(r, &r->res_convertqueue, seq);
+	if (!error)
+		error = recover_locks_queue(r, &r->res_waitqueue, seq);
+
+	if (!error) {
+		if (r->res_recover_locks_count)
+			recover_list_add(r);
+		else
+			rsb_clear_flag(r, RSB_NEW_MASTER);
+	}
+
+	unlock_rsb(r);
+	return error;
+}
+
+/*
+ * For every rsb on the root list that is flagged RSB_NEW_MASTER and is not
+ * mastered locally, send our locks to the new master, then wait until the
+ * recover list drains.  RSB_NEW_MASTER is simply cleared on rsbs we master
+ * ourselves.
+ *
+ * Returns 0 on success, -EINTR if recovery was stopped, or the first error
+ * from recover_locks() / dlm_wait_function().  The recover list is cleared
+ * on any error.
+ */
+int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq)
+{
+	struct dlm_rsb *r;
+	int error = 0;
+	int count = 0;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (is_master(r)) {
+			rsb_clear_flag(r, RSB_NEW_MASTER);
+			continue;
+		}
+
+		if (!rsb_flag(r, RSB_NEW_MASTER))
+			continue;
+
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			break;
+		}
+
+		error = recover_locks(r, seq);
+		if (error)
+			break;
+
+		count += r->res_recover_locks_count;
+	}
+	up_read(&ls->ls_root_sem);
+
+	if (!error) {
+		log_rinfo(ls, "dlm_recover_locks %d out", count);
+
+		error = dlm_wait_function(ls, &recover_list_empty);
+	}
+
+	if (error)
+		recover_list_clear(ls);
+	return error;
+}
+
+/*
+ * Account for one lock on r having been recovered.  When the outstanding
+ * count reaches zero, RSB_NEW_MASTER is cleared and r leaves the recover
+ * list; the waiter on ls_wait_general is woken whenever that list is empty.
+ */
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+
+	if (--r->res_recover_locks_count == 0) {
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+		recover_list_del(r);
+	}
+
+	if (recover_list_empty(r->res_ls))
+		wake_up(&r->res_ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's. This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set in two cases:
+ *
+ * 1. we are master, but not new, and we purged an EX/PW lock held by a
+ * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
+ *
+ * 2. we are a new master, and there are only NL/CR locks left.
+ * (We could probably improve this by only invalidating in this way when
+ * the previous master left uncleanly. VMS docs mention that.)
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
+ * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+/*
+ * Scan one rsb queue for locks that carry an lvb (DLM_LKF_VALBLK).
+ *
+ * Returns the first such lkb granted above CR, at which point the scan
+ * stops.  Otherwise returns NULL, having recorded in *high_lkb / *high_seq
+ * the lkb with the most recent lvb sequence number seen so far.
+ * *lock_lvb_exists is set to 1 as soon as any lvb-carrying lkb is found.
+ */
+static struct dlm_lkb *recover_lvb_scan(struct list_head *head,
+					struct dlm_lkb **high_lkb,
+					uint32_t *high_seq,
+					int *lock_lvb_exists)
+{
+	struct dlm_lkb *iter;
+
+	list_for_each_entry(iter, head, lkb_statequeue) {
+		if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		*lock_lvb_exists = 1;
+
+		if (iter->lkb_grmode > DLM_LOCK_CR)
+			return iter;
+
+		/* wrap-around aware "newer or equal" test: subtract as
+		   unsigned so the arithmetic itself cannot overflow, then
+		   look at the sign of the difference */
+		if ((int)((uint32_t)iter->lkb_lvbseq - *high_seq) >= 0) {
+			*high_lkb = iter;
+			*high_seq = iter->lkb_lvbseq;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Recover the lvb state of a master rsb.
+ *
+ * Not a new master (NEW_MASTER2 clear): only set RSB_VALNOTVALID if
+ * RSB_RECOVER_LVB_INVAL was flagged, then return.
+ *
+ * New master: look through the grant queue, then the convert queue, for
+ * locks with an lvb.  If one is granted above CR its lvb is taken as is;
+ * otherwise the lvb is marked not valid and taken from the lock with the
+ * highest lvb sequence number (or zeroed if there is none).  Nothing is
+ * changed when no lock has an lvb or when allocating the rsb lvb fails.
+ */
+static void recover_lvb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *big_lkb, *high_lkb = NULL;
+	uint32_t high_seq = 0;
+	int lock_lvb_exists = 0;
+	int lvblen = r->res_ls->ls_lvblen;
+
+	if (!rsb_flag(r, RSB_NEW_MASTER2)) {
+		/* case 1 above */
+		if (rsb_flag(r, RSB_RECOVER_LVB_INVAL))
+			rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	/* we are the new master, so figure out if VALNOTVALID should
+	   be set, and set the rsb lvb from the best lkb available. */
+
+	big_lkb = recover_lvb_scan(&r->res_grantqueue, &high_lkb, &high_seq,
+				   &lock_lvb_exists);
+	if (!big_lkb)
+		big_lkb = recover_lvb_scan(&r->res_convertqueue, &high_lkb,
+					   &high_seq, &lock_lvb_exists);
+
+	if (!lock_lvb_exists)
+		return;
+
+	/* lvb is invalidated if only NL/CR locks remain */
+	if (!big_lkb)
+		rsb_set_flag(r, RSB_VALNOTVALID);
+
+	if (!r->res_lvbptr) {
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
+		if (!r->res_lvbptr)
+			return;
+	}
+
+	if (big_lkb) {
+		r->res_lvbseq = big_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
+	} else if (high_lkb) {
+		r->res_lvbseq = high_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+	} else {
+		r->res_lvbseq = 0;
+		memset(r->res_lvbptr, 0, lvblen);
+	}
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
+ converting PR->CW or CW->PR need to have their lkb_grmode set. */
+
+/*
+ * Give every converting lock whose granted mode is DLM_LOCK_IV a granted
+ * mode again: the mode of the first PR or CW lock found on the grant queue,
+ * or, when there is none, the lock's own requested mode.
+ */
+static void recover_conversion(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_lkb *lkb;
+	int mode = -1;
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode != DLM_LOCK_PR &&
+		    lkb->lkb_grmode != DLM_LOCK_CW)
+			continue;
+
+		mode = lkb->lkb_grmode;
+		break;
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode != DLM_LOCK_IV)
+			continue;
+
+		if (mode != -1) {
+			log_debug(ls, "recover_conversion %x set gr %d",
+				  lkb->lkb_id, mode);
+			lkb->lkb_grmode = mode;
+		} else {
+			log_debug(ls, "recover_conversion %x set gr to rq %d",
+				  lkb->lkb_id, lkb->lkb_rqmode);
+			lkb->lkb_grmode = lkb->lkb_rqmode;
+		}
+	}
+}
+
+/* We've become the new master for this rsb and waiting/converting locks may
+ need to be granted in dlm_recover_grant() due to locks that may have
+ existed from a removed node. */
+
+/*
+ * Flag r with RSB_RECOVER_GRANT when it has any waiting or converting locks.
+ */
+static void recover_grant(struct dlm_rsb *r)
+{
+	if (list_empty(&r->res_waitqueue) && list_empty(&r->res_convertqueue))
+		return;
+
+	rsb_set_flag(r, RSB_RECOVER_GRANT);
+}
+
+/*
+ * Final per-rsb fixups once all locks have been recovered.
+ *
+ * For each rsb on the root list that we master: resolve conversions if
+ * RSB_RECOVER_CONVERT is set, recover the lvb, and for new masters
+ * (NEW_MASTER2) flag the rsb for granting.  For rsbs we do not master,
+ * RSB_VALNOTVALID is cleared.  The RECOVER_CONVERT, RECOVER_LVB_INVAL and
+ * NEW_MASTER2 flags are cleared on every rsb.
+ */
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	unsigned int count = 0;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		lock_rsb(r);
+		if (is_master(r)) {
+			if (rsb_flag(r, RSB_RECOVER_CONVERT))
+				recover_conversion(r);
+
+			/* recover lvb before granting locks so the updated
+			   lvb/VALNOTVALID is presented in the completion */
+			recover_lvb(r);
+
+			if (rsb_flag(r, RSB_NEW_MASTER2))
+				recover_grant(r);
+			count++;
+		} else {
+			rsb_clear_flag(r, RSB_VALNOTVALID);
+		}
+		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+		rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
+		rsb_clear_flag(r, RSB_NEW_MASTER2);
+		unlock_rsb(r);
+	}
+	up_read(&ls->ls_root_sem);
+
+	/* count is unsigned, so print it with %u */
+	if (count)
+		log_rinfo(ls, "dlm_recover_rsbs %u done", count);
+}
+
+/* Create a single list of all root rsb's to be used during recovery */
+
+/*
+ * Put every rsb found in the "keep" trees of the rsb table on ls_root_list,
+ * taking a reference on each.  A non-empty "toss" tree is only logged.
+ *
+ * Returns 0, or -EINVAL (after logging) when the root list is not empty on
+ * entry, in which case nothing is added.
+ */
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+	struct rb_node *n;
+	struct dlm_rsb *r;
+	int i, error = 0;
+
+	down_write(&ls->ls_root_sem);
+
+	if (list_empty(&ls->ls_root_list)) {
+		for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+			spin_lock(&ls->ls_rsbtbl[i].lock);
+
+			n = rb_first(&ls->ls_rsbtbl[i].keep);
+			while (n) {
+				r = rb_entry(n, struct dlm_rsb, res_hashnode);
+				list_add(&r->res_root_list, &ls->ls_root_list);
+				dlm_hold_rsb(r);
+				n = rb_next(n);
+			}
+
+			if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss))
+				log_error(ls, "dlm_create_root_list toss not empty");
+
+			spin_unlock(&ls->ls_rsbtbl[i].lock);
+		}
+	} else {
+		log_error(ls, "root list not empty");
+		error = -EINVAL;
+	}
+
+	up_write(&ls->ls_root_sem);
+	return error;
+}
+
+/*
+ * Empty ls_root_list under the write side of ls_root_sem, dropping the
+ * reference that dlm_create_root_list() took on each rsb.  Calling it on an
+ * already empty list does nothing.
+ */
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r, *safe;
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
+ list_del_init(&r->res_root_list);
+ dlm_put_rsb(r);
+ }
+ up_write(&ls->ls_root_sem);
+}
+
+/*
+ * Free every rsb sitting in the "toss" tree of each rsb table bucket and
+ * log how many were freed (if any).
+ */
+void dlm_clear_toss(struct dlm_ls *ls)
+{
+	struct rb_node *n, *next;
+	struct dlm_rsb *r;
+	unsigned int count = 0;
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		spin_lock(&ls->ls_rsbtbl[i].lock);
+
+		n = rb_first(&ls->ls_rsbtbl[i].toss);
+		while (n) {
+			/* fetch the successor before n is erased */
+			next = rb_next(n);
+
+			r = rb_entry(n, struct dlm_rsb, res_hashnode);
+			rb_erase(n, &ls->ls_rsbtbl[i].toss);
+			dlm_free_rsb(r);
+			count++;
+
+			n = next;
+		}
+
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+
+	if (count)
+		log_rinfo(ls, "dlm_clear_toss %u done", count);
+}
+
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 0000000000..dbc51013ec
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+/* testfn is polled by the wait; recover.c passes its "list/idr empty" tests */
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+/* barriers between recovery stages; seq is the recovery sequence number */
+int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq);
+/* find new masters for rsbs; waits for all lookup replies */
+int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq);
+/* handles one lookup reply for dlm_recover_masters(); always returns 0 */
+int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc);
+/* send our locks on remastered rsbs to the new masters and wait for replies */
+int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq);
+/* called once per recovered lock to count down res_recover_locks_count */
+void dlm_recovered_lock(struct dlm_rsb *r);
+/* build / tear down ls_root_list, the list the recovery routines iterate */
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+/* free all rsbs on the toss trees */
+void dlm_clear_toss(struct dlm_ls *ls);
+/* final conversion/lvb/grant fixups on master rsbs */
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif /* __RECOVER_DOT_H__ */
+
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 0000000000..4d17491dea
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+ by a newer stop (ls_recover_seq), we need to leave locking disabled.
+
+ We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
+ locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
+ enables locking and clears the requestqueue between a and b. */
+
+/*
+ * Re-enable normal locking for the start identified by seq.
+ *
+ * With ls_recv_active held for write and under ls_recover_lock, locking is
+ * only enabled if seq is still the current ls_recover_seq: LSFL_RUNNING is
+ * set, ls_in_recovery is released and LSFL_RECOVER_LOCK is cleared.
+ *
+ * Returns 0 when locking was enabled, -EINTR when seq has been superseded.
+ */
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+	int error;
+
+	down_write(&ls->ls_recv_active);
+	spin_lock(&ls->ls_recover_lock);
+
+	if (ls->ls_recover_seq != seq) {
+		error = -EINTR;
+	} else {
+		set_bit(LSFL_RUNNING, &ls->ls_flags);
+		/* unblocks processes waiting to enter the dlm */
+		up_write(&ls->ls_in_recovery);
+		clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+		error = 0;
+	}
+
+	spin_unlock(&ls->ls_recover_lock);
+	up_write(&ls->ls_recv_active);
+
+	return error;
+}
+
+/*
+ * Run one complete recovery pass for the lockspace.  rv describes the start
+ * being processed; rv->seq is passed to every stage so that a superseded
+ * recovery can be detected.  The whole pass runs with ls_recoverd_active
+ * held.
+ *
+ * Stages, in order: suspend callbacks, clear toss lists, build the root rsb
+ * list, recover members, rebuild the directory, pre-process waiters, then
+ * either the purge/masters/locks/rsbs steps (when neg is set or there is no
+ * directory) or just the RS_LOCKS barrier, followed by the DONE barrier,
+ * re-enabling locking, replaying the requestqueue, post-processing waiters
+ * and granting locks.
+ *
+ * Returns 0 on success.  On failure the root list is released, the mutex is
+ * dropped and the error of the failing stage is returned (-EINTR when
+ * recovery was stopped part way through).
+ */
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+ unsigned long start;
+ int error, neg = 0;
+
+ log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
+
+ mutex_lock(&ls->ls_recoverd_active);
+
+ dlm_callback_suspend(ls);
+
+ dlm_clear_toss(ls);
+
+ /*
+ * This list of root rsb's will be the basis of most of the recovery
+ * routines.
+ */
+
+ /* NOTE(review): the return value is not checked; dlm_create_root_list()
+ logs and returns -EINVAL if the root list is not empty on entry. */
+ dlm_create_root_list(ls);
+
+ /*
+ * Add or remove nodes from the lockspace's ls_nodes list.
+ *
+ * Due to the fact that we must report all membership changes to lsops
+ * or midcomms layer, it is not permitted to abort ls_recover() until
+ * this is done.
+ */
+
+ error = dlm_recover_members(ls, rv, &neg);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_members error %d", error);
+ goto fail;
+ }
+
+ dlm_recover_dir_nodeid(ls);
+
+ /* reset the per-recovery statistics reported in the logs below */
+ ls->ls_recover_dir_sent_res = 0;
+ ls->ls_recover_dir_sent_msg = 0;
+ ls->ls_recover_locks_in = 0;
+
+ dlm_set_recover_status(ls, DLM_RS_NODES);
+
+ error = dlm_recover_members_wait(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_members_wait error %d", error);
+ goto fail;
+ }
+
+ /* start of the interval reported in the final "done" message */
+ start = jiffies;
+
+ /*
+ * Rebuild our own share of the directory by collecting from all other
+ * nodes their master rsb names that hash to us.
+ */
+
+ error = dlm_recover_directory(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_directory error %d", error);
+ goto fail;
+ }
+
+ dlm_set_recover_status(ls, DLM_RS_DIR);
+
+ error = dlm_recover_directory_wait(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
+ goto fail;
+ }
+
+ log_rinfo(ls, "dlm_recover_directory %u out %u messages",
+ ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
+
+ /*
+ * We may have outstanding operations that are waiting for a reply from
+ * a failed node. Mark these to be resent after recovery. Unlock and
+ * cancel ops can just be completed.
+ */
+
+ dlm_recover_waiters_pre(ls);
+
+ if (dlm_recovery_stopped(ls)) {
+ error = -EINTR;
+ goto fail;
+ }
+
+ if (neg || dlm_no_directory(ls)) {
+ /*
+ * Clear lkb's for departed nodes.
+ */
+
+ dlm_recover_purge(ls);
+
+ /*
+ * Get new master nodeid's for rsb's that were mastered on
+ * departed nodes.
+ */
+
+ error = dlm_recover_masters(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_masters error %d", error);
+ goto fail;
+ }
+
+ /*
+ * Send our locks on remastered rsb's to the new masters.
+ */
+
+ error = dlm_recover_locks(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_locks error %d", error);
+ goto fail;
+ }
+
+ dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
+ error = dlm_recover_locks_wait(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
+ goto fail;
+ }
+
+ log_rinfo(ls, "dlm_recover_locks %u in",
+ ls->ls_recover_locks_in);
+
+ /*
+ * Finalize state in master rsb's now that all locks can be
+ * checked. This includes conversion resolution and lvb
+ * settings.
+ */
+
+ dlm_recover_rsbs(ls);
+ } else {
+ /*
+ * Other lockspace members may be going through the "neg" steps
+ * while also adding us to the lockspace, in which case they'll
+ * be doing the recover_locks (RS_LOCKS) barrier.
+ */
+ dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
+ error = dlm_recover_locks_wait(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
+ goto fail;
+ }
+ }
+
+ dlm_release_root_list(ls);
+
+ /*
+ * Purge directory-related requests that are saved in requestqueue.
+ * All dir requests from before recovery are invalid now due to the dir
+ * rebuild and will be resent by the requesting nodes.
+ */
+
+ dlm_purge_requestqueue(ls);
+
+ dlm_set_recover_status(ls, DLM_RS_DONE);
+
+ error = dlm_recover_done_wait(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_done_wait error %d", error);
+ goto fail;
+ }
+
+ dlm_clear_members_gone(ls);
+
+ dlm_callback_resume(ls);
+
+ /* fails with -EINTR if a newer stop has superseded rv->seq */
+ error = enable_locking(ls, rv->seq);
+ if (error) {
+ log_rinfo(ls, "enable_locking error %d", error);
+ goto fail;
+ }
+
+ error = dlm_process_requestqueue(ls);
+ if (error) {
+ log_rinfo(ls, "dlm_process_requestqueue error %d", error);
+ goto fail;
+ }
+
+ error = dlm_recover_waiters_post(ls);
+ if (error) {
+ log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
+ goto fail;
+ }
+
+ dlm_recover_grant(ls);
+
+ log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
+ (unsigned long long)rv->seq, ls->ls_generation,
+ jiffies_to_msecs(jiffies - start));
+ mutex_unlock(&ls->ls_recoverd_active);
+
+ return 0;
+
+ fail:
+ /* also reached after the root list was already released above; the
+ release just walks whatever is still on the list */
+ dlm_release_root_list(ls);
+ mutex_unlock(&ls->ls_recoverd_active);
+
+ return error;
+}
+
+/* The dlm_ls_start() that created the rv we take here may already have been
+ stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
+ flag set. */
+
+/*
+ * Take the pending recovery arguments (if any) off the lockspace and run
+ * ls_recover() with them.
+ *
+ * LSFL_RECOVER_STOP is cleared only when the arguments belong to the current
+ * ls_recover_seq.  After ls_recover():
+ *  - success: result 0 is published, ls_recovery_done is completed and the
+ *    lsop recover_done callback is invoked;
+ *  - -EINTR: only logged, nothing is completed;
+ *  - any other error: logged, published in ls_recovery_result and
+ *    ls_recovery_done is completed.
+ * The arguments are freed in every case.
+ */
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv;
+	int error;
+
+	spin_lock(&ls->ls_recover_lock);
+	rv = ls->ls_recover_args;
+	ls->ls_recover_args = NULL;
+	if (rv && ls->ls_recover_seq == rv->seq)
+		clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (!rv)
+		return;
+
+	error = ls_recover(ls, rv);
+
+	if (error == -EINTR) {
+		/* an interrupted recovery is expected to be followed by
+		 * another ls_recover() run, so nothing is reported here
+		 */
+		log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
+			  __func__, (unsigned long long)rv->seq);
+	} else if (error) {
+		log_rinfo(ls, "%s %llu error %d", __func__,
+			  (unsigned long long)rv->seq, error);
+
+		/* make new_lockspace() aware of the critical error */
+		ls->ls_recovery_result = error;
+		complete(&ls->ls_recovery_done);
+	} else {
+		ls->ls_recovery_result = 0;
+		complete(&ls->ls_recovery_done);
+
+		dlm_lsop_recover_done(ls);
+	}
+
+	kfree(rv->nodes);
+	kfree(rv);
+}
+
+/*
+ * Main function of the per-lockspace recovery thread.  arg is the lockspace
+ * handle that dlm_find_lockspace_local() resolves.
+ *
+ * The thread starts out holding ls_in_recovery for write (LSFL_RECOVER_LOCK
+ * set) and then sleeps until LSFL_RECOVER_WORK or LSFL_RECOVER_DOWN is set
+ * or it is asked to stop:
+ *  - RECOVER_DOWN: take ls_in_recovery again and announce it;
+ *  - RECOVER_WORK: run do_ls_recovery().
+ * On exit ls_in_recovery is released if still held and the lockspace
+ * reference is dropped.
+ *
+ * Returns 0, or -1 when the lockspace cannot be found.
+ */
+static int dlm_recoverd(void *arg)
+{
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(arg);
+	if (!ls) {
+		log_print("dlm_recoverd: no lockspace %p", arg);
+		return -1;
+	}
+
+	down_write(&ls->ls_in_recovery);
+	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+	wake_up(&ls->ls_recover_lock_wait);
+
+	while (1) {
+		/*
+		 * We call kthread_should_stop() after set_current_state().
+		 * This is because it works correctly if kthread_stop() is
+		 * called just before set_current_state().
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			set_current_state(TASK_RUNNING);
+			break;
+		}
+		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
+		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
+			if (kthread_should_stop()) {
+				/* never leave the loop while still marked
+				   TASK_INTERRUPTIBLE */
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+			schedule();
+		}
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
+			down_write(&ls->ls_in_recovery);
+			set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+			wake_up(&ls->ls_recover_lock_wait);
+		}
+
+		if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
+			do_ls_recovery(ls);
+	}
+
+	if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
+		up_write(&ls->ls_in_recovery);
+
+	dlm_put_lockspace(ls);
+	return 0;
+}
+
+/*
+ * Create and start the "dlm_recoverd" thread for ls and remember it in
+ * ls_recoverd_task.  Returns 0, or the error encoded in the pointer returned
+ * by kthread_run() (ls_recoverd_task is left untouched in that case).
+ */
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+	struct task_struct *p;
+
+	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	ls->ls_recoverd_task = p;
+	return 0;
+}
+
+/* Stop the recovery thread recorded in ls_recoverd_task via kthread_stop(). */
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+ kthread_stop(ls->ls_recoverd_task);
+}
+
+/*
+ * Keep ls_recover() from running: wake anything sleeping on ls_wait_general,
+ * then take ls_recoverd_active, the mutex ls_recover() holds for its whole
+ * run.  Paired with dlm_recoverd_resume().
+ */
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+ wake_up(&ls->ls_wait_general);
+ mutex_lock(&ls->ls_recoverd_active);
+}
+
+/* Release ls_recoverd_active taken by dlm_recoverd_suspend(). */
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+ mutex_unlock(&ls->ls_recoverd_active);
+}
+
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 0000000000..d1944dc5f9
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+/* stop / start the per-lockspace "dlm_recoverd" kthread */
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+/* take / release ls_recoverd_active, which ls_recover() holds while running */
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif /* __RECOVERD_DOT_H__ */
+
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 0000000000..892d6ca21e
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+#include "util.h"
+
+/* One message saved on ls_requestqueue while the lockspace is in recovery. */
+struct rq_entry {
+ struct list_head list; /* link in ls->ls_requestqueue */
+ uint32_t recover_seq; /* low 32 bits of ls_recover_seq when saved */
+ int nodeid; /* nodeid passed in by the caller that saved the message */
+ struct dlm_message request; /* copy of the message; m_extra data follows */
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete. This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ */
+
+/*
+ * Save a copy of message ms, received from nodeid, at the tail of the
+ * request queue.  The entry is sized from the h_length header field so that
+ * the trailing m_extra data is kept too.  If the allocation fails the
+ * message is dropped with a log message.
+ */
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid,
+			  const struct dlm_message *ms)
+{
+	struct rq_entry *entry;
+	int length = le16_to_cpu(ms->m_header.h_length) -
+		sizeof(struct dlm_message);
+
+	entry = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
+	if (!entry) {
+		log_print("dlm_add_requestqueue: out of memory len %d", length);
+		return;
+	}
+
+	entry->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
+	entry->nodeid = nodeid;
+
+	/* fixed part first, then the variable-length tail */
+	memcpy(&entry->request, ms, sizeof(*ms));
+	memcpy(&entry->request.m_extra, ms->m_extra, length);
+
+	/* the counter is raised before the entry becomes visible on the list */
+	atomic_inc(&ls->ls_requestqueue_cnt);
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_add_tail(&entry->list, &ls->ls_requestqueue);
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+/*
+ * Called by dlm_recoverd to process normal messages saved while recovery was
+ * happening. Normal locking has been enabled before this is called. dlm_recv
+ * upon receiving a message, will wait for all saved messages to be drained
+ * here before processing the message it got. If a new dlm_ls_stop() arrives
+ * while we're processing these saved messages, it may block trying to suspend
+ * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that
+ * case, we don't abort since locking_stopped is still 0. If dlm_recv is not
+ * waiting for us, then this processing may be aborted due to locking_stopped.
+ */
+
+/*
+ * Returns 0 once the queue has been drained, or -EINTR if locking was
+ * stopped again after a message had been processed.
+ */
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+ struct rq_entry *e;
+ struct dlm_message *ms;
+ int error = 0;
+
+ mutex_lock(&ls->ls_requestqueue_mutex);
+
+ for (;;) {
+ /* the mutex is held at the top of every iteration */
+ if (list_empty(&ls->ls_requestqueue)) {
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+ error = 0;
+ break;
+ }
+ /* peek at the head entry; it stays on the list while it is
+ processed with the mutex dropped */
+ e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+
+ ms = &e->request;
+
+ log_limit(ls, "dlm_process_requestqueue msg %d from %d "
+ "lkid %x remid %x result %d seq %u",
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)),
+ e->recover_seq);
+
+ dlm_receive_message_saved(ls, &e->request, e->recover_seq);
+
+ /* now unlink and free the entry; whoever drops the count to
+ zero wakes the waiters in dlm_wait_requestqueue() */
+ mutex_lock(&ls->ls_requestqueue_mutex);
+ list_del(&e->list);
+ if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+ wake_up(&ls->ls_requestqueue_wait);
+ kfree(e);
+
+ if (dlm_locking_stopped(ls)) {
+ log_debug(ls, "process_requestqueue abort running");
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+ error = -EINTR;
+ break;
+ }
+ /* NOTE(review): called with the task state left untouched here,
+ so this presumably acts as a plain yield - confirm */
+ schedule();
+ }
+
+ return error;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recv. At
+ * the same time, dlm_recv will start receiving new requests from remote nodes.
+ * We want to delay dlm_recv processing new requests until dlm_recoverd has
+ * finished processing the old saved requests. We don't check for locking
+ * stopped here because dlm_ls_stop won't stop locking until it's suspended us
+ * (dlm_recv).
+ */
+
+/* Sleep (uninterruptibly) until ls_requestqueue_cnt has dropped to zero. */
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+ wait_event(ls->ls_requestqueue_wait,
+ atomic_read(&ls->ls_requestqueue_cnt) == 0);
+}
+
+/*
+ * Decide whether a saved message should be dropped from the request queue.
+ * Returns 1 (purge) when the lockspace has no references left, when the
+ * sending node has been removed, for directory messages (REMOVE, LOOKUP,
+ * LOOKUP_REPLY), and for everything when no directory is used; returns 0
+ * (keep) otherwise.
+ */
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+	__le32 type = ms->m_type;
+
+	/* the ls is being cleaned up and freed by release_lockspace */
+	if (!atomic_read(&ls->ls_count))
+		return 1;
+
+	if (dlm_is_removed(ls, nodeid))
+		return 1;
+
+	/* directory operations are always purged because the directory is
+	   always rebuilt during recovery and the lookups resent */
+
+	if (type == cpu_to_le32(DLM_MSG_REMOVE))
+		return 1;
+	if (type == cpu_to_le32(DLM_MSG_LOOKUP))
+		return 1;
+	if (type == cpu_to_le32(DLM_MSG_LOOKUP_REPLY))
+		return 1;
+
+	return dlm_no_directory(ls) ? 1 : 0;
+}
+
+/*
+ * Drop from the request queue every saved message that purge_request()
+ * selects, waking the waiters on ls_requestqueue_wait if the queue count
+ * reaches zero.
+ */
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+	struct rq_entry *entry, *tmp;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_for_each_entry_safe(entry, tmp, &ls->ls_requestqueue, list) {
+		if (!purge_request(ls, &entry->request, entry->nodeid))
+			continue;
+
+		list_del(&entry->list);
+		if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+			wake_up(&ls->ls_requestqueue_wait);
+		kfree(entry);
+	}
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 0000000000..42bfe23cea
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+/* save a copy of ms (received from nodeid) for processing after recovery */
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid,
+ const struct dlm_message *ms);
+/* replay saved messages; 0 when drained, -EINTR if locking stopped again */
+int dlm_process_requestqueue(struct dlm_ls *ls);
+/* block until the request queue count is zero */
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+/* drop saved messages that are no longer valid after recovery */
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 0000000000..695e691b38
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+
+#include <trace/events/dlm.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "ast.h"
+#include "config.h"
+#include "memory.h"
+
+/* prefix of per-lockspace misc device names, see dlm_device_register() */
+static const char name_prefix[] = "dlm";
+/* tentative definition; the initialised table appears later in this file */
+static const struct file_operations device_fops;
+/* number of current opens of the monitor device */
+static atomic_t dlm_monitor_opened;
+/* starts at 1 and is cleared by the first open of the monitor device */
+static int dlm_monitor_unused = 1;
+
+#ifdef CONFIG_COMPAT
+
+/* Lock parameters as written by a caller with is64bit == 0: the user
+   pointers (castparam, castaddr, bastparam, bastaddr, lksb) are carried as
+   __u32 and widened to pointers by compat_input(). */
+struct dlm_lock_params32 {
+ __u8 mode;
+ __u8 namelen;
+ __u16 unused;
+ __u32 flags;
+ __u32 lkid;
+ __u32 parent;
+ __u64 xid;
+ __u64 timeout;
+ __u32 castparam;
+ __u32 castaddr;
+ __u32 bastparam;
+ __u32 bastaddr;
+ __u32 lksb;
+ char lvb[DLM_USER_LVB_LEN];
+ char name[];
+};
+
+/* Write request in the layout used when is64bit is zero; device_write()
+   converts it to struct dlm_write_request through compat_input(). */
+struct dlm_write_request32 {
+ __u32 version[3];
+ __u8 cmd;
+ __u8 is64bit;
+ __u8 unused[2];
+
+ /* which member is valid depends on cmd, see compat_input() */
+ union {
+ struct dlm_lock_params32 lock;
+ struct dlm_lspace_params lspace;
+ struct dlm_purge_params purge;
+ } i;
+};
+
+/* Lock status block with sb_lvbptr carried as __u32; filled in by
+   compat_output(). */
+struct dlm_lksb32 {
+ __u32 sb_status;
+ __u32 sb_lkid;
+ __u8 sb_flags;
+ __u32 sb_lvbptr;
+};
+
+/* Read result in the 32-bit layout; compat_output() builds it from a
+   struct dlm_lock_result, storing the user pointers as __u32. */
+struct dlm_lock_result32 {
+ __u32 version[3];
+ __u32 length;
+ __u32 user_astaddr;
+ __u32 user_astparam;
+ __u32 user_lksb;
+ struct dlm_lksb32 lksb;
+ __u8 bast_mode;
+ __u8 unused[3];
+ /* Offsets may be zero if no data is present */
+ __u32 lvb_offset;
+};
+
+/*
+ * Expand a 32-bit layout write request (kb32) into the native layout (kb).
+ * Which union member is converted depends on the command; namelen is the
+ * number of name bytes to carry over for lockspace and lock requests.
+ */
+static void compat_input(struct dlm_write_request *kb,
+			 struct dlm_write_request32 *kb32,
+			 int namelen)
+{
+	int i;
+
+	for (i = 0; i < 3; i++)
+		kb->version[i] = kb32->version[i];
+
+	kb->cmd = kb32->cmd;
+	kb->is64bit = kb32->is64bit;
+
+	switch (kb->cmd) {
+	case DLM_USER_CREATE_LOCKSPACE:
+	case DLM_USER_REMOVE_LOCKSPACE:
+		kb->i.lspace.flags = kb32->i.lspace.flags;
+		kb->i.lspace.minor = kb32->i.lspace.minor;
+		memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen);
+		break;
+
+	case DLM_USER_PURGE:
+		kb->i.purge.nodeid = kb32->i.purge.nodeid;
+		kb->i.purge.pid = kb32->i.purge.pid;
+		break;
+
+	default:
+		/* every other command carries lock parameters */
+		kb->i.lock.mode = kb32->i.lock.mode;
+		kb->i.lock.namelen = kb32->i.lock.namelen;
+		kb->i.lock.flags = kb32->i.lock.flags;
+		kb->i.lock.lkid = kb32->i.lock.lkid;
+		kb->i.lock.parent = kb32->i.lock.parent;
+		kb->i.lock.xid = kb32->i.lock.xid;
+		kb->i.lock.timeout = kb32->i.lock.timeout;
+
+		/* 32-bit user addresses are widened to native pointers */
+		kb->i.lock.castparam = (__user void *)(long)kb32->i.lock.castparam;
+		kb->i.lock.castaddr = (__user void *)(long)kb32->i.lock.castaddr;
+		kb->i.lock.bastparam = (__user void *)(long)kb32->i.lock.bastparam;
+		kb->i.lock.bastaddr = (__user void *)(long)kb32->i.lock.bastaddr;
+		kb->i.lock.lksb = (__user void *)(long)kb32->i.lock.lksb;
+
+		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+		memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
+		break;
+	}
+}
+
+/*
+ * Fill the 32-bit result structure res32 from the native result res.
+ * res32 is zeroed first, so fields not assigned here read back as zero.
+ */
+static void compat_output(struct dlm_lock_result *res,
+			  struct dlm_lock_result32 *res32)
+{
+	int i;
+
+	memset(res32, 0, sizeof(*res32));
+
+	for (i = 0; i < 3; i++)
+		res32->version[i] = res->version[i];
+
+	res32->length = res->length;
+	res32->lvb_offset = res->lvb_offset;
+	res32->bast_mode = res->bast_mode;
+
+	/* user pointers are truncated to their 32-bit representation */
+	res32->user_astaddr = (__u32)(__force long)res->user_astaddr;
+	res32->user_astparam = (__u32)(__force long)res->user_astparam;
+	res32->user_lksb = (__u32)(__force long)res->user_lksb;
+
+	res32->lksb.sb_status = res->lksb.sb_status;
+	res32->lksb.sb_lkid = res->lksb.sb_lkid;
+	res32->lksb.sb_flags = res->lksb.sb_flags;
+	res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+
+/*
+ * Throw away every callback still queued on the lkb and reset its
+ * "last callback" bookkeeping.  The caller must hold proc->asts_spin.
+ */
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb)
+{
+	struct dlm_callback *cb;
+
+	/* drain the queue from the front, dropping the queue's reference */
+	while (!list_empty(&lkb->lkb_callbacks)) {
+		cb = list_first_entry(&lkb->lkb_callbacks,
+				      struct dlm_callback, list);
+		list_del(&cb->list);
+		kref_put(&cb->ref, dlm_release_callback);
+	}
+
+	clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags);
+
+	/* invalidate */
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+	lkb->lkb_last_bast_mode = -1;
+}
+
+/* Figure out if this lock is at the end of its life and no longer
+ available for the application to use. The lkb still exists until
+ the final ast is read. A lock becomes EOL in three situations:
+ 1. a noqueue request fails with EAGAIN
+ 2. an unlock completes with EUNLOCK
+ 3. a cancel of a waiting request completes with ECANCEL/EDEADLK
+ An EOL lock needs to be removed from the process's list of locks.
+ And we can't allow any new operation on an EOL lock. This is
+ not related to the lifetime of the lkb struct which is managed
+ entirely by refcount. */
+
+/* Returns 1 when a completion with this mode/status ends the lock's life:
+   always for -DLM_EUNLOCK, and for cancel/timeout/deadlock/EAGAIN results
+   only when the mode is DLM_LOCK_IV.  Returns 0 otherwise. */
+static int lkb_is_endoflife(int mode, int status)
+{
+	if (status == -DLM_EUNLOCK)
+		return 1;
+
+	if (status == -DLM_ECANCEL || status == -ETIMEDOUT ||
+	    status == -EDEADLK || status == -EAGAIN)
+		return mode == DLM_LOCK_IV;
+
+	return 0;
+}
+
+/* we could possibly check if the cancel of an orphan has resulted in the lkb
+ being removed and then remove that lkb from the orphans list and free it */
+
+/*
+ * Queue a completion (DLM_CB_CAST) or blocking (DLM_CB_BAST) callback for a
+ * userspace lock so that a later device_read() can return it.
+ *
+ * Nothing is queued when the lkb is flagged ORPHAN or DEAD, or when a
+ * blocking callback is requested but the user supplied no bastaddr.
+ * A completion that lkb_is_endoflife() accepts marks the lkb ENDOFLIFE and
+ * takes it off the owning process's lock list.
+ */
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags)
+{
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ struct dlm_user_proc *proc;
+ int rv;
+
+ /* unlocked early exit; the same test is repeated under the lock below */
+ if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) ||
+ test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags))
+ return;
+
+ ls = lkb->lkb_resource->res_ls;
+ spin_lock(&ls->ls_clear_proc_locks);
+
+ /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+ can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
+ lkb->ua so we can't try to use it. This second check is necessary
+ for cases where a completion ast is received for an operation that
+ began before clear_proc_locks did its cancel/unlock. */
+
+ if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) ||
+ test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags))
+ goto out;
+
+ DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb););
+ ua = lkb->lkb_ua;
+ proc = ua->proc;
+
+ /* a bast with no user handler registered is silently dropped */
+ if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
+ goto out;
+
+ if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
+ set_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags);
+
+ spin_lock(&proc->asts_spin);
+
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
+ spin_unlock(&proc->asts_spin);
+ WARN_ON_ONCE(1);
+ goto out;
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
+ /* the proc->asts entry holds its own lkb reference, see the
+ "removes ref for proc->asts" comments in device_read() */
+ kref_get(&lkb->lkb_ref);
+ list_add_tail(&lkb->lkb_cb_list, &proc->asts);
+ wake_up_interruptible(&proc->wait);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ spin_unlock(&proc->asts_spin);
+
+ if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) {
+ /* N.B. spin_lock locks_spin, not asts_spin */
+ spin_lock(&proc->locks_spin);
+ if (!list_empty(&lkb->lkb_ownqueue)) {
+ list_del_init(&lkb->lkb_ownqueue);
+ dlm_put_lkb(lkb);
+ }
+ spin_unlock(&proc->locks_spin);
+ }
+ out:
+ spin_unlock(&ls->ls_clear_proc_locks);
+}
+
+/*
+ * Handle a DLM_USER_LOCK write: a conversion, an orphan adoption or a new
+ * request, selected by params->flags.
+ *
+ * Returns the lock id for a successful new request or adoption, the result
+ * of dlm_user_convert() for a conversion, -ENOENT if the lockspace is gone,
+ * -EINVAL if castaddr or lksb is missing, or -ENOMEM.
+ */
+static int device_user_lock(struct dlm_user_proc *proc,
+			    struct dlm_lock_params *params)
+{
+	struct dlm_user_args *ua;
+	struct dlm_ls *ls;
+	uint32_t lkid;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = -EINVAL;
+	if (!params->castaddr || !params->lksb)
+		goto out;
+
+	error = -ENOMEM;
+	ua = kzalloc(sizeof(*ua), GFP_NOFS);
+	if (!ua)
+		goto out;
+
+	ua->proc = proc;
+	ua->xid = params->xid;
+	ua->user_lksb = params->lksb;
+	ua->castaddr = params->castaddr;
+	ua->castparam = params->castparam;
+	ua->bastaddr = params->bastaddr;
+	ua->bastparam = params->bastparam;
+
+	if (params->flags & DLM_LKF_CONVERT) {
+		error = dlm_user_convert(ls, ua,
+					 params->mode, params->flags,
+					 params->lkid, params->lvb);
+	} else if (params->flags & DLM_LKF_ORPHAN) {
+		error = dlm_user_adopt_orphan(ls, ua,
+					      params->mode, params->flags,
+					      params->name, params->namelen,
+					      &lkid);
+		/* success is reported as the adopted lock's id */
+		if (!error)
+			error = lkid;
+	} else {
+		error = dlm_user_request(ls, ua,
+					 params->mode, params->flags,
+					 params->name, params->namelen);
+		/* success is reported as the new lock's id */
+		if (!error)
+			error = ua->lksb.sb_lkid;
+	}
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * Handle a DLM_USER_UNLOCK write: cancel when DLM_LKF_CANCEL is set in
+ * params->flags, otherwise unlock.  Returns the result of the chosen
+ * operation, -ENOENT if the lockspace is gone, or -ENOMEM.
+ */
+static int device_user_unlock(struct dlm_user_proc *proc,
+			      struct dlm_lock_params *params)
+{
+	struct dlm_user_args *ua;
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	ua = kzalloc(sizeof(*ua), GFP_NOFS);
+	if (!ua) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	ua->proc = proc;
+	ua->user_lksb = params->lksb;
+	ua->castaddr = params->castaddr;
+	ua->castparam = params->castparam;
+
+	if (params->flags & DLM_LKF_CANCEL) {
+		error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+	} else {
+		error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+					params->lvb);
+	}
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * Handle a DLM_USER_DEADLOCK write by passing flags and lkid on to
+ * dlm_user_deadlock().  Returns its result, or -ENOENT if the lockspace of
+ * this proc can no longer be found.
+ */
+static int device_user_deadlock(struct dlm_user_proc *proc,
+				struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls = dlm_find_lockspace_local(proc->lockspace);
+	int rv;
+
+	if (!ls)
+		return -ENOENT;
+
+	rv = dlm_user_deadlock(ls, params->flags, params->lkid);
+	dlm_put_lockspace(ls);
+
+	return rv;
+}
+
+/*
+ * Register the misc device "<name_prefix>_<name>" for a lockspace.
+ * Returns 0 on success or if the device already exists, -ENOMEM if the
+ * device name cannot be allocated, or the error from misc_register().
+ */
+static int dlm_device_register(struct dlm_ls *ls, char *name)
+{
+	char *devname;
+	int error, len;
+
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
+	/* room for prefix, '_', name and the terminating NUL */
+	len = strlen(name) + strlen(name_prefix) + 2;
+	devname = kzalloc(len, GFP_NOFS);
+	if (!devname)
+		return -ENOMEM;
+	snprintf(devname, len, "%s_%s", name_prefix, name);
+
+	ls->ls_device.name = devname;
+	ls->ls_device.fops = &device_fops;
+	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+
+	error = misc_register(&ls->ls_device);
+	if (error) {
+		kfree(devname);
+		/* this has to be set to NULL
+		 * to avoid a double-free in dlm_device_deregister
+		 */
+		ls->ls_device.name = NULL;
+	}
+	return error;
+}
+
+/*
+ * Remove the misc device of a lockspace, if one was registered.
+ * Always returns 0.
+ */
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	misc_deregister(&ls->ls_device);
+	kfree(ls->ls_device.name);
+	/* a NULL name is what marks the device as "not registered" above;
+	   clear it so a repeated call cannot deregister or free twice */
+	ls->ls_device.name = NULL;
+	return 0;
+}
+
+/*
+ * Handle a DLM_USER_PURGE write by passing nodeid and pid on to
+ * dlm_user_purge().  Returns its result, or -ENOENT if the lockspace of
+ * this proc can no longer be found.
+ */
+static int device_user_purge(struct dlm_user_proc *proc,
+			     struct dlm_purge_params *params)
+{
+	struct dlm_ls *ls = dlm_find_lockspace_local(proc->lockspace);
+	int rv;
+
+	if (!ls)
+		return -ENOENT;
+
+	rv = dlm_user_purge(ls, proc, params->nodeid, params->pid);
+	dlm_put_lockspace(ls);
+
+	return rv;
+}
+
+/*
+ * Handle DLM_USER_CREATE_LOCKSPACE: create the user lockspace named in
+ * params and make sure its misc device is registered.  The caller needs
+ * CAP_SYS_ADMIN.
+ *
+ * Returns the minor number of the lockspace device on success, -EPERM
+ * without the capability, -ENOENT if the new lockspace cannot be looked up,
+ * or the error from lockspace creation or device registration.
+ */
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error, minor = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name,
+				       params->flags, DLM_USER_LVB_LEN, NULL,
+				       NULL, NULL, &lockspace);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_device_register(ls, params->name);
+	/* fetch the minor while the reference from the find above is still
+	   held; ls must not be dereferenced once it has been put */
+	if (!error)
+		minor = ls->ls_device.minor;
+	dlm_put_lockspace(ls);
+
+	if (error) {
+		dlm_release_lockspace(lockspace, 0);
+		return error;
+	}
+
+	return minor;
+}
+
+/*
+ * Handle DLM_USER_REMOVE_LOCKSPACE for the lockspace whose device has minor
+ * params->minor.  The caller needs CAP_SYS_ADMIN.
+ *
+ * Returns 0 on success (positive results from dlm_release_lockspace() are
+ * folded into 0), -EPERM, -ENOENT, or the negative error from the release.
+ */
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error, force;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ls = dlm_find_lockspace_device(params->minor);
+	if (!ls)
+		return -ENOENT;
+
+	force = (params->flags & DLM_USER_LSFLG_FORCEFREE) ? 2 : 0;
+
+	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
+
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
+
+	error = dlm_release_lockspace(lockspace, force);
+
+	return error > 0 ? 0 : error;
+}
+
+/* Check the user's version against ours: the major number must be equal
+   and the minor number must not be newer.  Returns 0 when compatible,
+   otherwise logs the mismatch and returns -EINVAL. */
+static int check_version(struct dlm_write_request *req)
+{
+	if (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+	    req->version[1] <= DLM_DEVICE_VERSION_MINOR)
+		return 0;
+
+	printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+	       "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+	       current->comm,
+	       task_pid_nr(current),
+	       req->version[0],
+	       req->version[1],
+	       req->version[2],
+	       DLM_DEVICE_VERSION_MAJOR,
+	       DLM_DEVICE_VERSION_MINOR,
+	       DLM_DEVICE_VERSION_PATCH);
+
+	return -EINVAL;
+}
+
+/*
+ * device_write
+ *
+ * device_user_lock
+ * dlm_user_request -> request_lock
+ * dlm_user_convert -> convert_lock
+ *
+ * device_user_unlock
+ * dlm_user_unlock -> unlock_lock
+ * dlm_user_cancel -> cancel_lock
+ *
+ * device_create_lockspace
+ * dlm_new_lockspace
+ *
+ * device_remove_lockspace
+ * dlm_release_lockspace
+ */
+
+/* a write to a lockspace device is a lock or unlock request, a write
+ to the control device is to create/remove a lockspace */
+
+/*
+ * Handle one request written to a lockspace device or the control device.
+ * The user buffer is copied in, version-checked, converted from the 32-bit
+ * layout when is64bit is clear (CONFIG_COMPAT only) and dispatched on cmd.
+ *
+ * file->private_data (proc) is NULL for the control device, which is how
+ * the two kinds of device are told apart below.
+ *
+ * Returns the handler's result, -EBADE on a version mismatch, -ENOMEM if
+ * the converted buffer cannot be allocated, the error from
+ * memdup_user_nul(), or -EINVAL for a bad size, an unknown command, a
+ * command sent to the wrong kind of device, or a lock/unlock on a proc that
+ * is closing.
+ */
+static ssize_t device_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_write_request *kbuf;
+ int error;
+
+ /* lower bound: with CONFIG_COMPAT the 32-bit request structure is used
+ for the minimum size check */
+#ifdef CONFIG_COMPAT
+ if (count < sizeof(struct dlm_write_request32))
+#else
+ if (count < sizeof(struct dlm_write_request))
+#endif
+ return -EINVAL;
+
+ /*
+ * can't compare against COMPAT/dlm_write_request32 because
+ * we don't yet know if is64bit is zero
+ */
+ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
+ return -EINVAL;
+
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* check_version() reports -EINVAL; the caller of write sees -EBADE */
+ if (check_version(kbuf)) {
+ error = -EBADE;
+ goto out_free;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (!kbuf->is64bit) {
+ struct dlm_write_request32 *k32buf;
+ int namelen = 0;
+
+ /* anything beyond the fixed 32-bit structure is treated as name */
+ if (count > sizeof(struct dlm_write_request32))
+ namelen = count - sizeof(struct dlm_write_request32);
+
+ k32buf = (struct dlm_write_request32 *)kbuf;
+
+ /* add 1 after namelen so that the name string is terminated */
+ kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
+ GFP_NOFS);
+ if (!kbuf) {
+ kfree(k32buf);
+ return -ENOMEM;
+ }
+
+ /* remembered so that device_read() returns 32-bit results */
+ if (proc)
+ set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+
+ /* from here on kbuf is the converted copy; the original is freed */
+ compat_input(kbuf, k32buf, namelen);
+ kfree(k32buf);
+ }
+#endif
+
+ /* do we really need this? can a write happen after a close? */
+ if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+ (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ /* default result for every "goto out_free" in the switch below */
+ error = -EINVAL;
+
+ switch (kbuf->cmd)
+ {
+ case DLM_USER_LOCK:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_free;
+ }
+ error = device_user_lock(proc, &kbuf->i.lock);
+ break;
+
+ case DLM_USER_UNLOCK:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_free;
+ }
+ error = device_user_unlock(proc, &kbuf->i.lock);
+ break;
+
+ case DLM_USER_DEADLOCK:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_free;
+ }
+ error = device_user_deadlock(proc, &kbuf->i.lock);
+ break;
+
+ case DLM_USER_CREATE_LOCKSPACE:
+ if (proc) {
+ log_print("create/remove only on control device");
+ goto out_free;
+ }
+ error = device_create_lockspace(&kbuf->i.lspace);
+ break;
+
+ case DLM_USER_REMOVE_LOCKSPACE:
+ if (proc) {
+ log_print("create/remove only on control device");
+ goto out_free;
+ }
+ error = device_remove_lockspace(&kbuf->i.lspace);
+ break;
+
+ case DLM_USER_PURGE:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_free;
+ }
+ error = device_user_purge(proc, &kbuf->i.purge);
+ break;
+
+ default:
+ log_print("Unknown command passed to DLM device : %d\n",
+ kbuf->cmd);
+ }
+
+ out_free:
+ kfree(kbuf);
+ return error;
+}
+
+/* Every process that opens the lockspace device has its own "proc" structure
+ hanging off the open file that's used to keep track of locks owned by the
+ process and asts that need to be delivered to the process. */
+
+/*
+ * Open of a lockspace device: allocate and initialise the per-open proc
+ * structure and hang it off file->private_data.  The lockspace reference
+ * taken by the lookup is kept on success and dropped in device_close().
+ * Returns 0, -ENOENT if no lockspace owns this minor, or -ENOMEM.
+ */
+static int device_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ls *ls = dlm_find_lockspace_device(iminor(inode));
+	struct dlm_user_proc *proc;
+
+	if (!ls)
+		return -ENOENT;
+
+	proc = kzalloc(sizeof(*proc), GFP_NOFS);
+	if (proc == NULL) {
+		dlm_put_lockspace(ls);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&proc->asts_spin);
+	spin_lock_init(&proc->locks_spin);
+	init_waitqueue_head(&proc->wait);
+	INIT_LIST_HEAD(&proc->asts);
+	INIT_LIST_HEAD(&proc->locks);
+	INIT_LIST_HEAD(&proc->unlocking);
+	proc->lockspace = ls->ls_local_handle;
+
+	file->private_data = proc;
+	return 0;
+}
+
+/*
+ * Release of a lockspace device: mark the proc as closing, let
+ * dlm_clear_proc_locks() deal with its locks, free the proc and drop both
+ * the lockspace reference taken here and the one kept since device_open().
+ * Returns 0, or -ENOENT if the lockspace can no longer be found.
+ * NOTE(review): on the -ENOENT path proc is not freed and private_data is
+ * left set - confirm whether that path can be reached.
+ */
+static int device_close(struct inode *inode, struct file *file)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_ls *ls;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ /* makes device_read() and lock/unlock writes fail with -EINVAL */
+ set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+
+ dlm_clear_proc_locks(ls, proc);
+
+ /* at this point no more lkb's should exist for this lockspace,
+ so there's no chance of dlm_user_add_ast() being called and
+ looking for lkb->ua->proc */
+
+ kfree(proc);
+ file->private_data = NULL;
+
+ dlm_put_lockspace(ls);
+ dlm_put_lockspace(ls); /* for the find in device_open() */
+
+ /* FIXME: AUTOFREE: if this ls is no longer used do
+ device_remove_lockspace() */
+
+ return 0;
+}
+
+/*
+ * Build a struct dlm_lock_result for one callback (or its 32-bit form when
+ * compat is set and CONFIG_COMPAT is enabled) and copy it to buf, followed
+ * by the lock value block when copy_lvb is set, an lvb exists and count
+ * leaves room for it.
+ *
+ * flags selects bast or cast address/parameter; mode is only reported for
+ * a bast.  Returns the total length reported in result.length (structure
+ * plus DLM_USER_LVB_LEN if the lvb was copied) or -EFAULT.
+ */
+static int copy_result_to_user(struct dlm_user_args *ua, int compat,
+ uint32_t flags, int mode, int copy_lvb,
+ char __user *buf, size_t count)
+{
+#ifdef CONFIG_COMPAT
+ struct dlm_lock_result32 result32;
+#endif
+ struct dlm_lock_result result;
+ void *resultptr;
+ int error=0;
+ int len;
+ int struct_len;
+
+ memset(&result, 0, sizeof(struct dlm_lock_result));
+ result.version[0] = DLM_DEVICE_VERSION_MAJOR;
+ result.version[1] = DLM_DEVICE_VERSION_MINOR;
+ result.version[2] = DLM_DEVICE_VERSION_PATCH;
+ /* copy the lksb fields that precede sb_lvbptr; the pointer itself
+ is left zero from the memset above */
+ memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr));
+ result.user_lksb = ua->user_lksb;
+
+ /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+ in a conversion unless the conversion is successful. See code
+ in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
+ notes that a new blocking AST address and parameter are set even if
+ the conversion fails, so maybe we should just do that. */
+
+ if (flags & DLM_CB_BAST) {
+ result.user_astaddr = ua->bastaddr;
+ result.user_astparam = ua->bastparam;
+ result.bast_mode = mode;
+ } else {
+ result.user_astaddr = ua->castaddr;
+ result.user_astparam = ua->castparam;
+ }
+
+ /* len grows if the lvb is appended; struct_len stays the size of the
+ fixed structure that is copied last */
+#ifdef CONFIG_COMPAT
+ if (compat)
+ len = sizeof(struct dlm_lock_result32);
+ else
+#endif
+ len = sizeof(struct dlm_lock_result);
+ struct_len = len;
+
+ /* copy lvb to userspace if there is one, it's been updated, and
+ the user buffer has space for it */
+
+ if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
+ if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+ DLM_USER_LVB_LEN)) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ result.lvb_offset = len;
+ len += DLM_USER_LVB_LEN;
+ }
+
+ result.length = len;
+ resultptr = &result;
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ compat_output(&result, &result32);
+ resultptr = &result32;
+ }
+#endif
+
+ if (copy_to_user(buf, resultptr, struct_len))
+ error = -EFAULT;
+ else
+ error = len;
+ out:
+ return error;
+}
+
+/*
+ * Copy the device interface version to buf.  Returns the size of
+ * struct dlm_device_version on success or -EFAULT; count is not used.
+ */
+static int copy_version_to_user(char __user *buf, size_t count)
+{
+	struct dlm_device_version ver;
+	const size_t len = sizeof(ver);
+
+	/* zero the whole structure before filling in the three numbers */
+	memset(&ver, 0, len);
+	ver.version[0] = DLM_DEVICE_VERSION_MAJOR;
+	ver.version[1] = DLM_DEVICE_VERSION_MINOR;
+	ver.version[2] = DLM_DEVICE_VERSION_PATCH;
+
+	if (copy_to_user(buf, &ver, len))
+		return -EFAULT;
+
+	return len;
+}
+
+/*
+ * a read returns a single ast described in a struct dlm_lock_result
+ *
+ * A read of exactly sizeof(struct dlm_device_version) bytes returns the
+ * device version instead; it is the only read accepted on the control
+ * device (proc == NULL).  Otherwise the call waits, unless O_NONBLOCK is
+ * set, until a callback is queued on proc->asts, dequeues one callback and
+ * copies it to buf.
+ *
+ * Returns the number of bytes produced by copy_result_to_user(), -EINVAL
+ * for a too-small buffer, a closing proc or a non-version read on the
+ * control device, -EAGAIN for an empty queue with O_NONBLOCK, -ERESTARTSYS
+ * when a signal is pending, or -EFAULT from the copy to userspace.
+ */
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_lkb *lkb;
+	DECLARE_WAITQUEUE(wait, current);
+	struct dlm_callback *cb;
+	int rv, ret, copy_lvb = 0;
+	int old_mode, new_mode;
+
+	if (count == sizeof(struct dlm_device_version)) {
+		rv = copy_version_to_user(buf, count);
+		return rv;
+	}
+
+	if (!proc) {
+		log_print("non-version read from control device %zu", count);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_lock_result32))
+#else
+	if (count < sizeof(struct dlm_lock_result))
+#endif
+		return -EINVAL;
+
+ try_another:
+
+	/* do we really need this? can a read happen after a close? */
+	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+		return -EINVAL;
+
+	spin_lock(&proc->asts_spin);
+	if (list_empty(&proc->asts)) {
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock(&proc->asts_spin);
+			return -EAGAIN;
+		}
+
+		add_wait_queue(&proc->wait, &wait);
+
+	repeat:
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&proc->asts) && !signal_pending(current)) {
+			/* the spinlock is dropped only around schedule() */
+			spin_unlock(&proc->asts_spin);
+			schedule();
+			spin_lock(&proc->asts_spin);
+			goto repeat;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&proc->wait, &wait);
+
+		if (signal_pending(current)) {
+			spin_unlock(&proc->asts_spin);
+			return -ERESTARTSYS;
+		}
+	}
+
+	/* if we empty lkb_callbacks, we don't want to unlock the spinlock
+	   without removing lkb_cb_list; so empty lkb_cb_list is always
+	   consistent with empty lkb_callbacks */
+
+	lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list);
+
+	/* rem_lkb_callback sets a new lkb_last_cast */
+	old_mode = lkb->lkb_last_cast->mode;
+
+	rv = dlm_dequeue_lkb_callback(lkb, &cb);
+	switch (rv) {
+	case DLM_DEQUEUE_CALLBACK_EMPTY:
+		/* this shouldn't happen; lkb should have been removed from
+		 * list when last item was dequeued
+		 */
+		log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
+		list_del_init(&lkb->lkb_cb_list);
+		spin_unlock(&proc->asts_spin);
+		/* removes ref for proc->asts, may cause lkb to be freed */
+		dlm_put_lkb(lkb);
+		WARN_ON_ONCE(1);
+		goto try_another;
+	case DLM_DEQUEUE_CALLBACK_LAST:
+		list_del_init(&lkb->lkb_cb_list);
+		clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags);
+		break;
+	case DLM_DEQUEUE_CALLBACK_SUCCESS:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+	spin_unlock(&proc->asts_spin);
+
+	if (cb->flags & DLM_CB_BAST) {
+		trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode);
+	} else if (cb->flags & DLM_CB_CAST) {
+		new_mode = cb->mode;
+
+		if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr &&
+		    dlm_lvb_operations[old_mode + 1][new_mode + 1])
+			copy_lvb = 1;
+
+		lkb->lkb_lksb->sb_status = cb->sb_status;
+		lkb->lkb_lksb->sb_flags = cb->sb_flags;
+		trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
+	}
+
+	/* keep the copy result apart from rv: rv still has to tell us below
+	   whether this was the last queued callback of the lkb */
+	ret = copy_result_to_user(lkb->lkb_ua,
+				  test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				  cb->flags, cb->mode, copy_lvb, buf, count);
+
+	kref_put(&cb->ref, dlm_release_callback);
+
+	/* removes ref for proc->asts, may cause lkb to be freed */
+	if (rv == DLM_DEQUEUE_CALLBACK_LAST)
+		dlm_put_lkb(lkb);
+
+	return ret;
+}
+
+/*
+ * Poll handler of a lockspace device: readable (EPOLLIN | EPOLLRDNORM)
+ * whenever proc->asts is not empty, otherwise no events.
+ */
+static __poll_t device_poll(struct file *file, poll_table *wait)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &proc->wait, wait);
+
+	spin_lock(&proc->asts_spin);
+	if (!list_empty(&proc->asts))
+		mask = EPOLLIN | EPOLLRDNORM;
+	spin_unlock(&proc->asts_spin);
+
+	return mask;
+}
+
+/*
+ * Returns 1 when the userspace daemon is considered available, else 0.
+ */
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+	if (dlm_monitor_unused)
+		return 1;
+
+	/* otherwise the daemon counts as present while it keeps the
+	   monitor device open */
+	return atomic_read(&dlm_monitor_opened) != 0;
+}
+
+/* Open of the control device: there is no per-open state, so private_data
+   is NULL; device_read() and device_write() use that to recognise the
+   control device. */
+static int ctl_device_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return 0;
+}
+
+/* Release of the control device: nothing to undo, always returns 0. */
+static int ctl_device_close(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+/* Open of the monitor device: count the open and record that the monitor
+   device has been used at least once, both of which are read by
+   dlm_user_daemon_available(). */
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+ atomic_inc(&dlm_monitor_opened);
+ dlm_monitor_unused = 0;
+ return 0;
+}
+
+/* Release of the monitor device: when the last open goes away, stop the
+   lockspaces.  Always returns 0. */
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (!atomic_dec_and_test(&dlm_monitor_opened))
+		return 0;
+
+	dlm_stop_lockspaces();
+	return 0;
+}
+
+/* file operations of the per-lockspace devices registered by
+   dlm_device_register() */
+static const struct file_operations device_fops = {
+ .open = device_open,
+ .release = device_close,
+ .read = device_read,
+ .write = device_write,
+ .poll = device_poll,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+/* file operations of the control device; it shares device_read() and
+   device_write() with the lockspace devices and has no poll handler */
+static const struct file_operations ctl_device_fops = {
+ .open = ctl_device_open,
+ .release = ctl_device_close,
+ .read = device_read,
+ .write = device_write,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+/* the "dlm-control" misc device, registered in dlm_user_init() */
+static struct miscdevice ctl_device = {
+ .name = "dlm-control",
+ .fops = &ctl_device_fops,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+/* file operations of the monitor device: only open and release are
+   tracked, there is no read or write handler */
+static const struct file_operations monitor_device_fops = {
+ .open = monitor_device_open,
+ .release = monitor_device_close,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+/* the "dlm-monitor" misc device, registered in dlm_user_init() */
+static struct miscdevice monitor_device = {
+ .name = "dlm-monitor",
+ .fops = &monitor_device_fops,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+/*
+ * Register the control and monitor misc devices.  If the monitor device
+ * cannot be registered the control device is removed again.
+ * Returns 0 on success or the error from misc_register().
+ */
+int __init dlm_user_init(void)
+{
+	int error;
+
+	atomic_set(&dlm_monitor_opened, 0);
+
+	error = misc_register(&ctl_device);
+	if (error) {
+		log_print("misc_register failed for control device");
+		return error;
+	}
+
+	error = misc_register(&monitor_device);
+	if (!error)
+		return 0;
+
+	log_print("misc_register failed for monitor device");
+	misc_deregister(&ctl_device);
+	return error;
+}
+
+/* Remove the two misc devices registered by dlm_user_init(). */
+void dlm_user_exit(void)
+{
+ misc_deregister(&ctl_device);
+ misc_deregister(&monitor_device);
+}
+
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 0000000000..2caf8e6e24
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
+ */
+
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+
+/* drop all callbacks queued on an lkb; caller holds proc->asts_spin */
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb);
+/* queue a cast/bast of a userspace lock for delivery by a device read */
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags);
+/* register / remove the dlm-control and dlm-monitor misc devices */
+int dlm_user_init(void);
+void dlm_user_exit(void);
+/* remove the misc device of a lockspace if one was registered; returns 0 */
+int dlm_device_deregister(struct dlm_ls *ls);
+/* returns 1 if the userspace daemon is considered available, else 0 */
+int dlm_user_daemon_available(void);
+
+#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 0000000000..f2bc401f31
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
+#define DLM_ERRNO_EDEADLK 35
+#define DLM_ERRNO_EBADR 53
+#define DLM_ERRNO_EBADSLT 57
+#define DLM_ERRNO_EPROTO 71
+#define DLM_ERRNO_EOPNOTSUPP 95
+#define DLM_ERRNO_ETIMEDOUT 110
+#define DLM_ERRNO_EINPROGRESS 115
+
+/* higher errno values are inconsistent across architectures, so select
+   one set of values for use on the wire */
+
+/* Translate a local negative errno into the fixed value used on the wire;
+   values without an entry in the table are returned unchanged. */
+int to_dlm_errno(int err)
+{
+	static const int local_to_wire[][2] = {
+		{ -EDEADLK,	-DLM_ERRNO_EDEADLK },
+		{ -EBADR,	-DLM_ERRNO_EBADR },
+		{ -EBADSLT,	-DLM_ERRNO_EBADSLT },
+		{ -EPROTO,	-DLM_ERRNO_EPROTO },
+		{ -EOPNOTSUPP,	-DLM_ERRNO_EOPNOTSUPP },
+		{ -ETIMEDOUT,	-DLM_ERRNO_ETIMEDOUT },
+		{ -EINPROGRESS,	-DLM_ERRNO_EINPROGRESS },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(local_to_wire) / sizeof(local_to_wire[0]); i++) {
+		if (local_to_wire[i][0] == err)
+			return local_to_wire[i][1];
+	}
+
+	return err;
+}
+
+int from_dlm_errno(int err)
+{
+ switch (err) {
+ case -DLM_ERRNO_EDEADLK:
+ return -EDEADLK;
+ case -DLM_ERRNO_EBADR:
+ return -EBADR;
+ case -DLM_ERRNO_EBADSLT:
+ return -EBADSLT;
+ case -DLM_ERRNO_EPROTO:
+ return -EPROTO;
+ case -DLM_ERRNO_EOPNOTSUPP:
+ return -EOPNOTSUPP;
+ case -DLM_ERRNO_ETIMEDOUT:
+ return -ETIMEDOUT;
+ case -DLM_ERRNO_EINPROGRESS:
+ return -EINPROGRESS;
+ }
+ return err;
+}
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 0000000000..b6a4b8adca
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+/* map a local negative errno to its on-the-wire value; others pass through */
+int to_dlm_errno(int err);
+/* map an on-the-wire error value back to the local errno */
+int from_dlm_errno(int err);
+
+#endif
+