author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
commit     ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree       b2d64bc10158fdd5497876388cd68142ca374ed3 /fs/dlm
parent     Initial commit. (diff)
Adding upstream version 6.6.15. (tag: upstream/6.6.15)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/dlm')
37 files changed, 19585 insertions, 0 deletions
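The diff below imports the DLM core: the Kconfig/Makefile glue, the callback-delivery code in ast.c, the configfs cluster/space/comm interface in config.c, the debugfs lock dumps in debug_fs.c, and the resource-directory code in dir.c. For orientation, here is a minimal, hedged sketch of the consumer side of this subsystem; it is illustrative only. The prototypes are the ones exported through include/linux/dlm.h, which is not part of this diff, and the lockspace name, resource name, and helpers (example_take_lock, my_cast, my_bast) are invented for the example. The completion (AST) and blocking (BAST) callbacks passed to dlm_lock() are what ast.c's dlm_add_cb()/dlm_callback_work() below queue and deliver.

/*
 * Minimal sketch of an in-kernel DLM consumer (illustrative, not part of
 * this patch): join a lockspace, then request an exclusive lock on a
 * resource. Error handling is abbreviated.
 */
#include <linux/dlm.h>
#include <linux/printk.h>
#include <linux/string.h>

static struct dlm_lksb my_lksb;		/* sb_status/sb_lkid are filled in before the completion AST runs */

/* completion AST: delivered asynchronously via dlm_callback_work() in ast.c */
static void my_cast(void *astarg)
{
	pr_info("example: lock granted, sb_status=%d lkid=%x\n",
		my_lksb.sb_status, my_lksb.sb_lkid);
}

/* blocking AST: another node has requested an incompatible mode */
static void my_bast(void *astarg, int mode)
{
	pr_info("example: blocking request for mode %d\n", mode);
}

static int example_take_lock(void)
{
	dlm_lockspace_t *ls;
	int error;

	/* join (or create) a lockspace named "example" with 64-byte LVBs */
	error = dlm_new_lockspace("example", NULL, 0, 64,
				  NULL, NULL, NULL, &ls);
	if (error)
		return error;

	/* asynchronous request for an EX lock on resource "my-resource" */
	error = dlm_lock(ls, DLM_LOCK_EX, &my_lksb, 0,
			 "my-resource", strlen("my-resource"), 0,
			 my_cast, NULL, my_bast);
	if (error)
		dlm_release_lockspace(ls, 1);
	return error;
}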
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig new file mode 100644 index 0000000000..f82a495276 --- /dev/null +++ b/fs/dlm/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig DLM + tristate "Distributed Lock Manager (DLM)" + depends on INET + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) + select IP_SCTP + help + A general purpose distributed lock manager for kernel or userspace + applications. + +config DLM_DEBUG + bool "DLM debugging" + depends on DLM + help + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile new file mode 100644 index 0000000000..5a471af1d1 --- /dev/null +++ b/fs/dlm/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DLM) += dlm.o +dlm-y := ast.o \ + config.o \ + dir.o \ + lock.o \ + lockspace.o \ + main.o \ + member.o \ + memory.o \ + midcomms.o \ + lowcomms.o \ + plock.o \ + rcom.o \ + recover.o \ + recoverd.o \ + requestqueue.o \ + user.o \ + util.o +dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o + diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c new file mode 100644 index 0000000000..1f2f70a1b8 --- /dev/null +++ b/fs/dlm/ast.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include <trace/events/dlm.h> + +#include "dlm_internal.h" +#include "memory.h" +#include "lock.h" +#include "user.h" +#include "ast.h" + +void dlm_release_callback(struct kref *ref) +{ + struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref); + + dlm_free_cb(cb); +} + +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to) +{ + if (*from) + kref_put(&(*from)->ref, dlm_release_callback); + + if (to) + kref_get(&to->ref); + + *from = to; +} + +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv = DLM_ENQUEUE_CALLBACK_SUCCESS; + struct dlm_callback *cb; + int prev_mode; + + if (flags & DLM_CB_BAST) { + /* if cb is a bast, it should be skipped if the blocking mode is + * compatible with the last granted mode + */ + if (lkb->lkb_last_cast) { + if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) { + log_debug(ls, "skip %x bast mode %d for cast mode %d", + lkb->lkb_id, mode, + lkb->lkb_last_cast->mode); + goto out; + } + } + + /* + * Suppress some redundant basts here, do more on removal. + * Don't even add a bast if the callback just before it + * is a bast for the same mode or a more restrictive mode. 
+ * (the addional > PR check is needed for PR/CW inversion) + */ + if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) { + prev_mode = lkb->lkb_last_cb->mode; + + if ((prev_mode == mode) || + (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { + log_debug(ls, "skip %x add bast mode %d for bast mode %d", + lkb->lkb_id, mode, prev_mode); + goto out; + } + } + } + + cb = dlm_allocate_cb(); + if (!cb) { + rv = DLM_ENQUEUE_CALLBACK_FAILURE; + goto out; + } + + cb->flags = flags; + cb->mode = mode; + cb->sb_status = status; + cb->sb_flags = (sbflags & 0x000000FF); + kref_init(&cb->ref); + if (!test_and_set_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags)) + rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; + + list_add_tail(&cb->list, &lkb->lkb_callbacks); + + if (flags & DLM_CB_CAST) + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb); + + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb); + + out: + return rv; +} + +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb) +{ + /* oldest undelivered cb is callbacks first entry */ + *cb = list_first_entry_or_null(&lkb->lkb_callbacks, + struct dlm_callback, list); + if (!*cb) + return DLM_DEQUEUE_CALLBACK_EMPTY; + + /* remove it from callbacks so shift others down */ + list_del(&(*cb)->list); + if (list_empty(&lkb->lkb_callbacks)) + return DLM_DEQUEUE_CALLBACK_LAST; + + return DLM_DEQUEUE_CALLBACK_SUCCESS; +} + +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv; + + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { + dlm_user_add_ast(lkb, flags, mode, status, sbflags); + return; + } + + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: + kref_get(&lkb->lkb_ref); + + spin_lock(&ls->ls_cb_lock); + if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { + list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); + } else { + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + } + spin_unlock(&ls->ls_cb_lock); + break; + case DLM_ENQUEUE_CALLBACK_FAILURE: + WARN_ON_ONCE(1); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; + } + spin_unlock(&lkb->lkb_cb_lock); +} + +void dlm_callback_work(struct work_struct *work) +{ + struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work); + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + void (*castfn) (void *astparam); + void (*bastfn) (void *astparam, int mode); + struct dlm_callback *cb; + int rv; + + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) { + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); + spin_unlock(&lkb->lkb_cb_lock); + goto out; + } + spin_unlock(&lkb->lkb_cb_lock); + + for (;;) { + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(ls, lkb, cb->mode); + lkb->lkb_last_bast_time = ktime_get(); + lkb->lkb_last_bast_mode = cb->mode; + bastfn(lkb->lkb_astparam, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; + trace_dlm_ast(ls, lkb); + lkb->lkb_last_cast_time = ktime_get(); + castfn(lkb->lkb_astparam); + } + + kref_put(&cb->ref, dlm_release_callback); + + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { + clear_bit(DLM_IFL_CB_PENDING_BIT, 
&lkb->lkb_iflags); + spin_unlock(&lkb->lkb_cb_lock); + break; + } + spin_unlock(&lkb->lkb_cb_lock); + } + +out: + /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ + dlm_put_lkb(lkb); +} + +int dlm_callback_start(struct dlm_ls *ls) +{ + ls->ls_callback_wq = alloc_workqueue("dlm_callback", + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + if (!ls->ls_callback_wq) { + log_print("can't start dlm_callback workqueue"); + return -ENOMEM; + } + return 0; +} + +void dlm_callback_stop(struct dlm_ls *ls) +{ + if (ls->ls_callback_wq) + destroy_workqueue(ls->ls_callback_wq); +} + +void dlm_callback_suspend(struct dlm_ls *ls) +{ + if (ls->ls_callback_wq) { + spin_lock(&ls->ls_cb_lock); + set_bit(LSFL_CB_DELAY, &ls->ls_flags); + spin_unlock(&ls->ls_cb_lock); + + flush_workqueue(ls->ls_callback_wq); + } +} + +#define MAX_CB_QUEUE 25 + +void dlm_callback_resume(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + int count = 0, sum = 0; + bool empty; + + if (!ls->ls_callback_wq) + return; + +more: + spin_lock(&ls->ls_cb_lock); + list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { + list_del_init(&lkb->lkb_cb_list); + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + count++; + if (count == MAX_CB_QUEUE) + break; + } + empty = list_empty(&ls->ls_cb_delay); + if (empty) + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + spin_unlock(&ls->ls_cb_lock); + + sum += count; + if (!empty) { + count = 0; + cond_resched(); + goto more; + } + + if (sum) + log_rinfo(ls, "%s %d", __func__, sum); +} + diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h new file mode 100644 index 0000000000..ce007892dc --- /dev/null +++ b/fs/dlm/ast.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __ASTD_DOT_H__ +#define __ASTD_DOT_H__ + +#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1 +#define DLM_ENQUEUE_CALLBACK_SUCCESS 0 +#define DLM_ENQUEUE_CALLBACK_FAILURE -1 +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +#define DLM_DEQUEUE_CALLBACK_EMPTY 2 +#define DLM_DEQUEUE_CALLBACK_LAST 1 +#define DLM_DEQUEUE_CALLBACK_SUCCESS 0 +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb); +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to); + +void dlm_release_callback(struct kref *ref); +void dlm_callback_work(struct work_struct *work); +int dlm_callback_start(struct dlm_ls *ls); +void dlm_callback_stop(struct dlm_ls *ls); +void dlm_callback_suspend(struct dlm_ls *ls); +void dlm_callback_resume(struct dlm_ls *ls); + +#endif + + diff --git a/fs/dlm/config.c b/fs/dlm/config.c new file mode 100644 index 0000000000..e55e0a2cd2 --- /dev/null +++ b/fs/dlm/config.c @@ -0,0 +1,966 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/configfs.h> +#include <linux/slab.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/dlmconstants.h> +#include <net/ipv6.h> +#include <net/sock.h> + +#include "config.h" +#include "midcomms.h" +#include "lowcomms.h" + +/* + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/comms/<comm>/nodeid + * /config/dlm/<cluster>/comms/<comm>/local + * /config/dlm/<cluster>/comms/<comm>/addr (write only) + * /config/dlm/<cluster>/comms/<comm>/addr_list (read only) + * The <cluster> level is useless, but I haven't figured out how to avoid it. + */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct dlm_comm *local_comm; +static uint32_t dlm_comm_count; + +struct dlm_clusters; +struct dlm_cluster; +struct dlm_spaces; +struct dlm_space; +struct dlm_comms; +struct dlm_comm; +struct dlm_nodes; +struct dlm_node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static struct configfs_attribute *comm_attrs[]; +static struct configfs_attribute *node_attrs[]; + +struct dlm_cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; + unsigned int cl_log_info; + unsigned int cl_protocol; + unsigned int cl_mark; + unsigned int cl_new_rsb_count; + unsigned int cl_recover_callbacks; + char cl_cluster_name[DLM_LOCKSPACE_LEN]; + + struct dlm_spaces *sps; + struct dlm_comms *cms; +}; + +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) +{ + return i ? 
container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; +} + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_LOG_INFO, + CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_MARK, + CLUSTER_ATTR_NEW_RSB_COUNT, + CLUSTER_ATTR_RECOVER_CALLBACKS, + CLUSTER_ATTR_CLUSTER_NAME, +}; + +static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf) +{ + struct dlm_cluster *cl = config_item_to_cluster(item); + return sprintf(buf, "%s\n", cl->cl_cluster_name); +} + +static ssize_t cluster_cluster_name_store(struct config_item *item, + const char *buf, size_t len) +{ + struct dlm_cluster *cl = config_item_to_cluster(item); + + strscpy(dlm_config.ci_cluster_name, buf, + sizeof(dlm_config.ci_cluster_name)); + strscpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); + return len; +} + +CONFIGFS_ATTR(cluster_, cluster_name); + +static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, + int *info_field, int (*check_cb)(unsigned int x), + const char *buf, size_t len) +{ + unsigned int x; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + rc = kstrtouint(buf, 0, &x); + if (rc) + return rc; + + if (check_cb) { + rc = check_cb(x); + if (rc) + return rc; + } + + *cl_field = x; + *info_field = x; + + return len; +} + +#define CLUSTER_ATTR(name, check_cb) \ +static ssize_t cluster_##name##_store(struct config_item *item, \ + const char *buf, size_t len) \ +{ \ + struct dlm_cluster *cl = config_item_to_cluster(item); \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_cb, buf, len); \ +} \ +static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ +{ \ + struct dlm_cluster *cl = config_item_to_cluster(item); \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +CONFIGFS_ATTR(cluster_, name); + +static int dlm_check_protocol_and_dlm_running(unsigned int x) +{ + switch (x) { + case 0: + /* TCP */ + break; + case 1: + /* SCTP */ + break; + default: + return -EINVAL; + } + + if (dlm_lowcomms_is_running()) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero_and_dlm_running(unsigned int x) +{ + if (!x) + return -EINVAL; + + if (dlm_lowcomms_is_running()) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero(unsigned int x) +{ + if (!x) + return -EINVAL; + + return 0; +} + +static int dlm_check_buffer_size(unsigned int x) +{ + if (x < DLM_MAX_SOCKET_BUFSIZE) + return -EINVAL; + + return 0; +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, + [CLUSTER_ATTR_LOG_DEBUG] = 
&cluster_attr_log_debug, + [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, + [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, + [CLUSTER_ATTR_MARK] = &cluster_attr_mark, + [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, + NULL, +}; + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, + COMM_ATTR_ADDR_LIST, + COMM_ATTR_MARK, +}; + +enum { + NODE_ATTR_NODEID = 0, + NODE_ATTR_WEIGHT, +}; + +struct dlm_clusters { + struct configfs_subsystem subsys; +}; + +struct dlm_spaces { + struct config_group ss_group; +}; + +struct dlm_space { + struct config_group group; + struct list_head members; + struct mutex members_lock; + int members_count; + struct dlm_nodes *nds; +}; + +struct dlm_comms { + struct config_group cs_group; +}; + +struct dlm_comm { + struct config_item item; + int seq; + int nodeid; + int local; + int addr_count; + unsigned int mark; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +struct dlm_nodes { + struct config_group ns_group; +}; + +struct dlm_node { + struct config_item item; + struct list_head list; /* space->members */ + int nodeid; + int weight; + int new; + int comm_seq; /* copy of cm->seq when nd->nodeid is set */ +}; + +static struct configfs_group_operations clusters_ops = { + .make_group = make_cluster, + .drop_item = drop_cluster, +}; + +static struct configfs_item_operations cluster_ops = { + .release = release_cluster, +}; + +static struct configfs_group_operations spaces_ops = { + .make_group = make_space, + .drop_item = drop_space, +}; + +static struct configfs_item_operations space_ops = { + .release = release_space, +}; + +static struct configfs_group_operations comms_ops = { + .make_item = make_comm, + .drop_item = drop_comm, +}; + +static struct configfs_item_operations comm_ops = { + .release = release_comm, +}; + +static struct configfs_group_operations nodes_ops = { + .make_item = make_node, + .drop_item = drop_node, +}; + +static struct configfs_item_operations node_ops = { + .release = release_node, +}; + +static const struct config_item_type clusters_type = { + .ct_group_ops = &clusters_ops, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type cluster_type = { + .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type spaces_type = { + .ct_group_ops = &spaces_ops, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type space_type = { + .ct_item_ops = &space_ops, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type comms_type = { + .ct_group_ops = &comms_ops, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type comm_type = { + .ct_item_ops = &comm_ops, + .ct_attrs = comm_attrs, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type nodes_type = { + .ct_group_ops = &nodes_ops, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type node_type = { + .ct_item_ops = &node_ops, + .ct_attrs = node_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct dlm_space *config_item_to_space(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_space, group) : + NULL; +} + +static struct dlm_comm *config_item_to_comm(struct config_item *i) +{ + return i ? container_of(i, struct dlm_comm, item) : NULL; +} + +static struct dlm_node *config_item_to_node(struct config_item *i) +{ + return i ? 
container_of(i, struct dlm_node, item) : NULL; +} + +static struct config_group *make_cluster(struct config_group *g, + const char *name) +{ + struct dlm_cluster *cl = NULL; + struct dlm_spaces *sps = NULL; + struct dlm_comms *cms = NULL; + + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); + + if (!cl || !sps || !cms) + goto fail; + + cl->sps = sps; + cl->cms = cms; + + config_group_init_type_name(&cl->group, name, &cluster_type); + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); + + configfs_add_default_group(&sps->ss_group, &cl->group); + configfs_add_default_group(&cms->cs_group, &cl->group); + + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_log_info = dlm_config.ci_log_info; + cl->cl_protocol = dlm_config.ci_protocol; + cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; + cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; + memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, + DLM_LOCKSPACE_LEN); + + space_list = &sps->ss_group; + comm_list = &cms->cs_group; + return &cl->group; + + fail: + kfree(cl); + kfree(sps); + kfree(cms); + return ERR_PTR(-ENOMEM); +} + +static void drop_cluster(struct config_group *g, struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + + configfs_remove_default_groups(&cl->group); + + space_list = NULL; + comm_list = NULL; + + config_item_put(i); +} + +static void release_cluster(struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + + kfree(cl->sps); + kfree(cl->cms); + kfree(cl); +} + +static struct config_group *make_space(struct config_group *g, const char *name) +{ + struct dlm_space *sp = NULL; + struct dlm_nodes *nds = NULL; + + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); + + if (!sp || !nds) + goto fail; + + config_group_init_type_name(&sp->group, name, &space_type); + + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + configfs_add_default_group(&nds->ns_group, &sp->group); + + INIT_LIST_HEAD(&sp->members); + mutex_init(&sp->members_lock); + sp->members_count = 0; + sp->nds = nds; + return &sp->group; + + fail: + kfree(sp); + kfree(nds); + return ERR_PTR(-ENOMEM); +} + +static void drop_space(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + + /* assert list_empty(&sp->members) */ + + configfs_remove_default_groups(&sp->group); + config_item_put(i); +} + +static void release_space(struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); + kfree(sp); +} + +static struct config_item *make_comm(struct config_group *g, const char *name) +{ + struct dlm_comm *cm; + + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); + if (!cm) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&cm->item, name, &comm_type); + + cm->seq = dlm_comm_count++; + if (!cm->seq) + cm->seq = dlm_comm_count++; + + cm->nodeid = -1; + cm->local = 0; + cm->addr_count = 0; + cm->mark = 0; + return &cm->item; +} + +static void drop_comm(struct config_group *g, 
struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + if (local_comm == cm) + local_comm = NULL; + dlm_midcomms_close(cm->nodeid); + while (cm->addr_count--) + kfree(cm->addr[cm->addr_count]); + config_item_put(i); +} + +static void release_comm(struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + kfree(cm); +} + +static struct config_item *make_node(struct config_group *g, const char *name) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; + + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); + if (!nd) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&nd->item, name, &node_type); + nd->nodeid = -1; + nd->weight = 1; /* default weight of 1 if none is set */ + nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ + + mutex_lock(&sp->members_lock); + list_add(&nd->list, &sp->members); + sp->members_count++; + mutex_unlock(&sp->members_lock); + + return &nd->item; +} + +static void drop_node(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); + + mutex_lock(&sp->members_lock); + list_del(&nd->list); + sp->members_count--; + mutex_unlock(&sp->members_lock); + + config_item_put(i); +} + +static void release_node(struct config_item *i) +{ + struct dlm_node *nd = config_item_to_node(i); + kfree(nd); +} + +static struct dlm_clusters clusters_root = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "dlm", + .ci_type = &clusters_type, + }, + }, + }, +}; + +int __init dlm_config_init(void) +{ + config_group_init(&clusters_root.subsys.su_group); + mutex_init(&clusters_root.subsys.su_mutex); + return configfs_register_subsystem(&clusters_root.subsys); +} + +void dlm_config_exit(void) +{ + configfs_unregister_subsystem(&clusters_root.subsys); +} + +/* + * Functions for user space to read/write attributes + */ + +static ssize_t comm_nodeid_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid); +} + +static ssize_t comm_nodeid_store(struct config_item *item, const char *buf, + size_t len) +{ + int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid); + + if (rc) + return rc; + return len; +} + +static ssize_t comm_local_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%d\n", config_item_to_comm(item)->local); +} + +static ssize_t comm_local_store(struct config_item *item, const char *buf, + size_t len) +{ + struct dlm_comm *cm = config_item_to_comm(item); + int rc = kstrtoint(buf, 0, &cm->local); + + if (rc) + return rc; + if (cm->local && !local_comm) + local_comm = cm; + return len; +} + +static ssize_t comm_addr_store(struct config_item *item, const char *buf, + size_t len) +{ + struct dlm_comm *cm = config_item_to_comm(item); + struct sockaddr_storage *addr; + int rv; + + if (len != sizeof(struct sockaddr_storage)) + return -EINVAL; + + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + + addr = kzalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + + memcpy(addr, buf, len); + + rv = dlm_midcomms_addr(cm->nodeid, addr, len); + if (rv) { + kfree(addr); + return rv; + } + + cm->addr[cm->addr_count++] = addr; + return len; +} + +static ssize_t comm_addr_list_show(struct config_item *item, char *buf) +{ + struct dlm_comm *cm = config_item_to_comm(item); + ssize_t s; + ssize_t allowance; + int i; + struct sockaddr_storage *addr; + struct sockaddr_in *addr_in; + 
struct sockaddr_in6 *addr_in6; + + /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ + char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; + + + /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ + allowance = 4096; + buf[0] = '\0'; + + for (i = 0; i < cm->addr_count; i++) { + addr = cm->addr[i]; + + switch(addr->ss_family) { + case AF_INET: + addr_in = (struct sockaddr_in *)addr; + s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); + break; + case AF_INET6: + addr_in6 = (struct sockaddr_in6 *)addr; + s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); + break; + default: + s = sprintf(buf0, "%s\n", "<UNKNOWN>"); + break; + } + allowance -= s; + if (allowance >= 0) + strcat(buf, buf0); + else { + allowance += s; + break; + } + } + return 4096 - allowance; +} + +static ssize_t comm_mark_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%u\n", config_item_to_comm(item)->mark); +} + +static ssize_t comm_mark_store(struct config_item *item, const char *buf, + size_t len) +{ + struct dlm_comm *comm; + unsigned int mark; + int rc; + + rc = kstrtouint(buf, 0, &mark); + if (rc) + return rc; + + if (mark == 0) + mark = dlm_config.ci_mark; + + comm = config_item_to_comm(item); + rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); + if (rc) + return rc; + + comm->mark = mark; + return len; +} + +CONFIGFS_ATTR(comm_, nodeid); +CONFIGFS_ATTR(comm_, local); +CONFIGFS_ATTR(comm_, mark); +CONFIGFS_ATTR_WO(comm_, addr); +CONFIGFS_ATTR_RO(comm_, addr_list); + +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid, + [COMM_ATTR_LOCAL] = &comm_attr_local, + [COMM_ATTR_ADDR] = &comm_attr_addr, + [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list, + [COMM_ATTR_MARK] = &comm_attr_mark, + NULL, +}; + +static ssize_t node_nodeid_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid); +} + +static ssize_t node_nodeid_store(struct config_item *item, const char *buf, + size_t len) +{ + struct dlm_node *nd = config_item_to_node(item); + uint32_t seq = 0; + int rc = kstrtoint(buf, 0, &nd->nodeid); + + if (rc) + return rc; + dlm_comm_seq(nd->nodeid, &seq); + nd->comm_seq = seq; + return len; +} + +static ssize_t node_weight_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%d\n", config_item_to_node(item)->weight); +} + +static ssize_t node_weight_store(struct config_item *item, const char *buf, + size_t len) +{ + int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight); + + if (rc) + return rc; + return len; +} + +CONFIGFS_ATTR(node_, nodeid); +CONFIGFS_ATTR(node_, weight); + +static struct configfs_attribute *node_attrs[] = { + [NODE_ATTR_NODEID] = &node_attr_nodeid, + [NODE_ATTR_WEIGHT] = &node_attr_weight, + NULL, +}; + +/* + * Functions for the dlm to get the info that's been configured + */ + +static struct dlm_space *get_space(char *name) +{ + struct config_item *i; + + if (!space_list) + return NULL; + + mutex_lock(&space_list->cg_subsys->su_mutex); + i = config_group_find_item(space_list, name); + mutex_unlock(&space_list->cg_subsys->su_mutex); + + return config_item_to_space(i); +} + +static void put_space(struct dlm_space *sp) +{ + config_item_put(&sp->group.cg_item); +} + +static struct dlm_comm *get_comm(int nodeid) +{ + struct config_item *i; + struct dlm_comm *cm = NULL; + int found = 0; + + if (!comm_list) + return NULL; + + mutex_lock(&clusters_root.subsys.su_mutex); + + list_for_each_entry(i, 
&comm_list->cg_children, ci_entry) { + cm = config_item_to_comm(i); + + if (cm->nodeid != nodeid) + continue; + found = 1; + config_item_get(i); + break; + } + mutex_unlock(&clusters_root.subsys.su_mutex); + + if (!found) + cm = NULL; + return cm; +} + +static void put_comm(struct dlm_comm *cm) +{ + config_item_put(&cm->item); +} + +/* caller must free mem */ +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out) +{ + struct dlm_space *sp; + struct dlm_node *nd; + struct dlm_config_node *nodes, *node; + int rv, count; + + sp = get_space(lsname); + if (!sp) + return -EEXIST; + + mutex_lock(&sp->members_lock); + if (!sp->members_count) { + rv = -EINVAL; + printk(KERN_ERR "dlm: zero members_count\n"); + goto out; + } + + count = sp->members_count; + + nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); + if (!nodes) { + rv = -ENOMEM; + goto out; + } + + node = nodes; + list_for_each_entry(nd, &sp->members, list) { + node->nodeid = nd->nodeid; + node->weight = nd->weight; + node->new = nd->new; + node->comm_seq = nd->comm_seq; + node++; + + nd->new = 0; + } + + *count_out = count; + *nodes_out = nodes; + rv = 0; + out: + mutex_unlock(&sp->members_lock); + put_space(sp); + return rv; +} + +int dlm_comm_seq(int nodeid, uint32_t *seq) +{ + struct dlm_comm *cm = get_comm(nodeid); + if (!cm) + return -EEXIST; + *seq = cm->seq; + put_comm(cm); + return 0; +} + +int dlm_our_nodeid(void) +{ + return local_comm ? local_comm->nodeid : 0; +} + +/* num 0 is first addr, num 1 is second addr */ +int dlm_our_addr(struct sockaddr_storage *addr, int num) +{ + if (!local_comm) + return -1; + if (num + 1 > local_comm->addr_count) + return -1; + memcpy(addr, local_comm->addr[num], sizeof(*addr)); + return 0; +} + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_RSBTBL_SIZE 1024 +#define DEFAULT_RECOVER_TIMER 5 +#define DEFAULT_TOSS_SECS 10 +#define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_LOG_INFO 1 +#define DEFAULT_PROTOCOL DLM_PROTO_TCP +#define DEFAULT_MARK 0 +#define DEFAULT_NEW_RSB_COUNT 128 +#define DEFAULT_RECOVER_CALLBACKS 0 +#define DEFAULT_CLUSTER_NAME "" + +struct dlm_config_info dlm_config = { + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_log_info = DEFAULT_LOG_INFO, + .ci_protocol = DEFAULT_PROTOCOL, + .ci_mark = DEFAULT_MARK, + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, + .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, + .ci_cluster_name = DEFAULT_CLUSTER_NAME +}; + diff --git a/fs/dlm/config.h b/fs/dlm/config.h new file mode 100644 index 0000000000..4c91fcca0f --- /dev/null +++ b/fs/dlm/config.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +#define DLM_MAX_SOCKET_BUFSIZE 4096 + +struct dlm_config_node { + int nodeid; + int weight; + int new; + uint32_t comm_seq; +}; + +#define DLM_MAX_ADDR_COUNT 3 + +#define DLM_PROTO_TCP 0 +#define DLM_PROTO_SCTP 1 + +struct dlm_config_info { + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; + int ci_log_info; + int ci_protocol; + int ci_mark; + int ci_new_rsb_count; + int ci_recover_callbacks; + char ci_cluster_name[DLM_LOCKSPACE_LEN]; +}; + +extern struct dlm_config_info dlm_config; + +int dlm_config_init(void); +void dlm_config_exit(void); +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out); +int dlm_comm_seq(int nodeid, uint32_t *seq); +int dlm_our_nodeid(void); +int dlm_our_addr(struct sockaddr_storage *addr, int num); + +#endif /* __CONFIG_DOT_H__ */ + diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c new file mode 100644 index 0000000000..d2c0353875 --- /dev/null +++ b/fs/dlm/debug_fs.c @@ -0,0 +1,1052 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/slab.h> + +#include "dlm_internal.h" +#include "midcomms.h" +#include "lock.h" +#include "ast.h" + +#define DLM_DEBUG_BUF_LEN 4096 +static char debug_buf[DLM_DEBUG_BUF_LEN]; +static struct mutex debug_buf_lock; + +static struct dentry *dlm_root; +static struct dentry *dlm_comms; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == DLM_LKSTS_CONVERT || + lkb->lkb_status == DLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + + seq_putc(s, '\n'); +} + +static void print_format1(struct dlm_rsb *res, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; + + lock_rsb(res); + + seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); + + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + + 
if (res->res_nodeid > 0) + seq_printf(s, "\"\nLocal Copy, Master is node %d\n", + res->res_nodeid); + else if (res->res_nodeid == 0) + seq_puts(s, "\"\nMaster Copy\n"); + else if (res->res_nodeid == -1) + seq_printf(s, "\"\nLooking up master (lkid %x)\n", + res->res_first_lkid); + else + seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); + if (seq_has_overflowed(s)) + goto out; + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_puts(s, "LVB: "); + for (i = 0; i < lvblen; i++) { + if (i == lvblen / 2) + seq_puts(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + if (rsb_flag(res, RSB_VALNOTVALID)) + seq_puts(s, " (INVALID)"); + seq_putc(s, '\n'); + if (seq_has_overflowed(s)) + goto out; + } + + root_list = !list_empty(&res->res_root_list); + recover_list = !list_empty(&res->res_recover_list); + + if (root_list || recover_list) { + seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n", + root_list, recover_list, + res->res_flags, res->res_recover_locks_count); + } + + /* Print the locks attached to this resource */ + seq_puts(s, "Granted Queue\n"); + list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + seq_puts(s, "Conversion Queue\n"); + list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + seq_puts(s, "Waiting Queue\n"); + list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + if (list_empty(&res->res_lookup)) + goto out; + + seq_puts(s, "Lookup Queue\n"); + list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { + seq_printf(s, "%08x %s", + lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + seq_putc(s, '\n'); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(res); +} + +static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) +{ + u64 xid = 0; + u64 us; + + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); + + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us + r_nodeid r_len r_name */ + + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + dlm_iflags_val(lkb), + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + (unsigned long long)us, + r->res_nodeid, + r->res_length, + r->res_name); +} + +static void print_format2(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + +static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + + if 
(test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + dlm_iflags_val(lkb), + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_last_bast_mode, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); +} + +static void print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + if (seq_has_overflowed(s)) + goto out; + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_puts(s, print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_putc(s, '\n'); + if (seq_has_overflowed(s)) + goto out; + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + seq_putc(s, '\n'); + if (seq_has_overflowed(s)) + goto out; + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) { + print_format3_lock(s, lkb, 1); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + +static void print_format4(struct dlm_rsb *r, struct seq_file *s) +{ + int our_nodeid = dlm_our_nodeid(); + int print_name = 1; + int i; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ", + r, + r->res_nodeid, + r->res_master_nodeid, + r->res_dir_nodeid, + our_nodeid, + r->res_toss_time, + r->res_flags, + r->res_length); + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_puts(s, print_name ? 
"str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_putc(s, '\n'); + unlock_rsb(r); +} + +static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb) +{ + struct dlm_callback *cb; + + /* lkb_id lkb_flags mode flags sb_status sb_flags */ + + spin_lock(&lkb->lkb_cb_lock); + list_for_each_entry(cb, &lkb->lkb_callbacks, list) { + seq_printf(s, "%x %x %d %x %d %x\n", + lkb->lkb_id, + dlm_iflags_val(lkb), + cb->mode, + cb->flags, + cb->sb_status, + cb->sb_flags); + } + spin_unlock(&lkb->lkb_cb_lock); +} + +static void print_format5(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + +struct rsbtbl_iter { + struct dlm_rsb *rsb; + unsigned bucket; + int format; + int header; +}; + +/* + * If the buffer is full, seq_printf can be called again, but it + * does nothing. So, the these printing routines periodically check + * seq_has_overflowed to avoid wasting too much time trying to print to + * a full buffer. + */ + +static int table_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + switch (ri->format) { + case 1: + print_format1(ri->rsb, seq); + break; + case 2: + if (ri->header) { + seq_puts(seq, "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n"); + ri->header = 0; + } + print_format2(ri->rsb, seq); + break; + case 3: + if (ri->header) { + seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + print_format3(ri->rsb, seq); + break; + case 4: + if (ri->header) { + seq_puts(seq, "version 4 rsb 2\n"); + ri->header = 0; + } + print_format4(ri->rsb, seq); + break; + case 5: + if (ri->header) { + seq_puts(seq, "lkb_id lkb_flags mode flags sb_status sb_flags\n"); + ri->header = 0; + } + print_format5(ri->rsb, seq); + break; + } + + return 0; +} + +static const struct seq_operations format1_seq_ops; +static const struct seq_operations format2_seq_ops; +static const struct seq_operations format3_seq_ops; +static const struct seq_operations format4_seq_ops; +static const struct seq_operations format5_seq_ops; + +static void *table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct rb_root *tree; + struct rb_node *node; + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri; + struct dlm_rsb *r; + loff_t n = *pos; + unsigned bucket, entry; + int toss = (seq->op == &format4_seq_ops); + + bucket = n >> 32; + entry = n & ((1LL << 32) - 1); + + if (bucket >= ls->ls_rsbtbl_size) + return NULL; + + ri = kzalloc(sizeof(*ri), GFP_NOFS); + if (!ri) + return NULL; + if (n == 0) + ri->header = 1; + if (seq->op == &format1_seq_ops) + ri->format = 1; + if (seq->op == &format2_seq_ops) + ri->format = 2; + if (seq->op == &format3_seq_ops) + ri->format = 3; + if (seq->op == &format4_seq_ops) + ri->format = 4; + if (seq->op == &format5_seq_ops) + ri->format = 5; + + tree = toss ? 
&ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + for (node = rb_first(tree); node; node = rb_next(node)) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + if (!entry--) { + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return ri; + } + } + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + node = rb_first(tree); + r = rb_entry(node, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri = iter_ptr; + struct rb_root *tree; + struct rb_node *next; + struct dlm_rsb *r, *rp; + loff_t n = *pos; + unsigned bucket; + int toss = (seq->op == &format4_seq_ops); + + bucket = n >> 32; + + /* + * move to the next rsb in the same bucket + */ + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rp = ri->rsb; + next = rb_next(&rp->res_hashnode); + + if (next) { + r = rb_entry(next, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + ++*pos; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + ++*pos; + return NULL; + } + tree = toss ? 
&ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + next = rb_first(tree); + r = rb_entry(next, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void table_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + if (ri) { + dlm_put_rsb(ri->rsb); + kfree(ri); + } +} + +static const struct seq_operations format1_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format2_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format3_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format4_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format5_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct file_operations format1_fops; +static const struct file_operations format2_fops; +static const struct file_operations format3_fops; +static const struct file_operations format4_fops; +static const struct file_operations format5_fops; + +static int table_open1(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format1_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open2(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format2_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static ssize_t table_write2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + int n, len, lkb_nodeid, lkb_status, error; + char name[DLM_RESNAME_MAXLEN + 1] = {}; + struct dlm_ls *ls = seq->private; + unsigned int lkb_flags; + char buf[256] = {}; + uint32_t lkb_id; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d", + &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status); + if (n != 5) + return -EINVAL; + + len = strnlen(name, DLM_RESNAME_MAXLEN); + error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags, + lkb_nodeid, lkb_status); + if (error) + return error; + + return count; +} + +static int table_open3(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format3_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open4(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format4_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int 
table_open5(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format5_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static const struct file_operations format1_fops = { + .owner = THIS_MODULE, + .open = table_open1, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format2_fops = { + .owner = THIS_MODULE, + .open = table_open2, + .read = seq_read, + .write = table_write2, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format3_fops = { + .owner = THIS_MODULE, + .open = table_open3, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format4_fops = { + .owner = THIS_MODULE, + .open = table_open4, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format5_fops = { + .owner = THIS_MODULE, + .open = table_open5, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * dump lkb's on the ls_waiters list + */ +static ssize_t waiters_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + struct dlm_lkb *lkb; + size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv; + + mutex_lock(&debug_buf_lock); + mutex_lock(&ls->ls_waiters_mutex); + memset(debug_buf, 0, sizeof(debug_buf)); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n", + lkb->lkb_id, lkb->lkb_wait_type, + lkb->lkb_nodeid, lkb->lkb_resource->res_name); + if (ret >= len - pos) + break; + pos += ret; + } + mutex_unlock(&ls->ls_waiters_mutex); + + rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos); + mutex_unlock(&debug_buf_lock); + return rv; +} + +static ssize_t waiters_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + int mstype, to_nodeid; + char buf[128] = {}; + uint32_t lkb_id; + int n, error; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid); + if (n != 3) + return -EINVAL; + + error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid); + if (error) + return error; + + return count; +} + +static const struct file_operations waiters_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = waiters_read, + .write = waiters_write, + .llseek = default_llseek, +}; + +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + debugfs_remove(ls->ls_debug_rsb_dentry); + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_locks_dentry); + debugfs_remove(ls->ls_debug_all_dentry); + debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_queued_asts_dentry); +} + +static int dlm_state_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%s\n", dlm_midcomms_state(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_state); + +static int dlm_flags_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_flags); + +static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private)); + return 0; +} 
+DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt); + +static int dlm_version_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_version); + +static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + void *buf; + int ret; + + if (count > PAGE_SIZE || count < sizeof(struct dlm_header)) + return -EINVAL; + + buf = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, count)) { + ret = -EFAULT; + goto out; + } + + ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count); + if (ret) + goto out; + + kfree(buf); + return count; + +out: + kfree(buf); + return ret; +} + +static const struct file_operations dlm_rawmsg_fops = { + .open = simple_open, + .write = dlm_rawmsg_write, + .llseek = no_llseek, +}; + +void *dlm_create_debug_comms_file(int nodeid, void *data) +{ + struct dentry *d_node; + char name[256]; + + memset(name, 0, sizeof(name)); + snprintf(name, 256, "%d", nodeid); + + d_node = debugfs_create_dir(name, dlm_comms); + debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops); + debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops); + debugfs_create_file("send_queue_count", 0444, d_node, data, + &dlm_send_queue_cnt_fops); + debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops); + + return d_node; +} + +void dlm_delete_debug_comms_file(void *ctx) +{ + debugfs_remove(ctx); +} + +void dlm_create_debug_file(struct dlm_ls *ls) +{ + /* Reserve enough space for the longest file name */ + char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")]; + + /* format 1 */ + + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format1_fops); + + /* format 2 */ + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "%s_locks", ls->ls_name); + + ls->ls_debug_locks_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &format2_fops); + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format3_fops); + + /* format 4 */ + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "%s_toss", ls->ls_name); + + ls->ls_debug_toss_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format4_fops); + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &waiters_fops); + + /* format 5 */ + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "%s_queued_asts", ls->ls_name); + + ls->ls_debug_queued_asts_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &format5_fops); +} + +void __init dlm_register_debugfs(void) +{ + mutex_init(&debug_buf_lock); + dlm_root = debugfs_create_dir("dlm", NULL); + dlm_comms = debugfs_create_dir("comms", dlm_root); +} + +void dlm_unregister_debugfs(void) +{ + debugfs_remove(dlm_root); +} + diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c new file mode 100644 index 0000000000..f6acba4310 --- /dev/null +++ b/fs/dlm/dir.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** 
+******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "rcom.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" +#include "lock.h" +#include "dir.h" + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid. + */ + +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) +{ + uint32_t node; + + if (ls->ls_num_nodes == 1) + return dlm_our_nodeid(); + else { + node = (hash >> 16) % ls->ls_total_weight; + return ls->ls_node_array[node]; + } +} + +int dlm_dir_nodeid(struct dlm_rsb *r) +{ + return r->res_dir_nodeid; +} + +void dlm_recover_dir_nodeid(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + r->res_dir_nodeid = dlm_hash2nodeid(ls, r->res_hash); + } + up_read(&ls->ls_root_sem); +} + +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq) +{ + struct dlm_member *memb; + char *b, *last_name = NULL; + int error = -ENOMEM, last_len, nodeid, result; + uint16_t namelen; + unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; + + log_rinfo(ls, "dlm_recover_directory"); + + if (dlm_no_directory(ls)) + goto out_status; + + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); + if (!last_name) + goto out; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == dlm_our_nodeid()) + continue; + + memset(last_name, 0, DLM_RESNAME_MAXLEN); + last_len = 0; + + for (;;) { + int left; + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out_free; + } + + error = dlm_rcom_names(ls, memb->nodeid, + last_name, last_len, seq); + if (error) + goto out_free; + + cond_resched(); + + /* + * pick namelen/name pairs out of received buffer + */ + + b = ls->ls_recover_buf->rc_buf; + left = le16_to_cpu(ls->ls_recover_buf->rc_header.h_length); + left -= sizeof(struct dlm_rcom); + + for (;;) { + __be16 v; + + error = -EINVAL; + if (left < sizeof(__be16)) + goto out_free; + + memcpy(&v, b, sizeof(__be16)); + namelen = be16_to_cpu(v); + b += sizeof(__be16); + left -= sizeof(__be16); + + /* namelen of 0xFFFFF marks end of names for + this node; namelen of 0 marks end of the + buffer */ + + if (namelen == 0xFFFF) + goto done; + if (!namelen) + break; + + if (namelen > left) + goto out_free; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out_free; + + error = dlm_master_lookup(ls, memb->nodeid, + b, namelen, + DLM_LU_RECOVER_DIR, + &nodeid, &result); + if (error) { + log_error(ls, "recover_dir lookup %d", + error); + goto out_free; + } + + /* The name was found in rsbtbl, but the + * master nodeid is different from + * memb->nodeid which says it is the master. + * This should not happen. 
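+			 * When it does, the mismatch is only counted as
+			 * count_bad and logged with a hex dump of the name;
+			 * directory recovery itself continues.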
*/ + + if (result == DLM_LU_MATCH && + nodeid != memb->nodeid) { + count_bad++; + log_error(ls, "recover_dir lookup %d " + "nodeid %d memb %d bad %u", + result, nodeid, memb->nodeid, + count_bad); + print_hex_dump_bytes("dlm_recover_dir ", + DUMP_PREFIX_NONE, + b, namelen); + } + + /* The name was found in rsbtbl, and the + * master nodeid matches memb->nodeid. */ + + if (result == DLM_LU_MATCH && + nodeid == memb->nodeid) { + count_match++; + } + + /* The name was not found in rsbtbl and was + * added with memb->nodeid as the master. */ + + if (result == DLM_LU_ADD) { + count_add++; + } + + last_len = namelen; + memcpy(last_name, b, namelen); + b += namelen; + left -= namelen; + count++; + } + } + done: + ; + } + + out_status: + error = 0; + dlm_set_recover_status(ls, DLM_RS_DIR); + + log_rinfo(ls, "dlm_recover_directory %u in %u new", + count, count_add); + out_free: + kfree(last_name); + out: + return error; +} + +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name, + int len) +{ + struct dlm_rsb *r; + uint32_t hash, bucket; + int rv; + + hash = jhash(name, len, 0); + bucket = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r); + if (rv) + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss, + name, len, &r); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + if (!rv) + return r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) { + up_read(&ls->ls_root_sem); + log_debug(ls, "find_rsb_root revert to root_list %s", + r->res_name); + return r; + } + } + up_read(&ls->ls_root_sem); + return NULL; +} + +/* Find the rsb where we left off (or start again), then send rsb names + for rsb's we're master of and whose directory node matches the requesting + node. inbuf is the rsb name last sent, inlen is the name's length */ + +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid) +{ + struct list_head *list; + struct dlm_rsb *r; + int offset = 0, dir_nodeid; + __be16 be_namelen; + + down_read(&ls->ls_root_sem); + + if (inlen > 1) { + r = find_rsb_root(ls, inbuf, inlen); + if (!r) { + log_error(ls, "copy_master_names from %d start %d %.*s", + nodeid, inlen, inlen, inbuf); + goto out; + } + list = r->res_root_list.next; + } else { + list = ls->ls_root_list.next; + } + + for (offset = 0; list != &ls->ls_root_list; list = list->next) { + r = list_entry(list, struct dlm_rsb, res_root_list); + if (r->res_nodeid) + continue; + + dir_nodeid = dlm_dir_nodeid(r); + if (dir_nodeid != nodeid) + continue; + + /* + * The block ends when we can't fit the following in the + * remaining buffer space: + * namelen (uint16_t) + + * name (r->res_length) + + * end-of-block record 0x0000 (uint16_t) + */ + + if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { + /* Write end-of-block record */ + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + ls->ls_recover_dir_sent_msg++; + goto out; + } + + be_namelen = cpu_to_be16(r->res_length); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + memcpy(outbuf + offset, r->res_name, r->res_length); + offset += r->res_length; + ls->ls_recover_dir_sent_res++; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. 
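+	 * The terminating record is a namelen of 0xFFFF, which the receiving
+	 * side in dlm_recover_directory() distinguishes from the 0x0000
+	 * end-of-block marker written above.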
+ */ + + if ((list == &ls->ls_root_list) && + (offset + sizeof(uint16_t) <= outlen)) { + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + ls->ls_recover_dir_sent_msg++; + } + out: + up_read(&ls->ls_root_sem); +} + diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h new file mode 100644 index 0000000000..39ecb69d7e --- /dev/null +++ b/fs/dlm/dir.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +int dlm_dir_nodeid(struct dlm_rsb *rsb); +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); +void dlm_recover_dir_nodeid(struct dlm_ls *ls); +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq); +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); + +#endif /* __DIR_DOT_H__ */ + diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h new file mode 100644 index 0000000000..dfc444dad3 --- /dev/null +++ b/fs/dlm/dlm_internal.h @@ -0,0 +1,829 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <linux/delay.h> +#include <linux/socket.h> +#include <linux/kthread.h> +#include <linux/kobject.h> +#include <linux/kref.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/idr.h> +#include <linux/ratelimit.h> +#include <linux/uaccess.h> + +#include <linux/dlm.h> +#include "config.h" + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_member; +struct dlm_rsbtable; +struct dlm_recover; +struct dlm_header; +struct dlm_message; +struct dlm_rcom; +struct dlm_mhandle; +struct dlm_msg; + +#define log_print(fmt, args...) \ + printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_print_ratelimited(fmt, args...) \ + printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args) +#define log_error(ls, fmt, args...) \ + printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) + +#define log_rinfo(ls, fmt, args...) 
\ +do { \ + if (dlm_config.ci_log_info) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", \ + (ls)->ls_name, ##args); \ + else if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define log_debug(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define log_limit(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +#define DLM_RTF_SHRINK_BIT 0 + +struct dlm_rsbtable { + struct rb_root keep; + struct rb_root toss; + spinlock_t lock; + unsigned long flags; +}; + + +/* + * Lockspace member (per node in a ls) + */ + +struct dlm_member { + struct list_head list; + int nodeid; + int weight; + int slot; + int slot_prev; + int comm_seq; + uint32_t generation; +}; + +/* + * Save and manage recovery state for a lockspace. + */ + +struct dlm_recover { + struct list_head list; + struct dlm_config_node *nodes; + int nodes_count; + uint64_t seq; +}; + +/* + * Pass input args to second stage locking function. + */ + +struct dlm_args { + uint32_t flags; + void (*astfn) (void *astparam); + void *astparam; + void (*bastfn) (void *astparam, int mode); + int mode; + struct dlm_lksb *lksb; +}; + + +/* + * Lock block + * + * A lock can be one of three types: + * + * local copy lock is mastered locally + * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set) + * process copy lock is mastered on a remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set) + * master copy master node's copy of a lock owned by remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set) + * + * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or + * dlm_unlock. The dlm does not modify these or use any private flags in + * this field; it only contains DLM_LKF_ flags from dlm.h. These flags + * are sent as-is to the remote master when the lock is remote. + * + * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h. + * Some internal flags are shared between the master and process nodes; + * these shared flags are kept in the lower two bytes. One of these + * flags set on the master copy will be propagated to the process copy + * and v.v. Other internal flags are private to the master or process + * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes. + * + * lkb_sbflags: status block flags. These flags are copied directly into + * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion + * ast. All defined in dlm.h with DLM_SBF_ prefix. + * + * lkb_status: the lock status indicates which rsb queue the lock is + * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT + * + * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a + * reply is needed. Only set when the lkb is on the lockspace waiters + * list awaiting a reply from a remote node. + * + * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb + * is a master copy, nodeid specifies the remote lock holder, when the + * lkb is a process copy, the nodeid specifies the lock master. 
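+ * For example, the master node's copy of a lock owned by node 4 has
+ * lkb_nodeid 4 with DLM_IFL_MSTCPY set, while node 4's own process copy
+ * of that lock has lkb_nodeid set to the master's nodeid and MSTCPY clear.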
+ */ + +/* lkb_status */ + +#define DLM_LKSTS_WAITING 1 +#define DLM_LKSTS_GRANTED 2 +#define DLM_LKSTS_CONVERT 3 + +/* lkb_iflags */ + +#define DLM_IFL_MSTCPY_BIT 16 +#define __DLM_IFL_MIN_BIT DLM_IFL_MSTCPY_BIT +#define DLM_IFL_RESEND_BIT 17 +#define DLM_IFL_DEAD_BIT 18 +#define DLM_IFL_OVERLAP_UNLOCK_BIT 19 +#define DLM_IFL_OVERLAP_CANCEL_BIT 20 +#define DLM_IFL_ENDOFLIFE_BIT 21 +#define DLM_IFL_DEADLOCK_CANCEL_BIT 24 +#define DLM_IFL_CB_PENDING_BIT 25 +#define __DLM_IFL_MAX_BIT DLM_IFL_CB_PENDING_BIT + +/* lkb_dflags */ + +#define DLM_DFL_USER_BIT 0 +#define __DLM_DFL_MIN_BIT DLM_DFL_USER_BIT +#define DLM_DFL_ORPHAN_BIT 1 +#define __DLM_DFL_MAX_BIT DLM_DFL_ORPHAN_BIT + +#define DLM_CB_CAST 0x00000001 +#define DLM_CB_BAST 0x00000002 + +struct dlm_callback { + uint32_t flags; /* DLM_CBF_ */ + int sb_status; /* copy to lksb status */ + uint8_t sb_flags; /* copy to lksb flags */ + int8_t mode; /* rq mode of bast, gr mode of cast */ + + struct list_head list; + struct kref ref; +}; + +struct dlm_lkb { + struct dlm_rsb *lkb_resource; /* the rsb */ + struct kref lkb_ref; + int lkb_nodeid; /* copied from rsb */ + int lkb_ownpid; /* pid of lock owner */ + uint32_t lkb_id; /* our lock ID */ + uint32_t lkb_remid; /* lock ID on remote partner */ + uint32_t lkb_exflags; /* external flags from caller */ + unsigned long lkb_sbflags; /* lksb flags */ + unsigned long lkb_dflags; /* distributed flags */ + unsigned long lkb_iflags; /* internal flags */ + uint32_t lkb_lvbseq; /* lvb sequence number */ + + int8_t lkb_status; /* granted, waiting, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + int8_t lkb_highbast; /* highest mode bast sent for */ + + int8_t lkb_wait_type; /* type of reply waiting for */ + atomic_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ + + struct list_head lkb_statequeue; /* rsb g/c/w list */ + struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ + struct list_head lkb_wait_reply; /* waiting for remote reply */ + struct list_head lkb_ownqueue; /* list of locks for a process */ + ktime_t lkb_timestamp; + + spinlock_t lkb_cb_lock; + struct work_struct lkb_cb_work; + struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ + struct list_head lkb_callbacks; + struct dlm_callback *lkb_last_cast; + struct dlm_callback *lkb_last_cb; + int lkb_last_bast_mode; + ktime_t lkb_last_cast_time; /* for debugging */ + ktime_t lkb_last_bast_time; /* for debugging */ + + uint64_t lkb_recover_seq; /* from ls_recover_seq */ + + char *lkb_lvbptr; + struct dlm_lksb *lkb_lksb; /* caller's status block */ + void (*lkb_astfn) (void *astparam); + void (*lkb_bastfn) (void *astparam, int mode); + union { + void *lkb_astparam; /* caller's ast arg */ + struct dlm_user_args *lkb_ua; + }; +}; + +/* + * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real + * nodeid, even when nodeid is our_nodeid. + * + * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid, + * greater than zero when another nodeid. 
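+ * For example, when node 3 masters the rsb and we are node 3,
+ * res_master_nodeid is 3 and res_nodeid is 0; when node 7 masters it,
+ * both fields are 7.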
+ * + * (TODO: remove res_nodeid and only use res_master_nodeid) + */ + +struct dlm_rsb { + struct dlm_ls *res_ls; /* the lockspace */ + struct kref res_ref; + struct mutex res_mutex; + unsigned long res_flags; + int res_length; /* length of rsb name */ + int res_nodeid; + int res_master_nodeid; + int res_dir_nodeid; + int res_id; /* for ls_recover_idr */ + uint32_t res_lvbseq; + uint32_t res_hash; + uint32_t res_bucket; /* rsbtbl */ + unsigned long res_toss_time; + uint32_t res_first_lkid; + struct list_head res_lookup; /* lkbs waiting on first */ + union { + struct list_head res_hashchain; + struct rb_node res_hashnode; /* rsbtbl */ + }; + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + struct list_head res_root_list; /* used for recovery */ + struct list_head res_recover_list; /* used for recovery */ + int res_recover_locks_count; + + char *res_lvbptr; + char res_name[DLM_RESNAME_MAXLEN+1]; +}; + +/* dlm_master_lookup() flags */ + +#define DLM_LU_RECOVER_DIR 1 +#define DLM_LU_RECOVER_MASTER 2 + +/* dlm_master_lookup() results */ + +#define DLM_LU_MATCH 1 +#define DLM_LU_ADD 2 + +/* find_rsb() flags */ + +#define R_REQUEST 0x00000001 +#define R_RECEIVE_REQUEST 0x00000002 +#define R_RECEIVE_RECOVER 0x00000004 + +/* rsb_flags */ + +enum rsb_flags { + RSB_MASTER_UNCERTAIN, + RSB_VALNOTVALID, + RSB_VALNOTVALID_PREV, + RSB_NEW_MASTER, + RSB_NEW_MASTER2, + RSB_RECOVER_CONVERT, + RSB_RECOVER_GRANT, + RSB_RECOVER_LVB_INVAL, +}; + +static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __set_bit(flag, &r->res_flags); +} + +static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __clear_bit(flag, &r->res_flags); +} + +static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + return test_bit(flag, &r->res_flags); +} + + +/* dlm_header is first element of all structs sent between nodes */ + +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000002 + +#define DLM_VERSION_3_1 0x00030001 +#define DLM_VERSION_3_2 0x00030002 + +#define DLM_HEADER_SLOTS 0x00000001 + +#define DLM_MSG 1 +#define DLM_RCOM 2 +#define DLM_OPTS 3 +#define DLM_ACK 4 +#define DLM_FIN 5 + +struct dlm_header { + __le32 h_version; + union { + /* for DLM_MSG and DLM_RCOM */ + __le32 h_lockspace; + /* for DLM_ACK and DLM_OPTS */ + __le32 h_seq; + } u; + __le32 h_nodeid; /* nodeid of sender */ + __le16 h_length; + uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ + uint8_t h_pad; +}; + +#define DLM_MSG_REQUEST 1 +#define DLM_MSG_CONVERT 2 +#define DLM_MSG_UNLOCK 3 +#define DLM_MSG_CANCEL 4 +#define DLM_MSG_REQUEST_REPLY 5 +#define DLM_MSG_CONVERT_REPLY 6 +#define DLM_MSG_UNLOCK_REPLY 7 +#define DLM_MSG_CANCEL_REPLY 8 +#define DLM_MSG_GRANT 9 +#define DLM_MSG_BAST 10 +#define DLM_MSG_LOOKUP 11 +#define DLM_MSG_REMOVE 12 +#define DLM_MSG_LOOKUP_REPLY 13 +#define DLM_MSG_PURGE 14 + +struct dlm_message { + struct dlm_header m_header; + __le32 m_type; /* DLM_MSG_ */ + __le32 m_nodeid; + __le32 m_pid; + __le32 m_lkid; /* lkid on sender */ + __le32 m_remid; /* lkid on receiver */ + __le32 m_parent_lkid; + __le32 m_parent_remid; + __le32 m_exflags; + __le32 m_sbflags; + __le32 m_flags; + __le32 m_lvbseq; + __le32 m_hash; + __le32 m_status; + __le32 m_grmode; + __le32 m_rqmode; + __le32 m_bastmode; + __le32 m_asts; + __le32 m_result; /* 0 or -EXXX */ + char m_extra[]; /* name or lvb */ +}; + + +#define DLM_RS_NODES 0x00000001 +#define DLM_RS_NODES_ALL 0x00000002 +#define DLM_RS_DIR 0x00000004 +#define DLM_RS_DIR_ALL 
0x00000008 +#define DLM_RS_LOCKS 0x00000010 +#define DLM_RS_LOCKS_ALL 0x00000020 +#define DLM_RS_DONE 0x00000040 +#define DLM_RS_DONE_ALL 0x00000080 + +#define DLM_RCOM_STATUS 1 +#define DLM_RCOM_NAMES 2 +#define DLM_RCOM_LOOKUP 3 +#define DLM_RCOM_LOCK 4 +#define DLM_RCOM_STATUS_REPLY 5 +#define DLM_RCOM_NAMES_REPLY 6 +#define DLM_RCOM_LOOKUP_REPLY 7 +#define DLM_RCOM_LOCK_REPLY 8 + +struct dlm_rcom { + struct dlm_header rc_header; + __le32 rc_type; /* DLM_RCOM_ */ + __le32 rc_result; /* multi-purpose */ + __le64 rc_id; /* match reply with request */ + __le64 rc_seq; /* sender's ls_recover_seq */ + __le64 rc_seq_reply; /* remote ls_recover_seq */ + char rc_buf[]; +}; + +struct dlm_opt_header { + __le16 t_type; + __le16 t_length; + __le32 t_pad; + /* need to be 8 byte aligned */ + char t_value[]; +}; + +/* encapsulation header */ +struct dlm_opts { + struct dlm_header o_header; + uint8_t o_nextcmd; + uint8_t o_pad; + __le16 o_optlen; + __le32 o_pad2; + char o_opts[]; +}; + +union dlm_packet { + struct dlm_header header; /* common to other two */ + struct dlm_message message; + struct dlm_rcom rcom; + struct dlm_opts opts; +}; + +#define DLM_RSF_NEED_SLOTS 0x00000001 + +/* RCOM_STATUS data */ +struct rcom_status { + __le32 rs_flags; + __le32 rs_unused1; + __le64 rs_unused2; +}; + +/* RCOM_STATUS_REPLY data */ +struct rcom_config { + __le32 rf_lvblen; + __le32 rf_lsflags; + + /* DLM_HEADER_SLOTS adds: */ + __le32 rf_flags; + __le16 rf_our_slot; + __le16 rf_num_slots; + __le32 rf_generation; + __le32 rf_unused1; + __le64 rf_unused2; +}; + +struct rcom_slot { + __le32 ro_nodeid; + __le16 ro_slot; + __le16 ro_unused1; + __le64 ro_unused2; +}; + +struct rcom_lock { + __le32 rl_ownpid; + __le32 rl_lkid; + __le32 rl_remid; + __le32 rl_parent_lkid; + __le32 rl_parent_remid; + __le32 rl_exflags; + __le32 rl_flags; + __le32 rl_lvbseq; + __le32 rl_result; + int8_t rl_rqmode; + int8_t rl_grmode; + int8_t rl_status; + int8_t rl_asts; + __le16 rl_wait_type; + __le16 rl_namelen; + char rl_name[DLM_RESNAME_MAXLEN]; + char rl_lvb[]; +}; + +/* + * The max number of resources per rsbtbl bucket that shrink will attempt + * to remove in each iteration. 
+ */ + +#define DLM_REMOVE_NAMES_MAX 8 + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + dlm_lockspace_t *ls_local_handle; + uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_generation; + uint32_t ls_exflags; + int ls_lvblen; + atomic_t ls_count; /* refcount of processes in + the dlm using this ls */ + wait_queue_head_t ls_count_wait; + int ls_create_count; /* create/release refcount */ + unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; + struct kobject ls_kobj; + + struct idr ls_lkbidr; + spinlock_t ls_lkbidr_spin; + + struct dlm_rsbtable *ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct mutex ls_waiters_mutex; + struct list_head ls_waiters; /* lkbs needing a reply */ + + struct mutex ls_orphans_mutex; + struct list_head ls_orphans; + + spinlock_t ls_new_rsb_spin; + int ls_new_rsb_count; + struct list_head ls_new_rsb; /* new rsb structs */ + + char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; + int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + int ls_num_nodes; /* number of nodes in ls */ + int ls_low_nodeid; + int ls_total_weight; + int *ls_node_array; + + int ls_slot; + int ls_num_slots; + int ls_slots_size; + struct dlm_slot *ls_slots; + + struct dlm_rsb ls_local_rsb; /* for returning errors */ + struct dlm_lkb ls_local_lkb; /* for returning errors */ + struct dlm_message ls_local_ms; /* for faking a reply */ + + struct dentry *ls_debug_rsb_dentry; /* debugfs */ + struct dentry *ls_debug_waiters_dentry; /* debugfs */ + struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ + struct dentry *ls_debug_toss_dentry; /* debugfs */ + struct dentry *ls_debug_queued_asts_dentry; /* debugfs */ + + wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ + int ls_uevent_result; + struct completion ls_recovery_done; + int ls_recovery_result; + + struct miscdevice ls_device; + + struct workqueue_struct *ls_callback_wq; + + /* recovery related */ + + spinlock_t ls_cb_lock; + struct list_head ls_cb_delay; /* save for queue_work later */ + struct timer_list ls_timer; + struct task_struct *ls_recoverd_task; + struct mutex ls_recoverd_active; + spinlock_t ls_recover_lock; + unsigned long ls_recover_begin; /* jiffies timestamp */ + uint32_t ls_recover_status; /* DLM_RS_ */ + uint64_t ls_recover_seq; + struct dlm_recover *ls_recover_args; + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct rw_semaphore ls_recv_active; /* block dlm_recv */ + struct list_head ls_requestqueue;/* queue remote requests */ + atomic_t ls_requestqueue_cnt; + wait_queue_head_t ls_requestqueue_wait; + struct mutex ls_requestqueue_mutex; + struct dlm_rcom *ls_recover_buf; + int ls_recover_nodeid; /* for debugging */ + unsigned int ls_recover_dir_sent_res; /* for log info */ + unsigned int ls_recover_dir_sent_msg; /* for log info */ + unsigned int ls_recover_locks_in; /* for log info */ + uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + struct idr ls_recover_idr; + spinlock_t ls_recover_idr_lock; + wait_queue_head_t ls_wait_general; + wait_queue_head_t ls_recover_lock_wait; + spinlock_t ls_clear_proc_locks; + + struct list_head ls_root_list; /* root resources */ + struct rw_semaphore ls_root_sem; /* protect root_list */ + + const struct dlm_lockspace_ops *ls_ops; + void *ls_ops_arg; + + 
int ls_namelen; + char ls_name[DLM_LOCKSPACE_LEN + 1]; +}; + +/* + * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines + * that they should abort what they're doing so new recovery can be started. + * + * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it + * should do down_write() on the in_recovery rw_semaphore. (doing down_write + * within dlm_ls_stop causes complaints about the lock acquired/released + * in different contexts.) + * + * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore. + * It sets this after it is done with down_write() on the in_recovery + * rw_semaphore and clears it after it has released the rw_semaphore. + * + * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it + * should begin recovery of the lockspace. + * + * LSFL_RUNNING - set when normal locking activity is enabled. + * dlm_ls_stop() clears this to tell dlm locking routines that they should + * quit what they are doing so recovery can run. dlm_recoverd sets + * this after recovery is finished. + */ + +#define LSFL_RECOVER_STOP 0 +#define LSFL_RECOVER_DOWN 1 +#define LSFL_RECOVER_LOCK 2 +#define LSFL_RECOVER_WORK 3 +#define LSFL_RUNNING 4 + +#define LSFL_RCOM_READY 5 +#define LSFL_RCOM_WAIT 6 +#define LSFL_UEVENT_WAIT 7 +#define LSFL_CB_DELAY 9 +#define LSFL_NODIR 10 + +/* much of this is just saving user space pointers associated with the + lock that we pass back to the user lib with an ast */ + +struct dlm_user_args { + struct dlm_user_proc *proc; /* each process that opens the lockspace + device has private data + (dlm_user_proc) on the struct file, + the process's locks point back to it*/ + struct dlm_lksb lksb; + struct dlm_lksb __user *user_lksb; + void __user *castparam; + void __user *castaddr; + void __user *bastparam; + void __user *bastaddr; + uint64_t xid; +}; + +#define DLM_PROC_FLAGS_CLOSING 1 +#define DLM_PROC_FLAGS_COMPAT 2 + +/* locks list is kept so we can remove all a process's locks when it + exits (or orphan those that are persistent) */ + +struct dlm_user_proc { + dlm_lockspace_t *lockspace; + unsigned long flags; /* DLM_PROC_FLAGS */ + struct list_head asts; + spinlock_t asts_spin; + struct list_head locks; + spinlock_t locks_spin; + struct list_head unlocking; + wait_queue_head_t wait; +}; + +static inline int dlm_locking_stopped(struct dlm_ls *ls) +{ + return !test_bit(LSFL_RUNNING, &ls->ls_flags); +} + +static inline int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); +} + +static inline int dlm_no_directory(struct dlm_ls *ls) +{ + return test_bit(LSFL_NODIR, &ls->ls_flags); +} + +/* takes a snapshot from dlm atomic flags */ +static inline uint32_t dlm_flags_val(const unsigned long *addr, + uint32_t min, uint32_t max) +{ + uint32_t bit = min, val = 0; + + for_each_set_bit_from(bit, addr, max + 1) { + val |= BIT(bit); + } + + return val; +} + +static inline uint32_t dlm_iflags_val(const struct dlm_lkb *lkb) +{ + return dlm_flags_val(&lkb->lkb_iflags, __DLM_IFL_MIN_BIT, + __DLM_IFL_MAX_BIT); +} + +static inline uint32_t dlm_dflags_val(const struct dlm_lkb *lkb) +{ + return dlm_flags_val(&lkb->lkb_dflags, __DLM_DFL_MIN_BIT, + __DLM_DFL_MAX_BIT); +} + +/* coming from UAPI header + * + * TODO: + * Move this to UAPI header and let other values point to them and use BIT() + */ +#define DLM_SBF_DEMOTED_BIT 0 +#define __DLM_SBF_MIN_BIT DLM_SBF_DEMOTED_BIT +#define DLM_SBF_VALNOTVALID_BIT 1 +#define DLM_SBF_ALTMODE_BIT 2 +#define __DLM_SBF_MAX_BIT 
DLM_SBF_ALTMODE_BIT + +static inline uint32_t dlm_sbflags_val(const struct dlm_lkb *lkb) +{ + /* be sure the next person updates this */ + BUILD_BUG_ON(BIT(__DLM_SBF_MAX_BIT) != DLM_SBF_ALTMODE); + + return dlm_flags_val(&lkb->lkb_sbflags, __DLM_SBF_MIN_BIT, + __DLM_SBF_MAX_BIT); +} + +static inline void dlm_set_flags_val(unsigned long *addr, uint32_t val, + uint32_t min, uint32_t max) +{ + uint32_t bit; + + for (bit = min; bit < (max + 1); bit++) { + if (val & BIT(bit)) + set_bit(bit, addr); + else + clear_bit(bit, addr); + } +} + +static inline void dlm_set_dflags_val(struct dlm_lkb *lkb, uint32_t val) +{ + dlm_set_flags_val(&lkb->lkb_dflags, val, __DLM_DFL_MIN_BIT, + __DLM_DFL_MAX_BIT); +} + +static inline void dlm_set_sbflags_val(struct dlm_lkb *lkb, uint32_t val) +{ + dlm_set_flags_val(&lkb->lkb_sbflags, val, __DLM_SBF_MIN_BIT, + __DLM_SBF_MAX_BIT); +} + +int dlm_plock_init(void); +void dlm_plock_exit(void); + +#ifdef CONFIG_DLM_DEBUG +void dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +void dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +void *dlm_create_debug_comms_file(int nodeid, void *data); +void dlm_delete_debug_comms_file(void *ctx); +#else +static inline void dlm_register_debugfs(void) { } +static inline void dlm_unregister_debugfs(void) { } +static inline void dlm_create_debug_file(struct dlm_ls *ls) { } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; } +static inline void dlm_delete_debug_comms_file(void *ctx) { } +#endif + +#endif /* __DLM_INTERNAL_DOT_H__ */ + diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c new file mode 100644 index 0000000000..652c51fbbf --- /dev/null +++ b/fs/dlm/lock.c @@ -0,0 +1,6153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +/* Central locking logic has four stages: + + dlm_lock() + dlm_unlock() + + request_lock(ls, lkb) + convert_lock(ls, lkb) + unlock_lock(ls, lkb) + cancel_lock(ls, lkb) + + _request_lock(r, lkb) + _convert_lock(r, lkb) + _unlock_lock(r, lkb) + _cancel_lock(r, lkb) + + do_request(r, lkb) + do_convert(r, lkb) + do_unlock(r, lkb) + do_cancel(r, lkb) + + Stage 1 (lock, unlock) is mainly about checking input args and + splitting into one of the four main operations: + + dlm_lock = request_lock + dlm_lock+CONVERT = convert_lock + dlm_unlock = unlock_lock + dlm_unlock+CANCEL = cancel_lock + + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is + provided to the next stage. + + Stage 3, _xxxx_lock(), determines if the operation is local or remote. + When remote, it calls send_xxxx(), when local it calls do_xxxx(). + + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the + given rsb and lkb and queues callbacks. + + For remote operations, send_xxxx() results in the corresponding do_xxxx() + function being executed on the remote node. 
The connecting send/receive + calls on local (L) and remote (R) nodes: + + L: send_xxxx() -> R: receive_xxxx() + R: do_xxxx() + L: receive_xxxx_reply() <- R: send_xxxx_reply() +*/ +#include <trace/events/dlm.h> + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include "dlm_internal.h" +#include <linux/dlm_device.h> +#include "memory.h" +#include "midcomms.h" +#include "requestqueue.h" +#include "util.h" +#include "dir.h" +#include "member.h" +#include "lockspace.h" +#include "ast.h" +#include "lock.h" +#include "rcom.h" +#include "recover.h" +#include "lvb_table.h" +#include "user.h" +#include "config.h" + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_remove(struct dlm_rsb *r); +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local); +static int receive_extralen(const struct dlm_message *ms); +static void do_purge(struct dlm_ls *ls, int nodeid, int pid); +static void toss_rsb(struct kref *kref); + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +static const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int dlm_lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. 
+ * Usage: matrix[grmode+1][rqmode+1] + */ + +static const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +void dlm_print_lkb(struct dlm_lkb *lkb) +{ + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " + "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, + dlm_iflags_val(lkb), lkb->lkb_status, lkb->lkb_rqmode, + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, + (unsigned long long)lkb->lkb_recover_seq); +} + +static void dlm_print_rsb(struct dlm_rsb *r) +{ + printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x " + "rlc %d name %s\n", + r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid, + r->res_flags, r->res_first_lkid, r->res_recover_locks_count, + r->res_name); +} + +void dlm_dump_rsb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + + dlm_print_rsb(r); + + printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n", + list_empty(&r->res_root_list), list_empty(&r->res_recover_list)); + printk(KERN_ERR "rsb lookup list\n"); + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb grant queue:\n"); + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb convert queue:\n"); + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb wait queue:\n"); + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + dlm_print_lkb(lkb); +} + +/* Threads cannot use the lockspace while it's being recovered */ + +static inline void dlm_lock_recovery(struct dlm_ls *ls) +{ + down_read(&ls->ls_in_recovery); +} + +void dlm_unlock_recovery(struct dlm_ls *ls) +{ + up_read(&ls->ls_in_recovery); +} + +int dlm_lock_recovery_try(struct dlm_ls *ls) +{ + return down_read_trylock(&ls->ls_in_recovery); +} + +static inline int can_be_queued(struct dlm_lkb *lkb) +{ + return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE); +} + +static inline int force_blocking_asts(struct dlm_lkb *lkb) +{ + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST); +} + +static inline int is_demoted(struct dlm_lkb *lkb) +{ + return test_bit(DLM_SBF_DEMOTED_BIT, &lkb->lkb_sbflags); +} + +static inline int is_altmode(struct dlm_lkb *lkb) +{ + return test_bit(DLM_SBF_ALTMODE_BIT, &lkb->lkb_sbflags); +} + +static inline int is_granted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_status == DLM_LKSTS_GRANTED); +} + +static inline int is_remote(struct dlm_rsb *r) +{ + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); + return !!r->res_nodeid; +} + +static inline int is_process_copy(struct dlm_lkb *lkb) +{ + return lkb->lkb_nodeid && + !test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); +} + +static inline int is_master_copy(struct dlm_lkb *lkb) +{ + return test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); +} + +static inline int middle_conversion(struct dlm_lkb *lkb) +{ + if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) || + (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW)) + return 1; + return 0; +} + +static inline int down_conversion(struct dlm_lkb *lkb) +{ + return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); +} + 
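/*
 * Editor's aside, not part of this commit: a minimal user-space sketch of
 * how a mode-compatibility table like __dlm_compat_matrix above is
 * consulted. It assumes the usual DLM_LOCK_* numbering (IV = -1, NL = 0,
 * CR = 1, CW = 2, PR = 3, PW = 4, EX = 5); the local names "compat" and
 * "check_modes_compat" exist only in this sketch.
 */
#include <stdio.h>

static const int compat[8][8] = {
	/* UN NL CR CW PR PW EX PD */
	{1, 1, 1, 1, 1, 1, 1, 0},	/* UN */
	{1, 1, 1, 1, 1, 1, 1, 0},	/* NL */
	{1, 1, 1, 1, 1, 1, 0, 0},	/* CR */
	{1, 1, 1, 1, 0, 0, 0, 0},	/* CW */
	{1, 1, 1, 0, 1, 0, 0, 0},	/* PR */
	{1, 1, 1, 0, 0, 0, 0, 0},	/* PW */
	{1, 1, 0, 0, 0, 0, 0, 0},	/* EX */
	{0, 0, 0, 0, 0, 0, 0, 0},	/* PD */
};

/* same indexing convention as the kernel table: mode + 1 */
static int check_modes_compat(int grmode, int rqmode)
{
	return compat[grmode + 1][rqmode + 1];
}

int main(void)
{
	printf("PR vs PR compatible: %d\n", check_modes_compat(3, 3)); /* 1 */
	printf("PR vs EX compatible: %d\n", check_modes_compat(3, 5)); /* 0 */
	printf("NL vs EX compatible: %d\n", check_modes_compat(0, 5)); /* 1 */
	return 0;
}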
+static inline int is_overlap_unlock(struct dlm_lkb *lkb) +{ + return test_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); +} + +static inline int is_overlap_cancel(struct dlm_lkb *lkb) +{ + return test_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); +} + +static inline int is_overlap(struct dlm_lkb *lkb) +{ + return test_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags) || + test_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); +} + +static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + if (is_master_copy(lkb)) + return; + + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); + + if (rv == -DLM_ECANCEL && + test_and_clear_bit(DLM_IFL_DEADLOCK_CANCEL_BIT, &lkb->lkb_iflags)) + rv = -EDEADLK; + + dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, dlm_sbflags_val(lkb)); +} + +static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + queue_cast(r, lkb, + is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL); +} + +static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) +{ + if (is_master_copy(lkb)) { + send_bast(r, lkb, rqmode); + } else { + dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0); + } +} + +/* + * Basic operations on rsb's and lkb's + */ + +/* This is only called to add a reference when the code already holds + a valid reference to the rsb, so there's no need for locking. */ + +static inline void hold_rsb(struct dlm_rsb *r) +{ + kref_get(&r->res_ref); +} + +void dlm_hold_rsb(struct dlm_rsb *r) +{ + hold_rsb(r); +} + +/* When all references to the rsb are gone it's transferred to + the tossed list for later disposal. */ + +static void put_rsb(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t bucket = r->res_bucket; + int rv; + + rv = kref_put_lock(&r->res_ref, toss_rsb, + &ls->ls_rsbtbl[bucket].lock); + if (rv) + spin_unlock(&ls->ls_rsbtbl[bucket].lock); +} + +void dlm_put_rsb(struct dlm_rsb *r) +{ + put_rsb(r); +} + +static int pre_rsb_struct(struct dlm_ls *ls) +{ + struct dlm_rsb *r1, *r2; + int count = 0; + + spin_lock(&ls->ls_new_rsb_spin); + if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) { + spin_unlock(&ls->ls_new_rsb_spin); + return 0; + } + spin_unlock(&ls->ls_new_rsb_spin); + + r1 = dlm_allocate_rsb(ls); + r2 = dlm_allocate_rsb(ls); + + spin_lock(&ls->ls_new_rsb_spin); + if (r1) { + list_add(&r1->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + if (r2) { + list_add(&r2->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + + if (!count) + return -ENOMEM; + return 0; +} + +/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can + unlock any spinlocks, go back and call pre_rsb_struct again. + Otherwise, take an rsb off the list and return it. 
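+   The split into pre_rsb_struct() and get_rsb_struct() lets the
+   allocation (which may sleep) happen before the rsbtbl spinlock is
+   taken, while get_rsb_struct() only dequeues under the lock.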
*/ + +static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, + struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int count; + + spin_lock(&ls->ls_new_rsb_spin); + if (list_empty(&ls->ls_new_rsb)) { + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + log_debug(ls, "find_rsb retry %d %d %s", + count, dlm_config.ci_new_rsb_count, + (const char *)name); + return -EAGAIN; + } + + r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); + list_del(&r->res_hashchain); + /* Convert the empty list_head to a NULL rb_node for tree usage: */ + memset(&r->res_hashnode, 0, sizeof(struct rb_node)); + ls->ls_new_rsb_count--; + spin_unlock(&ls->ls_new_rsb_spin); + + r->res_ls = ls; + r->res_length = len; + memcpy(r->res_name, name, len); + mutex_init(&r->res_mutex); + + INIT_LIST_HEAD(&r->res_lookup); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + INIT_LIST_HEAD(&r->res_root_list); + INIT_LIST_HEAD(&r->res_recover_list); + + *r_ret = r; + return 0; +} + +static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +{ + char maxname[DLM_RESNAME_MAXLEN]; + + memset(maxname, 0, DLM_RESNAME_MAXLEN); + memcpy(maxname, name, nlen); + return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); +} + +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, + struct dlm_rsb **r_ret) +{ + struct rb_node *node = tree->rb_node; + struct dlm_rsb *r; + int rc; + + while (node) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + rc = rsb_cmp(r, name, len); + if (rc < 0) + node = node->rb_left; + else if (rc > 0) + node = node->rb_right; + else + goto found; + } + *r_ret = NULL; + return -EBADR; + + found: + *r_ret = r; + return 0; +} + +static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +{ + struct rb_node **newn = &tree->rb_node; + struct rb_node *parent = NULL; + int rc; + + while (*newn) { + struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, + res_hashnode); + + parent = *newn; + rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); + if (rc < 0) + newn = &parent->rb_left; + else if (rc > 0) + newn = &parent->rb_right; + else { + log_print("rsb_insert match"); + dlm_dump_rsb(rsb); + dlm_dump_rsb(cur); + return -EEXIST; + } + } + + rb_link_node(&rsb->res_hashnode, parent, newn); + rb_insert_color(&rsb->res_hashnode, tree); + return 0; +} + +/* + * Find rsb in rsbtbl and potentially create/add one + * + * Delaying the release of rsb's has a similar benefit to applications keeping + * NL locks on an rsb, but without the guarantee that the cached master value + * will still be valid when the rsb is reused. Apps aren't always smart enough + * to keep NL locks on an rsb that they may lock again shortly; this can lead + * to excessive master lookups and removals if we don't delay the release. + * + * Searching for an rsb means looking through both the normal list and toss + * list. When found on the toss list the rsb is moved to the normal list with + * ref count of 1; when found on normal list the ref count is incremented. + * + * rsb's on the keep list are being used locally and refcounted. + * rsb's on the toss list are not being used locally, and are not refcounted. 
+ * + * The toss list rsb's were either + * - previously used locally but not any more (were on keep list, then + * moved to toss list when last refcount dropped) + * - created and put on toss list as a directory record for a lookup + * (we are the dir node for the res, but are not using the res right now, + * but some other node is) + * + * The purpose of find_rsb() is to return a refcounted rsb for local use. + * So, if the given rsb is on the toss list, it is moved to the keep list + * before being returned. + * + * toss_rsb() happens when all local usage of the rsb is done, i.e. no + * more refcounts exist, so the rsb is moved from the keep list to the + * toss list. + * + * rsb's on both keep and toss lists are used for doing a name to master + * lookups. rsb's that are in use locally (and being refcounted) are on + * the keep list, rsb's that are not in use locally (not refcounted) and + * only exist for name/master lookups are on the toss list. + * + * rsb's on the toss list who's dir_nodeid is not local can have stale + * name/master mappings. So, remote requests on such rsb's can potentially + * return with an error, which means the mapping is stale and needs to + * be updated with a new lookup. (The idea behind MASTER UNCERTAIN and + * first_lkid is to keep only a single outstanding request on an rsb + * while that rsb has a potentially stale master.) + */ + +static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, + uint32_t hash, uint32_t b, + int dir_nodeid, int from_nodeid, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL; + int our_nodeid = dlm_our_nodeid(); + int from_local = 0; + int from_other = 0; + int from_dir = 0; + int create = 0; + int error; + + if (flags & R_RECEIVE_REQUEST) { + if (from_nodeid == dir_nodeid) + from_dir = 1; + else + from_other = 1; + } else if (flags & R_REQUEST) { + from_local = 1; + } + + /* + * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so + * from_nodeid has sent us a lock in dlm_recover_locks, believing + * we're the new master. Our local recovery may not have set + * res_master_nodeid to our_nodeid yet, so allow either. Don't + * create the rsb; dlm_recover_process_copy() will handle EBADR + * by resending. + * + * If someone sends us a request, we are the dir node, and we do + * not find the rsb anywhere, then recreate it. This happens if + * someone sends us a request after we have removed/freed an rsb + * from our toss list. (They sent a request instead of lookup + * because they are using an rsb from their toss list.) + */ + + if (from_local || from_dir || + (from_other && (dir_nodeid == our_nodeid))) { + create = 1; + } + + retry: + if (create) { + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + } + + spin_lock(&ls->ls_rsbtbl[b].lock); + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (error) + goto do_toss; + + /* + * rsb is active, so we can't check master_nodeid without lock_rsb. + */ + + kref_get(&r->res_ref); + goto out_unlock; + + + do_toss: + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto do_new; + + /* + * rsb found inactive (master_nodeid may be out of date unless + * we are the dir_nodeid or were the master) No other thread + * is using this rsb because it's on the toss list, so we can + * look at or update res_master_nodeid without lock_rsb. 
+ */ + + if ((r->res_master_nodeid != our_nodeid) && from_other) { + /* our rsb was not master, and another node (not the dir node) + has sent us a request */ + log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s", + from_nodeid, r->res_master_nodeid, dir_nodeid, + r->res_name); + error = -ENOTBLK; + goto out_unlock; + } + + if ((r->res_master_nodeid != our_nodeid) && from_dir) { + /* don't think this should ever happen */ + log_error(ls, "find_rsb toss from_dir %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + /* fix it and go on */ + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } + + if (from_local && (r->res_master_nodeid != our_nodeid)) { + /* Because we have held no locks on this rsb, + res_master_nodeid could have become stale. */ + rsb_set_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + goto out_unlock; + + + do_new: + /* + * rsb not found + */ + + if (error == -EBADR && !create) + goto out_unlock; + + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = dir_nodeid; + kref_init(&r->res_ref); + + if (from_dir) { + /* want to see how often this happens */ + log_debug(ls, "find_rsb new from_dir %d recreate %s", + from_nodeid, r->res_name); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + goto out_add; + } + + if (from_other && (dir_nodeid != our_nodeid)) { + /* should never happen */ + log_error(ls, "find_rsb new from_other %d dir %d our %d %s", + from_nodeid, dir_nodeid, our_nodeid, r->res_name); + dlm_free_rsb(r); + r = NULL; + error = -ENOTBLK; + goto out_unlock; + } + + if (from_other) { + log_debug(ls, "find_rsb new from_other %d dir %d %s", + from_nodeid, dir_nodeid, r->res_name); + } + + if (dir_nodeid == our_nodeid) { + /* When we are the dir nodeid, we can set the master + node immediately */ + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } else { + /* set_master will send_lookup to dir_nodeid */ + r->res_master_nodeid = 0; + r->res_nodeid = -1; + } + + out_add: + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + out: + *r_ret = r; + return error; +} + +/* During recovery, other nodes can send us new MSTCPY locks (from + dlm_recover_locks) before we've made ourself master (in + dlm_recover_masters). */ + +static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, + uint32_t hash, uint32_t b, + int dir_nodeid, int from_nodeid, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL; + int our_nodeid = dlm_our_nodeid(); + int recover = (flags & R_RECEIVE_RECOVER); + int error; + + retry: + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + + spin_lock(&ls->ls_rsbtbl[b].lock); + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (error) + goto do_toss; + + /* + * rsb is active, so we can't check master_nodeid without lock_rsb. + */ + + kref_get(&r->res_ref); + goto out_unlock; + + + do_toss: + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto do_new; + + /* + * rsb found inactive. 
No other thread is using this rsb because + * it's on the toss list, so we can look at or update + * res_master_nodeid without lock_rsb. + */ + + if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) { + /* our rsb is not master, and another node has sent us a + request; this should never happen */ + log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d", + from_nodeid, r->res_master_nodeid, dir_nodeid); + dlm_print_rsb(r); + error = -ENOTBLK; + goto out_unlock; + } + + if (!recover && (r->res_master_nodeid != our_nodeid) && + (dir_nodeid == our_nodeid)) { + /* our rsb is not master, and we are dir; may as well fix it; + this should never happen */ + log_error(ls, "find_rsb toss our %d master %d dir %d", + our_nodeid, r->res_master_nodeid, dir_nodeid); + dlm_print_rsb(r); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + goto out_unlock; + + + do_new: + /* + * rsb not found + */ + + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = dir_nodeid; + r->res_master_nodeid = dir_nodeid; + r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid; + kref_init(&r->res_ref); + + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + out: + *r_ret = r; + return error; +} + +static int find_rsb(struct dlm_ls *ls, const void *name, int len, + int from_nodeid, unsigned int flags, + struct dlm_rsb **r_ret) +{ + uint32_t hash, b; + int dir_nodeid; + + if (len > DLM_RESNAME_MAXLEN) + return -EINVAL; + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + + if (dlm_no_directory(ls)) + return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid, + from_nodeid, flags, r_ret); + else + return find_rsb_dir(ls, name, len, hash, b, dir_nodeid, + from_nodeid, flags, r_ret); +} + +/* we have received a request and found that res_master_nodeid != our_nodeid, + so we need to return an error or make ourself the master */ + +static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r, + int from_nodeid) +{ + if (dlm_no_directory(ls)) { + log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d", + from_nodeid, r->res_master_nodeid, + r->res_dir_nodeid); + dlm_print_rsb(r); + return -ENOTBLK; + } + + if (from_nodeid != r->res_dir_nodeid) { + /* our rsb is not master, and another node (not the dir node) + has sent us a request. this is much more common when our + master_nodeid is zero, so limit debug to non-zero. 
*/ + + if (r->res_master_nodeid) { + log_debug(ls, "validate master from_other %d master %d " + "dir %d first %x %s", from_nodeid, + r->res_master_nodeid, r->res_dir_nodeid, + r->res_first_lkid, r->res_name); + } + return -ENOTBLK; + } else { + /* our rsb is not master, but the dir nodeid has sent us a + request; this could happen with master 0 / res_nodeid -1 */ + + if (r->res_master_nodeid) { + log_error(ls, "validate master from_dir %d master %d " + "first %x %s", + from_nodeid, r->res_master_nodeid, + r->res_first_lkid, r->res_name); + } + + r->res_master_nodeid = dlm_our_nodeid(); + r->res_nodeid = 0; + return 0; + } +} + +static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid, + int from_nodeid, bool toss_list, unsigned int flags, + int *r_nodeid, int *result) +{ + int fix_master = (flags & DLM_LU_RECOVER_MASTER); + int from_master = (flags & DLM_LU_RECOVER_DIR); + + if (r->res_dir_nodeid != our_nodeid) { + /* should not happen, but may as well fix it and carry on */ + log_error(ls, "%s res_dir %d our %d %s", __func__, + r->res_dir_nodeid, our_nodeid, r->res_name); + r->res_dir_nodeid = our_nodeid; + } + + if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) { + /* Recovery uses this function to set a new master when + * the previous master failed. Setting NEW_MASTER will + * force dlm_recover_masters to call recover_master on this + * rsb even though the res_nodeid is no longer removed. + */ + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + + if (toss_list) { + /* I don't think we should ever find it on toss list. */ + log_error(ls, "%s fix_master on toss", __func__); + dlm_dump_rsb(r); + } + } + + if (from_master && (r->res_master_nodeid != from_nodeid)) { + /* this will happen if from_nodeid became master during + * a previous recovery cycle, and we aborted the previous + * cycle before recovering this master value + */ + + log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s", + __func__, from_nodeid, r->res_master_nodeid, + r->res_nodeid, r->res_first_lkid, r->res_name); + + if (r->res_master_nodeid == our_nodeid) { + log_error(ls, "from_master %d our_master", from_nodeid); + dlm_dump_rsb(r); + goto ret_assign; + } + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + } + + if (!r->res_master_nodeid) { + /* this will happen if recovery happens while we're looking + * up the master for this rsb + */ + + log_debug(ls, "%s master 0 to %d first %x %s", __func__, + from_nodeid, r->res_first_lkid, r->res_name); + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + } + + if (!from_master && !fix_master && + (r->res_master_nodeid == from_nodeid)) { + /* this can happen when the master sends remove, the dir node + * finds the rsb on the keep list and ignores the remove, + * and the former master sends a lookup + */ + + log_limit(ls, "%s from master %d flags %x first %x %s", + __func__, from_nodeid, flags, r->res_first_lkid, + r->res_name); + } + + ret_assign: + *r_nodeid = r->res_master_nodeid; + if (result) + *result = DLM_LU_MATCH; +} + +/* + * We're the dir node for this res and another node wants to know the + * master nodeid. During normal operation (non recovery) this is only + * called from receive_lookup(); master lookups when the local node is + * the dir node are done by find_rsb(). + * + * normal operation, we are the dir node for a resource + * . _request_lock + * . set_master + * . 
send_lookup + * . receive_lookup + * . dlm_master_lookup flags 0 + * + * recover directory, we are rebuilding dir for all resources + * . dlm_recover_directory + * . dlm_rcom_names + * remote node sends back the rsb names it is master of and we are dir of + * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1) + * we either create new rsb setting remote node as master, or find existing + * rsb and set master to be the remote node. + * + * recover masters, we are finding the new master for resources + * . dlm_recover_masters + * . recover_master + * . dlm_send_rcom_lookup + * . receive_rcom_lookup + * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0) + */ + +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result) +{ + struct dlm_rsb *r = NULL; + uint32_t hash, b; + int our_nodeid = dlm_our_nodeid(); + int dir_nodeid, error; + + if (len > DLM_RESNAME_MAXLEN) + return -EINVAL; + + if (from_nodeid == our_nodeid) { + log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x", + our_nodeid, flags); + return -EINVAL; + } + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + if (dir_nodeid != our_nodeid) { + log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d", + from_nodeid, dir_nodeid, our_nodeid, hash, + ls->ls_num_nodes); + *r_nodeid = -1; + return -EINVAL; + } + + retry: + error = pre_rsb_struct(ls); + if (error < 0) + return error; + + spin_lock(&ls->ls_rsbtbl[b].lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!error) { + /* because the rsb is active, we need to lock_rsb before + * checking/changing re_master_nodeid + */ + + hold_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + lock_rsb(r); + + __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false, + flags, r_nodeid, result); + + /* the rsb was active */ + unlock_rsb(r); + put_rsb(r); + + return 0; + } + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto not_found; + + /* because the rsb is inactive (on toss list), it's not refcounted + * and lock_rsb is not used, but is protected by the rsbtbl lock + */ + + __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags, + r_nodeid, result); + + r->res_toss_time = jiffies; + /* the rsb was inactive (on toss list) */ + spin_unlock(&ls->ls_rsbtbl[b].lock); + + return 0; + + not_found: + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = our_nodeid; + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + kref_init(&r->res_ref); + r->res_toss_time = jiffies; + + error = rsb_insert(r, &ls->ls_rsbtbl[b].toss); + if (error) { + /* should never happen */ + dlm_free_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + + if (result) + *result = DLM_LU_ADD; + *r_nodeid = from_nodeid; + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + return error; +} + +static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + if (r->res_hash == hash) + dlm_dump_rsb(r); + } + 
spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len) +{ + struct dlm_rsb *r = NULL; + uint32_t hash, b; + int error; + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[b].lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!error) + goto out_dump; + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto out; + out_dump: + dlm_dump_rsb(r); + out: + spin_unlock(&ls->ls_rsbtbl[b].lock); +} + +static void toss_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + struct dlm_ls *ls = r->res_ls; + + DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); + kref_init(&r->res_ref); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); + rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); + r->res_toss_time = jiffies; + set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[r->res_bucket].flags); + if (r->res_lvbptr) { + dlm_free_lvb(r->res_lvbptr); + r->res_lvbptr = NULL; + } +} + +/* See comment for unhold_lkb */ + +static void unhold_rsb(struct dlm_rsb *r) +{ + int rv; + rv = kref_put(&r->res_ref, toss_rsb); + DLM_ASSERT(!rv, dlm_dump_rsb(r);); +} + +static void kill_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the remove and free. */ + + DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r);); +} + +/* Attaching/detaching lkb's from rsb's is for rsb reference counting. + The rsb must exist as long as any lkb's for it do. 
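
The rule in this comment, that an rsb must outlive every lkb attached to it, is enforced with kref-style reference counting: attach_lkb() takes a reference, detach_lkb() drops it, and the release callback (toss_rsb() above) only runs on the final put. A compressed userspace sketch of that pattern, using C11 atomics and invented names rather than the kernel's kref:

#include <stdatomic.h>
#include <stdio.h>

struct resource {
	atomic_int refcount;
	int tossed;                 /* set by the release callback */
};

static void resource_get(struct resource *r)
{
	atomic_fetch_add(&r->refcount, 1);
}

/* returns 1 if this put dropped the last reference (release ran) */
static int resource_put(struct resource *r, void (*release)(struct resource *))
{
	if (atomic_fetch_sub(&r->refcount, 1) == 1) {
		release(r);
		return 1;
	}
	return 0;
}

static void resource_release(struct resource *r)
{
	r->tossed = 1;              /* e.g. park it on an inactive list */
}

int main(void)
{
	struct resource r = { .refcount = 1 };   /* creation reference */

	resource_get(&r);                        /* an lkb attaches */
	resource_put(&r, resource_release);      /* the lkb detaches */
	resource_put(&r, resource_release);      /* last ref: release runs */

	printf("tossed=%d\n", r.tossed);
	return 0;
}

The unhold_rsb()/unhold_lkb() helpers nearby are the same put operation plus an assertion that it must never be the one that drops the final reference.
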
*/ + +static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + hold_rsb(r); + lkb->lkb_resource = r; +} + +static void detach_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_resource) { + put_rsb(lkb->lkb_resource); + lkb->lkb_resource = NULL; + } +} + +static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, + int start, int end) +{ + struct dlm_lkb *lkb; + int rv; + + lkb = dlm_allocate_lkb(ls); + if (!lkb) + return -ENOMEM; + + lkb->lkb_last_bast_mode = -1; + lkb->lkb_nodeid = -1; + lkb->lkb_grmode = DLM_LOCK_IV; + kref_init(&lkb->lkb_ref); + INIT_LIST_HEAD(&lkb->lkb_ownqueue); + INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); + INIT_LIST_HEAD(&lkb->lkb_cb_list); + INIT_LIST_HEAD(&lkb->lkb_callbacks); + spin_lock_init(&lkb->lkb_cb_lock); + INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); + + idr_preload(GFP_NOFS); + spin_lock(&ls->ls_lkbidr_spin); + rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT); + if (rv >= 0) + lkb->lkb_id = rv; + spin_unlock(&ls->ls_lkbidr_spin); + idr_preload_end(); + + if (rv < 0) { + log_error(ls, "create_lkb idr error %d", rv); + dlm_free_lkb(lkb); + return rv; + } + + *lkb_ret = lkb; + return 0; +} + +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + return _create_lkb(ls, lkb_ret, 1, 0); +} + +static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb; + + spin_lock(&ls->ls_lkbidr_spin); + lkb = idr_find(&ls->ls_lkbidr, lkid); + if (lkb) + kref_get(&lkb->lkb_ref); + spin_unlock(&ls->ls_lkbidr_spin); + + *lkb_ret = lkb; + return lkb ? 0 : -ENOENT; +} + +static void kill_lkb(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the detach_lkb */ + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); +} + +/* __put_lkb() is used when an lkb may not have an rsb attached to + it so we need to provide the lockspace explicitly */ + +static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint32_t lkid = lkb->lkb_id; + int rv; + + rv = kref_put_lock(&lkb->lkb_ref, kill_lkb, + &ls->ls_lkbidr_spin); + if (rv) { + idr_remove(&ls->ls_lkbidr, lkid); + spin_unlock(&ls->ls_lkbidr_spin); + + detach_lkb(lkb); + + /* for local/process lkbs, lvbptr points to caller's lksb */ + if (lkb->lkb_lvbptr && is_master_copy(lkb)) + dlm_free_lvb(lkb->lkb_lvbptr); + dlm_free_lkb(lkb); + } + + return rv; +} + +int dlm_put_lkb(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls; + + DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb);); + DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb);); + + ls = lkb->lkb_resource->res_ls; + return __put_lkb(ls, lkb); +} + +/* This is only called to add a reference when the code already holds + a valid reference to the lkb, so there's no need for locking. */ + +static inline void hold_lkb(struct dlm_lkb *lkb) +{ + kref_get(&lkb->lkb_ref); +} + +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + +/* This is called when we need to remove a reference and are certain + it's not the last ref. e.g. del_lkb is always called between a + find_lkb/put_lkb and is always the inverse of a previous add_lkb. 
+ put_lkb would work fine, but would involve unnecessary locking */ + +static inline void unhold_lkb(struct dlm_lkb *lkb) +{ + kref_put(&lkb->lkb_ref, unhold_lkb_assert); +} + +static void lkb_add_ordered(struct list_head *new, struct list_head *head, + int mode) +{ + struct dlm_lkb *lkb = NULL, *iter; + + list_for_each_entry(iter, head, lkb_statequeue) + if (iter->lkb_rqmode < mode) { + lkb = iter; + list_add_tail(new, &iter->lkb_statequeue); + break; + } + + if (!lkb) + list_add_tail(new, head); +} + +/* add/remove lkb to rsb's grant/convert/wait queue */ + +static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) +{ + kref_get(&lkb->lkb_ref); + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + + lkb->lkb_timestamp = ktime_get(); + + lkb->lkb_status = status; + + switch (status) { + case DLM_LKSTS_WAITING: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + case DLM_LKSTS_GRANTED: + /* convention says granted locks kept in order of grmode */ + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + case DLM_LKSTS_CONVERT: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + default: + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status);); + } +} + +static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + unhold_lkb(lkb); +} + +static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts) +{ + hold_lkb(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, sts); + unhold_lkb(lkb); +} + +static int msg_reply_type(int mstype) +{ + switch (mstype) { + case DLM_MSG_REQUEST: + return DLM_MSG_REQUEST_REPLY; + case DLM_MSG_CONVERT: + return DLM_MSG_CONVERT_REPLY; + case DLM_MSG_UNLOCK: + return DLM_MSG_UNLOCK_REPLY; + case DLM_MSG_CANCEL: + return DLM_MSG_CANCEL_REPLY; + case DLM_MSG_LOOKUP: + return DLM_MSG_LOOKUP_REPLY; + } + return -1; +} + +/* add/remove lkb from global waiters list of lkb's waiting for + a reply from a remote node */ + +static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error = 0; + int wc; + + mutex_lock(&ls->ls_waiters_mutex); + + if (is_overlap_unlock(lkb) || + (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) { + error = -EINVAL; + goto out; + } + + if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) { + switch (mstype) { + case DLM_MSG_UNLOCK: + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); + break; + case DLM_MSG_CANCEL: + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + break; + default: + error = -EBUSY; + goto out; + } + wc = atomic_inc_return(&lkb->lkb_wait_count); + hold_lkb(lkb); + + log_debug(ls, "addwait %x cur %d overlap %d count %d f %x", + lkb->lkb_id, lkb->lkb_wait_type, mstype, wc, + dlm_iflags_val(lkb)); + goto out; + } + + wc = atomic_fetch_inc(&lkb->lkb_wait_count); + DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc);); + lkb->lkb_wait_type = mstype; + lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ + hold_lkb(lkb); + list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); + out: + if (error) + log_error(ls, "addwait error %x %d flags %x %d %d %s", + lkb->lkb_id, error, dlm_iflags_val(lkb), mstype, + lkb->lkb_wait_type, lkb->lkb_resource->res_name); + 
mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* We clear the RESEND flag because we might be taking an lkb off the waiters + list as part of process_requestqueue (e.g. a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + +static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, + const struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int overlap_done = 0; + + if (mstype == DLM_MSG_UNLOCK_REPLY && + test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) { + log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id); + overlap_done = 1; + goto out_del; + } + + if (mstype == DLM_MSG_CANCEL_REPLY && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) { + log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id); + overlap_done = 1; + goto out_del; + } + + /* Cancel state was preemptively cleared by a successful convert, + see next comment, nothing to do. */ + + if ((mstype == DLM_MSG_CANCEL_REPLY) && + (lkb->lkb_wait_type != DLM_MSG_CANCEL)) { + log_debug(ls, "remwait %x cancel_reply wait_type %d", + lkb->lkb_id, lkb->lkb_wait_type); + return -1; + } + + /* Remove for the convert reply, and premptively remove for the + cancel reply. A convert has been granted while there's still + an outstanding cancel on it (the cancel is moot and the result + in the cancel reply should be 0). We preempt the cancel reply + because the app gets the convert result and then can follow up + with another op, like convert. This subsequent op would see the + lingering state of the cancel and fail with -EBUSY. */ + + if ((mstype == DLM_MSG_CONVERT_REPLY) && + (lkb->lkb_wait_type == DLM_MSG_CONVERT) && ms && !ms->m_result && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) { + log_debug(ls, "remwait %x convert_reply zap overlap_cancel", + lkb->lkb_id); + lkb->lkb_wait_type = 0; + atomic_dec(&lkb->lkb_wait_count); + unhold_lkb(lkb); + goto out_del; + } + + /* N.B. type of reply may not always correspond to type of original + msg due to lookup->request optimization, verify others? */ + + if (lkb->lkb_wait_type) { + lkb->lkb_wait_type = 0; + goto out_del; + } + + log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", + lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0, + lkb->lkb_remid, mstype, dlm_iflags_val(lkb)); + return -1; + + out_del: + /* the force-unlock/cancel has completed and we haven't recvd a reply + to the op that was in progress prior to the unlock/cancel; we + give up on any reply to the earlier op. 
FIXME: not sure when/how + this would happen */ + + if (overlap_done && lkb->lkb_wait_type) { + log_error(ls, "remwait error %x reply %d wait_type %d overlap", + lkb->lkb_id, mstype, lkb->lkb_wait_type); + atomic_dec(&lkb->lkb_wait_count); + unhold_lkb(lkb); + lkb->lkb_wait_type = 0; + } + + DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb);); + + clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); + if (atomic_dec_and_test(&lkb->lkb_wait_count)) + list_del_init(&lkb->lkb_wait_reply); + unhold_lkb(lkb); + return 0; +} + +static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, mstype, NULL); + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* Handles situations where we might be processing a "fake" or "local" reply in + which we can't try to take waiters_mutex again. */ + +static int remove_from_waiters_ms(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + if (!local) + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms); + if (!local) + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +static void shrink_bucket(struct dlm_ls *ls, int b) +{ + struct rb_node *n, *next; + struct dlm_rsb *r; + char *name; + int our_nodeid = dlm_our_nodeid(); + int remote_count = 0; + int need_shrink = 0; + int i, len, rv; + + memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); + + spin_lock(&ls->ls_rsbtbl[b].lock); + + if (!test_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { + next = rb_next(n); + r = rb_entry(n, struct dlm_rsb, res_hashnode); + + /* If we're the directory record for this rsb, and + we're not the master of it, then we need to wait + for the master node to send us a dir remove for + before removing the dir record. */ + + if (!dlm_no_directory(ls) && + (r->res_master_nodeid != our_nodeid) && + (dlm_dir_nodeid(r) == our_nodeid)) { + continue; + } + + need_shrink = 1; + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + continue; + } + + if (!dlm_no_directory(ls) && + (r->res_master_nodeid == our_nodeid) && + (dlm_dir_nodeid(r) != our_nodeid)) { + + /* We're the master of this rsb but we're not + the directory record, so we need to tell the + dir node to remove the dir record. */ + + ls->ls_remove_lens[remote_count] = r->res_length; + memcpy(ls->ls_remove_names[remote_count], r->res_name, + DLM_RESNAME_MAXLEN); + remote_count++; + + if (remote_count >= DLM_REMOVE_NAMES_MAX) + break; + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { + log_error(ls, "tossed rsb in use %s", r->res_name); + continue; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + dlm_free_rsb(r); + } + + if (need_shrink) + set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); + else + clear_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + /* + * While searching for rsb's to free, we found some that require + * remote removal. We leave them in place and find them again here + * so there is a very small gap between removing them from the toss + * list and sending the removal. 
Keeping this gap small is + * important to keep us (the master node) from being out of sync + * with the remote dir node for very long. + */ + + for (i = 0; i < remote_count; i++) { + name = ls->ls_remove_names[i]; + len = ls->ls_remove_lens[i]; + + spin_lock(&ls->ls_rsbtbl[b].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name not toss %s", name); + continue; + } + + if (r->res_master_nodeid != our_nodeid) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name master %d dir %d our %d %s", + r->res_master_nodeid, r->res_dir_nodeid, + our_nodeid, name); + continue; + } + + if (r->res_dir_nodeid == our_nodeid) { + /* should never happen */ + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name dir %d master %d our %d %s", + r->res_dir_nodeid, r->res_master_nodeid, + our_nodeid, name); + continue; + } + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name toss_time %lu now %lu %s", + r->res_toss_time, jiffies, name); + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name in use %s", name); + continue; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + send_remove(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + dlm_free_rsb(r); + } +} + +void dlm_scan_rsbs(struct dlm_ls *ls) +{ + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + shrink_bucket(ls, i); + if (dlm_locking_stopped(ls)) + break; + cond_resched(); + } +} + +/* lkb is master or local copy */ + +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int b, len = r->res_ls->ls_lvblen; + + /* b=1 lvb returned to caller + b=0 lvb written to rsb or invalidated + b=-1 do nothing */ + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + + if (b == 1) { + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + return; + + memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len); + lkb->lkb_lvbseq = r->res_lvbseq; + + } else if (b == 0) { + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len); + r->res_lvbseq++; + lkb->lkb_lvbseq = r->res_lvbseq; + rsb_clear_flag(r, RSB_VALNOTVALID); + } + + if (rsb_flag(r, RSB_VALNOTVALID)) + set_bit(DLM_SBF_VALNOTVALID_BIT, &lkb->lkb_sbflags); +} + +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode < DLM_LOCK_PW) + return; + + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + r->res_lvbseq++; + rsb_clear_flag(r, RSB_VALNOTVALID); +} + +/* lkb is process copy (pc) */ + +static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + int b; + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + b = 
dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b == 1) { + int len = receive_extralen(ms); + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq); + } +} + +/* Manipulate lkb's on rsb's convert/granted/waiting queues + remove_lock -- used for unlock, removes lkb from granted + revert_lock -- used for cancel, moves lkb from convert to granted + grant_lock -- used for request and convert, adds lkb to granted or + moves lkb from convert or waiting to granted + + Each of these is used for master or local copy lkb's. There is + also a _pc() variation used to make the corresponding change on + a process copy (pc) lkb. */ + +static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); +} + +static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_unlock(r, lkb); + _remove_lock(r, lkb); +} + +static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + _remove_lock(r, lkb); +} + +/* returns: 0 did nothing + 1 moved lock to granted + -1 removed lock */ + +static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int rv = 0; + + lkb->lkb_rqmode = DLM_LOCK_IV; + + switch (lkb->lkb_status) { + case DLM_LKSTS_GRANTED: + break; + case DLM_LKSTS_CONVERT: + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + rv = 1; + break; + case DLM_LKSTS_WAITING: + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); + rv = -1; + break; + default: + log_print("invalid status for revert %d", lkb->lkb_status); + } + return rv; +} + +static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return revert_lock(r, lkb); +} + +static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + if (lkb->lkb_status) + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + else + add_lkb(r, lkb, DLM_LKSTS_GRANTED); + } + + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; +} + +static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_lock(r, lkb); + _grant_lock(r, lkb); +} + +static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + set_lvb_lock_pc(r, lkb, ms); + _grant_lock(r, lkb); +} + +/* called by grant_pending_locks() which means an async grant message must + be sent to the requesting node in addition to granting the lock if the + lkb belongs to a remote node. */ + +static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + grant_lock(r, lkb); + if (is_master_copy(lkb)) + send_grant(r, lkb); + else + queue_cast(r, lkb, 0); +} + +/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to + change the granted/requested modes. We're munging things accordingly in + the process copy. 
+ CONVDEADLK: our grmode may have been forced down to NL to resolve a + conversion deadlock + ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become + compatible with other granted locks */ + +static void munge_demoted(struct dlm_lkb *lkb) +{ + if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { + log_print("munge_demoted %x invalid modes gr %d rq %d", + lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); + return; + } + + lkb->lkb_grmode = DLM_LOCK_NL; +} + +static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms) +{ + if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) && + ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) { + log_print("munge_altmode %x invalid reply type %d", + lkb->lkb_id, le32_to_cpu(ms->m_type)); + return; + } + + if (lkb->lkb_exflags & DLM_LKF_ALTPR) + lkb->lkb_rqmode = DLM_LOCK_PR; + else if (lkb->lkb_exflags & DLM_LKF_ALTCW) + lkb->lkb_rqmode = DLM_LOCK_CW; + else { + log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags); + dlm_print_lkb(lkb); + } +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, + lkb_statequeue); + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* Check if the given lkb conflicts with another lkb on the queue. */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (!modes_compat(this, lkb)) + return 1; + } + return 0; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + * + * Part 2: if the granted mode of lkb is preventing an earlier lkb in the + * convert queue from being granted, then deadlk/demote lkb. + * + * Example: + * Granted Queue: empty + * Convert Queue: NL->EX (first lock) + * PR->EX (second lock) + * + * The first lock can't be granted because of the granted mode of the second + * lock and the second lock can't be granted because it's not first in the + * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we + * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK + * flag set and return DEMOTED in the lksb flags. + * + * Originally, this function detected conv-deadlk in a more limited scope: + * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or + * - if lkb1 was the first entry in the queue (not just earlier), and was + * blocked by the granted mode of lkb2, and there was nothing on the + * granted queue preventing lkb1 from being granted immediately, i.e. + * lkb2 was the only thing preventing lkb1 from being granted. + * + * That second condition meant we'd only say there was conv-deadlk if + * resolving it (by demotion) would lead to the first lock on the convert + * queue being granted right away. It allowed conversion deadlocks to exist + * between locks on the convert queue while they couldn't be granted anyway. + * + * Now, we detect and take action on conversion deadlocks immediately when + * they're created, even if they may not be immediately consequential. If + * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted + * mode that would prevent lkb1's conversion from being granted, we do a + * deadlk/demote on lkb2 right away and don't let it onto the convert queue. 
+ * I think this means that the lkb_is_ahead condition below should always + * be zero, i.e. there will never be conv-deadlk between two locks that are + * both already on the convert queue. + */ + +static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) +{ + struct dlm_lkb *lkb1; + int lkb_is_ahead = 0; + + list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) { + if (lkb1 == lkb2) { + lkb_is_ahead = 1; + continue; + } + + if (!lkb_is_ahead) { + if (!modes_compat(lkb2, lkb1)) + return 1; + } else { + if (!modes_compat(lkb2, lkb1) && + !modes_compat(lkb1, lkb2)) + return 1; + } + } + return 0; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * recover is 1 if dlm_recover_grant() is trying to grant conversions + * after recovery. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int recover) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE) + return 1; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + return 0; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + return 0; + + /* + * The RECOVER_GRANT flag means dlm_recover_grant() is granting + * locks for a recovered rsb, on which lkb's have been rebuilt. + * The lkb's may have been rebuilt on the queues in a different + * order than they were in on the previous master. So, granting + * queued conversions in order after recovery doesn't make sense + * since the order hasn't been preserved anyway. The new order + * could also have created a new "in place" conversion deadlock. + * (e.g. old, failed master held granted EX, with PR->EX, NL->EX. + * After recovery, there would be no granted locks, and possibly + * NL->EX, PR->EX, an in-place conversion deadlock.) So, after + * recovery, grant conversions without considering order. 
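
Stripped of the EXPEDITE, QUECVT, NOORDER and recovery special cases handled in this function, the checks below reduce to a compatibility test against the locks already granted or converting plus first-come-first-served ordering. The toy function here shows only the 6-4 "immediately grant a new request" rule from the comments, using the conventional six-mode compatibility table; the table layout and names are illustrative, not the kernel's __dlm_compat_matrix.

#include <stdio.h>

enum { NL, CR, CW, PR, PW, EX };                   /* lock modes */

/* conventional DLM compatibility table: compat[granted][requested] */
static const int compat[6][6] = {
	/*        NL CR CW PR PW EX */
	/* NL */ { 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 0, 0, 0, 0, 0 },
};

/* 6-4 rule: a brand-new request is granted right away only if nothing is
 * converting, nothing is waiting, and it is compatible with every granted
 * mode (the most restrictive granted mode decides) */
static int can_grant_new(int rqmode, const int *granted, int ngr,
			 int nconverting, int nwaiting)
{
	if (nconverting || nwaiting)
		return 0;
	for (int i = 0; i < ngr; i++)
		if (!compat[granted[i]][rqmode])
			return 0;
	return 1;
}

int main(void)
{
	int granted[] = { PR, PR };

	printf("CR request: %d\n", can_grant_new(CR, granted, 2, 0, 0));
	printf("EX request: %d\n", can_grant_new(EX, granted, 2, 0, 0));
	return 0;
}
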
+ */ + + if (conv && recover) + return 1; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + * + * DCT: This condition is all about new conversions being able to occur + * "in place" while the lock remains on the granted queue (assuming + * nothing else conflicts.) IOW if QUECVT isn't set, a conversion + * doesn't _have_ to go onto the convert queue where it's processed in + * order. The "now" variable is necessary to distinguish converts + * being received and processed for the first time now, because once a + * convert is moved to the conversion queue the condition below applies + * requiring fifo granting. + */ + + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) + return 1; + + /* + * Even if the convert is compat with all granted locks, + * QUECVT forces it behind other locks on the convert queue. + */ + + if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) { + if (list_empty(&r->res_convertqueue)) + return 1; + else + return 0; + } + + /* + * The NOORDER flag is set to avoid the standard vms rules on grant + * order. + */ + + if (lkb->lkb_exflags & DLM_LKF_NOORDER) + return 1; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return 1; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return 1; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return 1; + + return 0; +} + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int recover, int *err) +{ + int rv; + int8_t alt = 0, rqmode = lkb->lkb_rqmode; + int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV); + + if (err) + *err = 0; + + rv = _can_be_granted(r, lkb, now, recover); + if (rv) + goto out; + + /* + * The CONVDEADLK flag is non-standard and tells the dlm to resolve + * conversion deadlocks by demoting grmode to NL, otherwise the dlm + * cancels one of the locks. 
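
The classic instance of the conversion deadlock described above is two PR holders both converting to EX: each one's granted PR blocks the other's requested EX, so neither conversion can ever be granted. The sketch below detects that mutual block and, mirroring the CONVDEADLK behaviour described here, either demotes the granted mode to NL or reports a deadlock. It is only an illustration; the kernel's conversion_deadlock_detect() also, as the earlier comment explains, acts on one-sided blocking of conversions already queued.

#include <stdio.h>

enum { NL, CR, CW, PR, PW, EX };

static const int compat[6][6] = {
	{ 1, 1, 1, 1, 1, 1 },
	{ 1, 1, 1, 1, 1, 0 },
	{ 1, 1, 1, 0, 0, 0 },
	{ 1, 1, 0, 1, 0, 0 },
	{ 1, 1, 0, 0, 0, 0 },
	{ 1, 0, 0, 0, 0, 0 },
};

struct lock { int grmode, rqmode, convdeadlk; };

/* two conversions deadlock when each one's granted mode blocks the
 * other's requested mode */
static int conv_deadlock(const struct lock *a, const struct lock *b)
{
	return !compat[a->grmode][b->rqmode] && !compat[b->grmode][a->rqmode];
}

int main(void)
{
	struct lock first  = { .grmode = PR, .rqmode = EX };
	struct lock second = { .grmode = PR, .rqmode = EX, .convdeadlk = 1 };

	if (conv_deadlock(&first, &second)) {
		if (second.convdeadlk) {
			second.grmode = NL;   /* demote, report DEMOTED to the caller */
			printf("demoted second lock to NL\n");
		} else {
			printf("would fail the second conversion with a deadlock error\n");
		}
	}
	return 0;
}
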
+ */ + + if (is_convert && can_be_queued(lkb) && + conversion_deadlock_detect(r, lkb)) { + if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { + lkb->lkb_grmode = DLM_LOCK_NL; + set_bit(DLM_SBF_DEMOTED_BIT, &lkb->lkb_sbflags); + } else if (err) { + *err = -EDEADLK; + } else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); + } + goto out; + } + + /* + * The ALTPR and ALTCW flags are non-standard and tell the dlm to try + * to grant a request in a mode other than the normal rqmode. It's a + * simple way to provide a big optimization to applications that can + * use them. + */ + + if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR)) + alt = DLM_LOCK_PR; + else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW)) + alt = DLM_LOCK_CW; + + if (alt) { + lkb->lkb_rqmode = alt; + rv = _can_be_granted(r, lkb, now, 0); + if (rv) + set_bit(DLM_SBF_ALTMODE_BIT, &lkb->lkb_sbflags); + else + lkb->lkb_rqmode = rqmode; + } + out: + return rv; +} + +/* Returns the highest requested mode of all blocked conversions; sets + cw if there's a blocked conversion to DLM_LOCK_CW. */ + +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + int recover = rsb_flag(r, RSB_RECOVER_GRANT); + int hi, demoted, quit, grant_restart, demote_restart; + int deadlk; + + quit = 0; + restart: + grant_restart = 0; + demote_restart = 0; + hi = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + demoted = is_demoted(lkb); + deadlk = 0; + + if (can_be_granted(r, lkb, 0, recover, &deadlk)) { + grant_lock_pending(r, lkb); + grant_restart = 1; + if (count) + (*count)++; + continue; + } + + if (!demoted && is_demoted(lkb)) { + log_print("WARN: pending demoted %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + demote_restart = 1; + continue; + } + + if (deadlk) { + /* + * If DLM_LKB_NODLKWT flag is set and conversion + * deadlock is detected, we request blocking AST and + * down (or cancel) conversion. + */ + if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) { + if (lkb->lkb_highbast < lkb->lkb_rqmode) { + queue_bast(r, lkb, lkb->lkb_rqmode); + lkb->lkb_highbast = lkb->lkb_rqmode; + } + } else { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, + r->res_name); + dlm_dump_rsb(r); + } + continue; + } + + hi = max_t(int, lkb->lkb_rqmode, hi); + + if (cw && lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + + if (grant_restart) + goto restart; + if (demote_restart && !quit) { + quit = 1; + goto restart; + } + + return max_t(int, high, hi); +} + +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, 0, 0, NULL)) { + grant_lock_pending(r, lkb); + if (count) + (*count)++; + } else { + high = max_t(int, lkb->lkb_rqmode, high); + if (lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + } + + return high; +} + +/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked + on either the convert or waiting queue. + high is the largest rqmode of all locks blocked on the convert or + waiting queue. 
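
grant_pending_locks() below uses "high" and "cw" to decide which granted holders should receive a blocking callback, and lkb_highbast keeps a holder from being notified twice for the same level. The userspace sketch below shows only that notify-once pattern; the CW/PR special case handled by lock_requires_bast() is deliberately left out and all names are made up.

#include <stdio.h>

enum { NL, CR, CW, PR, PW, EX };

static const int compat[6][6] = {
	{ 1, 1, 1, 1, 1, 1 },
	{ 1, 1, 1, 1, 1, 0 },
	{ 1, 1, 1, 0, 0, 0 },
	{ 1, 1, 0, 1, 0, 0 },
	{ 1, 1, 0, 0, 0, 0 },
	{ 1, 0, 0, 0, 0, 0 },
};

struct holder { int grmode, highbast; };

/* notify each granted holder that blocks the highest pending mode,
 * but only if it has not already been notified at that level */
static void send_blocking_notifications(struct holder *h, int n, int high)
{
	for (int i = 0; i < n; i++) {
		if (h[i].highbast >= high)
			continue;                /* already notified */
		if (compat[h[i].grmode][high])
			continue;                /* does not block the waiter */
		printf("bast holder %d (grmode %d) for mode %d\n",
		       i, h[i].grmode, high);
		h[i].highbast = high;
	}
}

int main(void)
{
	struct holder holders[] = { { PR, 0 }, { CR, 0 } };
	int high = EX;                           /* largest blocked rqmode */

	send_blocking_notifications(holders, 2, high);
	send_blocking_notifications(holders, 2, high);  /* no duplicates */
	return 0;
}
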
*/ + +static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) +{ + if (gr->lkb_grmode == DLM_LOCK_PR && cw) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < high && + !__dlm_compat_matrix[gr->lkb_grmode+1][high+1]) + return 1; + return 0; +} + +static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + int high = DLM_LOCK_IV; + int cw = 0; + + if (!is_master(r)) { + log_print("grant_pending_locks r nodeid %d", r->res_nodeid); + dlm_dump_rsb(r); + return; + } + + high = grant_pending_convert(r, high, &cw, count); + high = grant_pending_wait(r, high, &cw, count); + + if (high == DLM_LOCK_IV) + return; + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks based on the largest requested mode (high) + * found above. + */ + + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) + queue_bast(r, lkb, DLM_LOCK_CW); + else + queue_bast(r, lkb, high); + lkb->lkb_highbast = high; + } + } +} + +static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq) +{ + if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) || + (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq)) + return 1; + return 0; +} + +static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + /* skip self when sending basts to convertqueue */ + if (gr == lkb) + continue; + if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { + queue_bast(r, gr, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); +} + +static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); + send_bast_queue(r, &r->res_convertqueue, lkb); +} + +/* set_master(r, lkb) -- set the master nodeid of a resource + + The purpose of this function is to set the nodeid field in the given + lkb using the nodeid field in the given rsb. If the rsb's nodeid is + known, it can just be copied to the lkb and the function will return + 0. If the rsb's nodeid is _not_ known, it needs to be looked up + before it can be copied to the lkb. + + When the rsb nodeid is being looked up remotely, the initial lkb + causing the lookup is kept on the ls_waiters list waiting for the + lookup reply. Other lkb's waiting for the same rsb lookup are kept + on the rsb's res_lookup list until the master is verified. 
+ + Return values: + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it + 1: the rsb master is not available and the lkb has been placed on + a wait queue +*/ + +static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int our_nodeid = dlm_our_nodeid(); + + if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = lkb->lkb_id; + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) { + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup); + return 1; + } + + if (r->res_master_nodeid == our_nodeid) { + lkb->lkb_nodeid = 0; + return 0; + } + + if (r->res_master_nodeid) { + lkb->lkb_nodeid = r->res_master_nodeid; + return 0; + } + + if (dlm_dir_nodeid(r) == our_nodeid) { + /* This is a somewhat unusual case; find_rsb will usually + have set res_master_nodeid when dir nodeid is local, but + there are cases where we become the dir node after we've + past find_rsb and go through _request_lock again. + confirm_master() or process_lookup_list() needs to be + called after this. */ + log_debug(r->res_ls, "set_master %x self master %d dir %d %s", + lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid, + r->res_name); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + lkb->lkb_nodeid = 0; + return 0; + } + + r->res_first_lkid = lkb->lkb_id; + send_lookup(r, lkb); + return 1; +} + +static void process_lookup_list(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { + list_del_init(&lkb->lkb_rsb_lookup); + _request_lock(r, lkb); + schedule(); + } +} + +/* confirm_master -- confirm (or deny) an rsb's master nodeid */ + +static void confirm_master(struct dlm_rsb *r, int error) +{ + struct dlm_lkb *lkb; + + if (!r->res_first_lkid) + return; + + switch (error) { + case 0: + case -EINPROGRESS: + r->res_first_lkid = 0; + process_lookup_list(r); + break; + + case -EAGAIN: + case -EBADR: + case -ENOTBLK: + /* the remote request failed and won't be retried (it was + a NOQUEUE, or has been canceled/unlocked); make a waiting + lkb the first_lkid */ + + r->res_first_lkid = 0; + + if (!list_empty(&r->res_lookup)) { + lkb = list_entry(r->res_lookup.next, struct dlm_lkb, + lkb_rsb_lookup); + list_del_init(&lkb->lkb_rsb_lookup); + r->res_first_lkid = lkb->lkb_id; + _request_lock(r, lkb); + } + break; + + default: + log_error(r->res_ls, "confirm_master unknown error %d", error); + } +} + +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void (*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +{ + int rv = -EINVAL; + + /* check for invalid arg usage */ + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL) + goto out; + + if (!ast || !lksb) + goto out; + + if (flags & DLM_LKF_VALBLK && 
!lksb->sb_lvbptr) + goto out; + + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) + goto out; + + /* these args will be copied to the lkb in validate_lock_args, + it cannot be done now because when converting locks, fields in + an active lkb cannot be modified before locking the rsb */ + + args->flags = flags; + args->astfn = ast; + args->astparam = astparam; + args->bastfn = bast; + args->mode = mode; + args->lksb = lksb; + rv = 0; + out: + return rv; +} + +static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) +{ + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK | + DLM_LKF_FORCEUNLOCK)) + return -EINVAL; + + if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK) + return -EINVAL; + + args->flags = flags; + args->astparam = astarg; + return 0; +} + +static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + int rv = -EBUSY; + + if (args->flags & DLM_LKF_CONVERT) { + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + goto out; + + /* lock not allowed if there's any op in progress */ + if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)) + goto out; + + if (is_overlap(lkb)) + goto out; + + rv = -EINVAL; + if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; + } + + lkb->lkb_exflags = args->flags; + dlm_set_sbflags_val(lkb, 0); + lkb->lkb_astfn = args->astfn; + lkb->lkb_astparam = args->astparam; + lkb->lkb_bastfn = args->bastfn; + lkb->lkb_rqmode = args->mode; + lkb->lkb_lksb = args->lksb; + lkb->lkb_lvbptr = args->lksb->sb_lvbptr; + lkb->lkb_ownpid = (int) current->pid; + rv = 0; + out: + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %d %d %s", __func__, + rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, + rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + } + + return rv; +} + +/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0 + for success */ + +/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here + because there may be a lookup in progress and it's valid to do + cancel/unlockf on it */ + +static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv = -EBUSY; + + /* normal unlock not allowed if there's any op in progress */ + if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) && + (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))) + goto out; + + /* an lkb may be waiting for an rsb lookup to complete where the + lookup was initiated by another lock */ + + if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); + list_del_init(&lkb->lkb_rsb_lookup); + queue_cast(lkb->lkb_resource, lkb, + args->flags & DLM_LKF_CANCEL ? 
+ -DLM_ECANCEL : -DLM_EUNLOCK); + unhold_lkb(lkb); /* undoes create_lkb() */ + } + /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ + goto out; + } + + rv = -EINVAL; + if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + * cancel, unlock or failed noqueue request; an app can't use these + * locks; return same error as if the lkid had not been found at all + */ + + if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; + goto out; + } + + /* cancel not allowed with another cancel/unlock in progress */ + + if (args->flags & DLM_LKF_CANCEL) { + if (lkb->lkb_exflags & DLM_LKF_CANCEL) + goto out; + + if (is_overlap(lkb)) + goto out; + + if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + rv = -EBUSY; + goto out; + } + + /* there's nothing to cancel */ + if (lkb->lkb_status == DLM_LKSTS_GRANTED && + !lkb->lkb_wait_type) { + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + goto out; + } + /* add_to_waiters() will set OVERLAP_CANCEL */ + goto out_ok; + } + + /* do we need to allow a force-unlock if there's a normal unlock + already in progress? in what conditions could the normal unlock + fail such that we'd want to send a force-unlock to be sure? */ + + if (args->flags & DLM_LKF_FORCEUNLOCK) { + if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK) + goto out; + + if (is_overlap_unlock(lkb)) + goto out; + + if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + goto out; + } + /* add_to_waiters() will set OVERLAP_UNLOCK */ + } + + out_ok: + /* an overlapping op shouldn't blow away exflags from other op */ + lkb->lkb_exflags |= args->flags; + dlm_set_sbflags_val(lkb, 0); + lkb->lkb_astparam = args->astparam; + rv = 0; + out: + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv, + lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv, + lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + } + + return rv; +} + +/* + * Four stage 4 varieties: + * do_request(), do_convert(), do_unlock(), do_cancel() + * These are called on the master node for the given lock and + * from the central locking logic. 
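
These do_*() functions always run against the master copy of the lock; the stage 3 wrappers further down (for example _request_lock()) either call them directly when the local node is master, or send the operation to the master node and complete it later when the reply arrives. A bare-bones sketch of that local-versus-remote dispatch shape, with invented names and a flag standing in for is_remote(r):

#include <stdio.h>

struct resource { int master_is_remote; };

static int do_request_local(struct resource *r)
{
	/* the master-side grant/queue decision would happen here */
	printf("handled locally on the master\n");
	return 0;
}

static int send_request_remote(struct resource *r)
{
	/* queue a message to the master; the real outcome arrives later
	 * in a reply handler, so only "in progress" is known here */
	printf("sent to remote master, reply pending\n");
	return -1;   /* stands in for an -EINPROGRESS style status */
}

/* the stage 3 pattern: one entry point, local or deferred remote completion */
static int request_lock_sketch(struct resource *r)
{
	if (r->master_is_remote)
		return send_request_remote(r);
	return do_request_local(r);
}

int main(void)
{
	struct resource local = { 0 }, remote = { 1 };

	request_lock_sketch(&local);
	request_lock_sketch(&remote);
	return 0;
}

The same split explains the *_effects() helpers: for a local operation they run immediately after do_*(), while for a remote lock the reply message is sent in between.
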
+ */ + +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + + if (can_be_granted(r, lkb, 1, 0, NULL)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + add_lkb(r, lkb, DLM_LKSTS_WAITING); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + int deadlk = 0; + + /* changing an existing lock may allow others to be granted */ + + if (can_be_granted(r, lkb, 1, 0, &deadlk)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + /* can_be_granted() detected that this lock would block in a conversion + deadlock, so we leave it on the granted queue and return EDEADLK in + the ast for the convert. */ + + if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + /* it's left on the granted queue */ + revert_lock(r, lkb); + queue_cast(r, lkb, -EDEADLK); + error = -EDEADLK; + goto out; + } + + /* is_demoted() means the can_be_granted() above set the grmode + to NL, and left us on the granted queue. This auto-demotion + (due to CONVDEADLK) might mean other locks, and/or this lock, are + now grantable. We have to try to grant other converting locks + before we try again to grant this one. */ + + if (is_demoted(lkb)) { + grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); + if (_can_be_granted(r, lkb, 1, 0)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + /* else fall through and move to convert queue */ + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case 0: + grant_pending_locks(r, NULL); + /* grant_pending_locks also sends basts */ + break; + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + remove_lock(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + return -DLM_EUNLOCK; +} + +static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + grant_pending_locks(r, NULL); +} + +/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ + +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = revert_lock(r, lkb); + if (error) { + queue_cast(r, lkb, -DLM_ECANCEL); + return -DLM_ECANCEL; + } + return 0; +} + +static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + if (error) + grant_pending_locks(r, NULL); +} + +/* + * Four stage 3 varieties: + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() + */ + +/* add a new lkb to a possibly new rsb, called by requesting process */ + +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + /* set_master: sets lkb nodeid from r */ + + error = set_master(r, lkb); + if (error < 0) + goto out; + if (error) { + error = 0; + goto out; + } + + if (is_remote(r)) { + /* receive_request() 
calls do_request() on remote node */ + error = send_request(r, lkb); + } else { + error = do_request(r, lkb); + /* for remote locks the request_reply is sent + between do_request and do_request_effects */ + do_request_effects(r, lkb, error); + } + out: + return error; +} + +/* change some property of an existing lkb, e.g. mode */ + +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_convert() calls do_convert() on remote node */ + error = send_convert(r, lkb); + } else { + error = do_convert(r, lkb); + /* for remote locks the convert_reply is sent + between do_convert and do_convert_effects */ + do_convert_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the granted queue */ + +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_unlock() calls do_unlock() on remote node */ + error = send_unlock(r, lkb); + } else { + error = do_unlock(r, lkb); + /* for remote locks the unlock_reply is sent + between do_unlock and do_unlock_effects */ + do_unlock_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the convert or wait queue */ + +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_cancel() calls do_cancel() on remote node */ + error = send_cancel(r, lkb); + } else { + error = do_cancel(r, lkb); + /* for remote locks the cancel_reply is sent + between do_cancel and do_cancel_effects */ + do_cancel_effects(r, lkb, error); + } + + return error; +} + +/* + * Four stage 2 varieties: + * request_lock(), convert_lock(), unlock_lock(), cancel_lock() + */ + +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + const void *name, int len, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + error = validate_lock_args(ls, lkb, args); + if (error) + return error; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) + return error; + + lock_rsb(r); + + attach_lkb(r, lkb); + lkb->lkb_lksb->sb_lkid = lkb->lkb_id; + + error = _request_lock(r, lkb); + + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = _convert_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _unlock_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _cancel_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +/* + * Two stage 1 varieties: dlm_lock() and dlm_unlock() + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + int mode, + struct dlm_lksb *lksb, + uint32_t flags, + const void *name, + unsigned int namelen, + uint32_t parent_lkid, + void (*ast) (void *astarg), + void *astarg, + void 
(*bast) (void *astarg, int mode)) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error, convert = flags & DLM_LKF_CONVERT; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + if (convert) + error = find_lkb(ls, lksb->sb_lkid, &lkb); + else + error = create_lkb(ls, &lkb); + + if (error) + goto out; + + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); + + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); + if (error) + goto out_put; + + if (convert) + error = convert_lock(ls, lkb, &args); + else + error = request_lock(ls, lkb, name, namelen, &args); + + if (error == -EINPROGRESS) + error = 0; + out_put: + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true); + + if (convert || error) + __put_lkb(ls, lkb); + if (error == -EAGAIN || error == -EDEADLK) + error = 0; + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + trace_dlm_unlock_start(ls, lkb, flags); + + error = set_unlock_args(flags, astarg, &args); + if (error) + goto out_put; + + if (flags & DLM_LKF_CANCEL) + error = cancel_lock(ls, lkb, &args); + else + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) + error = 0; + if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) + error = 0; + out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +/* + * send/receive routines for remote operations and replies + * + * send_args + * send_common + * send_request receive_request + * send_convert receive_convert + * send_unlock receive_unlock + * send_cancel receive_cancel + * send_grant receive_grant + * send_bast receive_bast + * send_lookup receive_lookup + * send_remove receive_remove + * + * send_common_reply + * receive_request_reply send_request_reply + * receive_convert_reply send_convert_reply + * receive_unlock_reply send_unlock_reply + * receive_cancel_reply send_cancel_reply + * receive_lookup_reply send_lookup_reply + */ + +static int _create_message(struct dlm_ls *ls, int mb_len, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret, + gfp_t allocation) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + char *mb; + + /* get_buffer gives us a message handle (mh) that we need to + pass into midcomms_commit and a message buffer (mb) that we + write our data into */ + + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); + if (!mh) + return -ENOBUFS; + + ms = (struct dlm_message *) mb; + + ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id); + ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + ms->m_header.h_length = cpu_to_le16(mb_len); + ms->m_header.h_cmd = DLM_MSG; + + ms->m_type = cpu_to_le32(mstype); + + *mh_ret = mh; + *ms_ret = ms; + return 0; +} + +static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle 
**mh_ret, + gfp_t allocation) +{ + int mb_len = sizeof(struct dlm_message); + + switch (mstype) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + case DLM_MSG_REMOVE: + mb_len += r->res_length; + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (lkb && lkb->lkb_lvbptr && (lkb->lkb_exflags & DLM_LKF_VALBLK)) + mb_len += r->res_ls->ls_lvblen; + break; + } + + return _create_message(r->res_ls, mb_len, to_nodeid, mstype, + ms_ret, mh_ret, allocation); +} + +/* further lowcomms enhancements or alternate implementations may make + the return value from this function useful at some point */ + +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms, + const void *name, int namelen) +{ + dlm_midcomms_commit_mhandle(mh, name, namelen); + return 0; +} + +static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + ms->m_nodeid = cpu_to_le32(lkb->lkb_nodeid); + ms->m_pid = cpu_to_le32(lkb->lkb_ownpid); + ms->m_lkid = cpu_to_le32(lkb->lkb_id); + ms->m_remid = cpu_to_le32(lkb->lkb_remid); + ms->m_exflags = cpu_to_le32(lkb->lkb_exflags); + ms->m_sbflags = cpu_to_le32(dlm_sbflags_val(lkb)); + ms->m_flags = cpu_to_le32(dlm_dflags_val(lkb)); + ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + ms->m_status = cpu_to_le32(lkb->lkb_status); + ms->m_grmode = cpu_to_le32(lkb->lkb_grmode); + ms->m_rqmode = cpu_to_le32(lkb->lkb_rqmode); + ms->m_hash = cpu_to_le32(r->res_hash); + + /* m_result and m_bastmode are set from function args, + not from lkb fields */ + + if (lkb->lkb_bastfn) + ms->m_asts |= cpu_to_le32(DLM_CB_BAST); + if (lkb->lkb_astfn) + ms->m_asts |= cpu_to_le32(DLM_CB_CAST); + + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ + + switch (ms->m_type) { + case cpu_to_le32(DLM_MSG_REQUEST): + case cpu_to_le32(DLM_MSG_LOOKUP): + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case cpu_to_le32(DLM_MSG_CONVERT): + case cpu_to_le32(DLM_MSG_UNLOCK): + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): + case cpu_to_le32(DLM_MSG_GRANT): + if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK)) + break; + memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + break; + } +} + +static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); + if (error) + return error; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms, r->res_name, r->res_length); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, msg_reply_type(mstype)); + return error; +} + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_REQUEST); +} + +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = send_common(r, lkb, DLM_MSG_CONVERT); + + /* down conversions go without a reply from the master */ + if (!error && down_conversion(lkb)) { + remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_local_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_local_ms.m_result = 0; + __receive_convert_reply(r, lkb, &r->res_ls->ls_local_ms, true); + } + + return error; +} + +/* FIXME: if this lkb is the only lock we hold 
on the rsb, then set + MASTER_UNCERTAIN to force the next request on the rsb to confirm + that the master is still correct. */ + +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_UNLOCK); +} + +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_CANCEL); +} + +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, + GFP_NOFS); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = 0; + + error = send_message(mh, ms, r->res_name, r->res_length); + out: + return error; +} + +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, + GFP_NOFS); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_bastmode = cpu_to_le32(mode); + + error = send_message(mh, ms, r->res_name, r->res_length); + out: + return error; +} + +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); + if (error) + return error; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, + GFP_NOFS); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms, r->res_name, r->res_length); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + return error; +} + +static int send_remove(struct dlm_rsb *r) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, + GFP_ATOMIC); + if (error) + goto out; + + memcpy(ms->m_extra, r->res_name, r->res_length); + ms->m_hash = cpu_to_le32(r->res_hash); + + error = send_message(mh, ms, r->res_name, r->res_length); + out: + return error; +} + +static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + int mstype, int rv) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = cpu_to_le32(to_dlm_errno(rv)); + + error = send_message(mh, ms, r->res_name, r->res_length); + out: + return error; +} + +static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv); +} + +static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv); +} + +static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv); +} + +static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); +} + +static int send_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms_in, int ret_nodeid, + int rv) +{ + struct dlm_rsb *r = &ls->ls_local_rsb; + struct dlm_message *ms; + struct dlm_mhandle *mh; + 
int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); + + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, + GFP_NOFS); + if (error) + goto out; + + ms->m_lkid = ms_in->m_lkid; + ms->m_result = cpu_to_le32(to_dlm_errno(rv)); + ms->m_nodeid = cpu_to_le32(ret_nodeid); + + error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in)); + out: + return error; +} + +/* which args we save from a received message depends heavily on the type + of message, unlike the send side where we can safely send everything about + the lkb for any type of message */ + +static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms) +{ + lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); + dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); + dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); +} + +static void receive_flags_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, + bool local) +{ + if (local) + return; + + dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); + dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); +} + +static int receive_extralen(const struct dlm_message *ms) +{ + return (le16_to_cpu(ms->m_header.h_length) - + sizeof(struct dlm_message)); +} + +static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + int len; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + if (!lkb->lkb_lvbptr) + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + len = receive_extralen(ms); + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + } + return 0; +} + +static void fake_bastfn(void *astparam, int mode) +{ + log_print("fake_bastfn should not be called"); +} + +static void fake_astfn(void *astparam) +{ + log_print("fake_astfn should not be called"); +} + +static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + lkb->lkb_ownpid = le32_to_cpu(ms->m_pid); + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode); + + lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } + + return 0; +} + +static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + return -EBUSY; + + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + + lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode); + lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq); + + return 0; +} + +static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + const struct dlm_message *ms) +{ + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + return 0; +} + +/* We fill in the local-lkb fields with the info that send_xxxx_reply() + uses to send a reply and that the remote end uses to process the reply. */ + +static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb = &ls->ls_local_lkb; + lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); +} + +/* This is called after the rsb is locked so that we can safely inspect + fields in the lkb. 
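+   A message is rejected, and then ignored by the caller, if it mixes
+   user and kernel locks, if its type does not fit the lkb's copy type
+   (converts, unlocks and cancels must target a master copy; replies,
+   grants and basts a process copy), or if the sending nodeid does not
+   match lkb_nodeid (request replies relax the nodeid check while
+   lkb_nodeid is still -1).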
*/ + +static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms) +{ + int from = le32_to_cpu(ms->m_header.h_nodeid); + int error = 0; + + /* currently mixing of user/kernel locks are not supported */ + if (ms->m_flags & cpu_to_le32(BIT(DLM_DFL_USER_BIT)) && + !test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { + log_error(lkb->lkb_resource->res_ls, + "got user dlm message for a kernel lock"); + error = -EINVAL; + goto out; + } + + switch (ms->m_type) { + case cpu_to_le32(DLM_MSG_CONVERT): + case cpu_to_le32(DLM_MSG_UNLOCK): + case cpu_to_le32(DLM_MSG_CANCEL): + if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): + case cpu_to_le32(DLM_MSG_UNLOCK_REPLY): + case cpu_to_le32(DLM_MSG_CANCEL_REPLY): + case cpu_to_le32(DLM_MSG_GRANT): + case cpu_to_le32(DLM_MSG_BAST): + if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): + if (!is_process_copy(lkb)) + error = -EINVAL; + else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + default: + error = -EINVAL; + } + +out: + if (error) + log_error(lkb->lkb_resource->res_ls, + "ignore invalid message %d from %d %x %x %x %d", + le32_to_cpu(ms->m_type), from, lkb->lkb_id, + lkb->lkb_remid, dlm_iflags_val(lkb), + lkb->lkb_nodeid); + return error; +} + +static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int from_nodeid; + int error, namelen = 0; + + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + + error = create_lkb(ls, &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); + error = receive_request_args(ls, lkb, ms); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + /* The dir node is the authority on whether we are the master + for this rsb or not, so if the master sends us a request, we should + recreate the rsb if we've destroyed it. This race happens when we + send a remove message to the dir node at the same time that the dir + node sends us a request for the rsb. */ + + namelen = receive_extralen(ms); + + error = find_rsb(ls, ms->m_extra, namelen, from_nodeid, + R_RECEIVE_REQUEST, &r); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + lock_rsb(r); + + if (r->res_master_nodeid != dlm_our_nodeid()) { + error = validate_master_nodeid(ls, r, from_nodeid); + if (error) { + unlock_rsb(r); + put_rsb(r); + __put_lkb(ls, lkb); + goto fail; + } + } + + attach_lkb(r, lkb); + error = do_request(r, lkb); + send_request_reply(r, lkb, error); + do_request_effects(r, lkb, error); + + unlock_rsb(r); + put_rsb(r); + + if (error == -EINPROGRESS) + error = 0; + if (error) + dlm_put_lkb(lkb); + return 0; + + fail: + /* TODO: instead of returning ENOTBLK, add the lkb to res_lookup + and do this receive_request again from process_lookup_list once + we get the lookup reply. This would avoid a many repeated + ENOTBLK request failures when the lookup reply designating us + as master is delayed. 
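+   Until then, the requesting node handles an -ENOTBLK reply in
+   receive_request_reply() by redoing _request_lock(), normally after
+   clearing its record of the master so that set_master() repeats the
+   lookup.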
*/ + + if (error != -ENOTBLK) { + log_limit(ls, "receive_request %x from %d %d", + le32_to_cpu(ms->m_lkid), from_nodeid, error); + } + + setup_local_lkb(ls, ms); + send_request_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); + return error; +} + +static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, reply = 1; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + goto fail; + + if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) { + log_error(ls, "receive_convert %x remid %x recover_seq %llu " + "remote %d %x", lkb->lkb_id, lkb->lkb_remid, + (unsigned long long)lkb->lkb_recover_seq, + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid)); + error = -ENOENT; + dlm_put_lkb(lkb); + goto fail; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_convert_args(ls, lkb, ms); + if (error) { + send_convert_reply(r, lkb, error); + goto out; + } + + reply = !down_conversion(lkb); + + error = do_convert(r, lkb); + if (reply) + send_convert_reply(r, lkb, error); + do_convert_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_local_lkb(ls, ms); + send_convert_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); + return error; +} + +static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + goto fail; + + if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) { + log_error(ls, "receive_unlock %x remid %x remote %d %x", + lkb->lkb_id, lkb->lkb_remid, + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid)); + error = -ENOENT; + dlm_put_lkb(lkb); + goto fail; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_unlock_args(ls, lkb, ms); + if (error) { + send_unlock_reply(r, lkb, error); + goto out; + } + + error = do_unlock(r, lkb); + send_unlock_reply(r, lkb, error); + do_unlock_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_local_lkb(ls, ms); + send_unlock_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); + return error; +} + +static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + error = do_cancel(r, lkb); + send_cancel_reply(r, lkb, error); + do_cancel_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_local_lkb(ls, ms); + send_cancel_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); + return error; +} + +static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags_reply(lkb, ms, false); + if (is_altmode(lkb)) + 
munge_altmode(lkb, ms); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode)); + lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms) +{ + int len, error, ret_nodeid, from_nodeid, our_nodeid; + + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + our_nodeid = dlm_our_nodeid(); + + len = receive_extralen(ms); + + error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0, + &ret_nodeid, NULL); + + /* Optimization: we're master so treat lookup as a request */ + if (!error && ret_nodeid == our_nodeid) { + receive_request(ls, ms); + return; + } + send_lookup_reply(ls, ms, ret_nodeid, error); +} + +static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) +{ + char name[DLM_RESNAME_MAXLEN+1]; + struct dlm_rsb *r; + uint32_t hash, b; + int rv, len, dir_nodeid, from_nodeid; + + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + + len = receive_extralen(ms); + + if (len > DLM_RESNAME_MAXLEN) { + log_error(ls, "receive_remove from %d bad len %d", + from_nodeid, len); + return; + } + + dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash)); + if (dir_nodeid != dlm_our_nodeid()) { + log_error(ls, "receive_remove from %d bad nodeid %d", + from_nodeid, dir_nodeid); + return; + } + + /* Look for name on rsbtbl.toss, if it's there, kill it. + If it's on rsbtbl.keep, it's being used, and we should ignore this + message. This is an expected race between the dir node sending a + request to the master node at the same time as the master node sends + a remove to the dir node. The resolution to that race is for the + dir node to ignore the remove message, and the master node to + recreate the master rsb when it gets a request from the dir node for + an rsb it doesn't have. 
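+   The code below therefore only frees an rsb that is found on the
+   toss list and whose recorded master matches the sending node;
+   anything else is logged and left alone.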
*/ + + memset(name, 0, sizeof(name)); + memcpy(name, ms->m_extra, len); + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[b].lock); + + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (rv) { + /* verify the rsb is on keep list per comment above */ + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (rv) { + /* should not happen */ + log_error(ls, "receive_remove from %d not found %s", + from_nodeid, name); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + if (r->res_master_nodeid != from_nodeid) { + /* should not happen */ + log_error(ls, "receive_remove keep from %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + log_debug(ls, "receive_remove from %d master %d first %x %s", + from_nodeid, r->res_master_nodeid, r->res_first_lkid, + name); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + if (r->res_master_nodeid != from_nodeid) { + log_error(ls, "receive_remove toss from %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + if (kref_put(&r->res_ref, kill_rsb)) { + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + spin_unlock(&ls->ls_rsbtbl[b].lock); + dlm_free_rsb(r); + } else { + log_error(ls, "receive_remove from %d rsb ref error", + from_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + } +} + +static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms) +{ + do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid)); +} + +static int receive_request_reply(struct dlm_ls *ls, + const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, mstype, result; + int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + mstype = lkb->lkb_wait_type; + error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); + if (error) { + log_error(ls, "receive_request_reply %x remote %d %x result %d", + lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid), + from_dlm_errno(le32_to_cpu(ms->m_result))); + dlm_dump_rsb(r); + goto out; + } + + /* Optimization: the dir node was also the master, so it took our + lookup as a request and sent request reply instead of lookup reply */ + if (mstype == DLM_MSG_LOOKUP) { + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + lkb->lkb_nodeid = from_nodeid; + } + + /* this is the value returned from do_request() on the master */ + result = from_dlm_errno(le32_to_cpu(ms->m_result)); + + switch (result) { + case -EAGAIN: + /* request would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + confirm_master(r, -EAGAIN); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + + case -EINPROGRESS: + case 0: + /* request was queued or granted on remote master */ + receive_flags_reply(lkb, ms, false); + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + if (result) { + add_lkb(r, lkb, DLM_LKSTS_WAITING); + } else { + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + } + confirm_master(r, result); + break; + + case -EBADR: + case -ENOTBLK: + /* find_rsb failed to find rsb or rsb wasn't master */ + log_limit(ls, "receive_request_reply %x from %d %d " + "master %d dir %d first %x 
%s", lkb->lkb_id, + from_nodeid, result, r->res_master_nodeid, + r->res_dir_nodeid, r->res_first_lkid, r->res_name); + + if (r->res_dir_nodeid != dlm_our_nodeid() && + r->res_master_nodeid != dlm_our_nodeid()) { + /* cause _request_lock->set_master->send_lookup */ + r->res_master_nodeid = 0; + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + } + + if (is_overlap(lkb)) { + /* we'll ignore error in cancel/unlock reply */ + queue_cast_overlap(r, lkb); + confirm_master(r, result); + unhold_lkb(lkb); /* undoes create_lkb() */ + } else { + _request_lock(r, lkb); + + if (r->res_master_nodeid == dlm_our_nodeid()) + confirm_master(r, 0); + } + break; + + default: + log_error(ls, "receive_request_reply %x error %d", + lkb->lkb_id, result); + } + + if ((result == 0 || result == -EINPROGRESS) && + test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) { + log_debug(ls, "receive_request_reply %x result %d unlock", + lkb->lkb_id, result); + clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + send_unlock(r, lkb); + } else if ((result == -EINPROGRESS) && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, + &lkb->lkb_iflags)) { + log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id); + clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); + send_cancel(r, lkb); + } else { + clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); + } + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) +{ + /* this is the value returned from do_convert() on the master */ + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { + case -EAGAIN: + /* convert would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + break; + + case -EDEADLK: + receive_flags_reply(lkb, ms, local); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -EDEADLK); + break; + + case -EINPROGRESS: + /* convert was queued on remote master */ + receive_flags_reply(lkb, ms, local); + if (is_demoted(lkb)) + munge_demoted(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + break; + + case 0: + /* convert was granted on remote master */ + receive_flags_reply(lkb, ms, local); + if (is_demoted(lkb)) + munge_demoted(lkb); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + break; + + default: + log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), + from_dlm_errno(le32_to_cpu(ms->m_result))); + dlm_print_rsb(r); + dlm_print_lkb(lkb); + } +} + +static void _receive_convert_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); + if (error) + goto out; + + __receive_convert_reply(r, lkb, ms, local); + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_convert_reply(struct dlm_ls *ls, + const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + _receive_convert_reply(lkb, ms, false); + dlm_put_lkb(lkb); + return 0; +} + +static void _receive_unlock_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) +{ + struct dlm_rsb *r = 
lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); + if (error) + goto out; + + /* this is the value returned from do_unlock() on the master */ + + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { + case -DLM_EUNLOCK: + receive_flags_reply(lkb, ms, local); + remove_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + break; + case -ENOENT: + break; + default: + log_error(r->res_ls, "receive_unlock_reply %x error %d", + lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result))); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_unlock_reply(struct dlm_ls *ls, + const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + _receive_unlock_reply(lkb, ms, false); + dlm_put_lkb(lkb); + return 0; +} + +static void _receive_cancel_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); + if (error) + goto out; + + /* this is the value returned from do_cancel() on the master */ + + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { + case -DLM_ECANCEL: + receive_flags_reply(lkb, ms, local); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_ECANCEL); + break; + case 0: + break; + default: + log_error(r->res_ls, "receive_cancel_reply %x error %d", + lkb->lkb_id, + from_dlm_errno(le32_to_cpu(ms->m_result))); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_cancel_reply(struct dlm_ls *ls, + const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); + if (error) + return error; + + _receive_cancel_reply(lkb, ms, false); + dlm_put_lkb(lkb); + return 0; +} + +static void receive_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, ret_nodeid; + int do_lookup_list = 0; + + error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb); + if (error) { + log_error(ls, "%s no lkid %x", __func__, + le32_to_cpu(ms->m_lkid)); + return; + } + + /* ms->m_result is the value returned by dlm_master_lookup on dir node + FIXME: will a non-zero error ever be returned? */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + if (error) + goto out; + + ret_nodeid = le32_to_cpu(ms->m_nodeid); + + /* We sometimes receive a request from the dir node for this + rsb before we've received the dir node's loookup_reply for it. + The request from the dir node implies we're the master, so we set + ourself as master in receive_request_reply, and verify here that + we are indeed the master. 
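+   A mismatch between the two is only logged; the ret_nodeid from the
+   reply still decides below whether we become master, repeat the
+   lookup, or record the remote master.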
*/ + + if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) { + /* This should never happen */ + log_error(ls, "receive_lookup_reply %x from %d ret %d " + "master %d dir %d our %d first %x %s", + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid), + ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid, + dlm_our_nodeid(), r->res_first_lkid, r->res_name); + } + + if (ret_nodeid == dlm_our_nodeid()) { + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = 0; + do_lookup_list = 1; + r->res_first_lkid = 0; + } else if (ret_nodeid == -1) { + /* the remote node doesn't believe it's the dir node */ + log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid", + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid)); + r->res_master_nodeid = 0; + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + } else { + /* set_master() will set lkb_nodeid from r */ + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = ret_nodeid; + } + + if (is_overlap(lkb)) { + log_debug(ls, "receive_lookup_reply %x unlock %x", + lkb->lkb_id, dlm_iflags_val(lkb)); + queue_cast_overlap(r, lkb); + unhold_lkb(lkb); /* undoes create_lkb() */ + goto out_list; + } + + _request_lock(r, lkb); + + out_list: + if (do_lookup_list) + process_lookup_list(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms, + uint32_t saved_seq) +{ + int error = 0, noent = 0; + + if (WARN_ON_ONCE(!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid)))) { + log_limit(ls, "receive %d from non-member %d %x %x %d", + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + from_dlm_errno(le32_to_cpu(ms->m_result))); + return; + } + + switch (ms->m_type) { + + /* messages sent to a master node */ + + case cpu_to_le32(DLM_MSG_REQUEST): + error = receive_request(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_CONVERT): + error = receive_convert(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_UNLOCK): + error = receive_unlock(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_CANCEL): + noent = 1; + error = receive_cancel(ls, ms); + break; + + /* messages sent from a master node (replies to above) */ + + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): + error = receive_request_reply(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): + error = receive_convert_reply(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_UNLOCK_REPLY): + error = receive_unlock_reply(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_CANCEL_REPLY): + error = receive_cancel_reply(ls, ms); + break; + + /* messages sent from a master node (only two types of async msg) */ + + case cpu_to_le32(DLM_MSG_GRANT): + noent = 1; + error = receive_grant(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_BAST): + noent = 1; + error = receive_bast(ls, ms); + break; + + /* messages sent to a dir node */ + + case cpu_to_le32(DLM_MSG_LOOKUP): + receive_lookup(ls, ms); + break; + + case cpu_to_le32(DLM_MSG_REMOVE): + receive_remove(ls, ms); + break; + + /* messages sent from a dir node (remove has no reply) */ + + case cpu_to_le32(DLM_MSG_LOOKUP_REPLY): + receive_lookup_reply(ls, ms); + break; + + /* other messages */ + + case cpu_to_le32(DLM_MSG_PURGE): + receive_purge(ls, ms); + break; + + default: + log_error(ls, "unknown message type %d", + le32_to_cpu(ms->m_type)); + } + + /* + * When checking for ENOENT, we're checking the result of + * find_lkb(m_remid): + * + * The lock id referenced in the message wasn't found. 
This may + * happen in normal usage for the async messages and cancel, so + * only use log_debug for them. + * + * Some errors are expected and normal. + */ + + if (error == -ENOENT && noent) { + log_debug(ls, "receive %d no %x remote %d %x saved_seq %u", + le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), saved_seq); + } else if (error == -ENOENT) { + log_error(ls, "receive %d no %x remote %d %x saved_seq %u", + le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), saved_seq); + + if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT)) + dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash)); + } + + if (error == -EINVAL) { + log_error(ls, "receive %d inval from %d lkid %x remid %x " + "saved_seq %u", + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + saved_seq); + } +} + +/* If the lockspace is in recovery mode (locking stopped), then normal + messages are saved on the requestqueue for processing after recovery is + done. When not in recovery mode, we wait for dlm_recoverd to drain saved + messages off the requestqueue before we process new ones. This occurs right + after recovery completes when we transition from saving all messages on + requestqueue, to processing all the saved messages, to processing new + messages as they arrive. */ + +static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms, + int nodeid) +{ + if (dlm_locking_stopped(ls)) { + /* If we were a member of this lockspace, left, and rejoined, + other nodes may still be sending us messages from the + lockspace generation before we left. */ + if (WARN_ON_ONCE(!ls->ls_generation)) { + log_limit(ls, "receive %d from %d ignore old gen", + le32_to_cpu(ms->m_type), nodeid); + return; + } + + dlm_add_requestqueue(ls, nodeid, ms); + } else { + dlm_wait_requestqueue(ls); + _receive_message(ls, ms, 0); + } +} + +/* This is called by dlm_recoverd to process messages that were saved on + the requestqueue. */ + +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, + uint32_t saved_seq) +{ + _receive_message(ls, ms, saved_seq); +} + +/* This is called by the midcomms layer when something is received for + the lockspace. It could be either a MSG (normal message sent as part of + standard locking activity) or an RCOM (recovery message sent as part of + lockspace recovery). 
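+   Both kinds carry the global lockspace id in the header, which is
+   used to find the lockspace before dispatching, roughly:
+
+     dlm_receive_buffer(p, nodeid)
+       -> dlm_receive_message(ls, &p->message, nodeid)   for DLM_MSG
+       -> dlm_receive_rcom(ls, &p->rcom, nodeid)         for DLM_RCOM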
*/ + +void dlm_receive_buffer(const union dlm_packet *p, int nodeid) +{ + const struct dlm_header *hd = &p->header; + struct dlm_ls *ls; + int type = 0; + + switch (hd->h_cmd) { + case DLM_MSG: + type = le32_to_cpu(p->message.m_type); + break; + case DLM_RCOM: + type = le32_to_cpu(p->rcom.rc_type); + break; + default: + log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid); + return; + } + + if (le32_to_cpu(hd->h_nodeid) != nodeid) { + log_print("invalid h_nodeid %d from %d lockspace %x", + le32_to_cpu(hd->h_nodeid), nodeid, + le32_to_cpu(hd->u.h_lockspace)); + return; + } + + ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace)); + if (!ls) { + if (dlm_config.ci_log_debug) { + printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " + "%u from %d cmd %d type %d\n", + le32_to_cpu(hd->u.h_lockspace), nodeid, + hd->h_cmd, type); + } + + if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) + dlm_send_ls_not_ready(nodeid, &p->rcom); + return; + } + + /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to + be inactive (in this ls) before transitioning to recovery mode */ + + down_read(&ls->ls_recv_active); + if (hd->h_cmd == DLM_MSG) + dlm_receive_message(ls, &p->message, nodeid); + else if (hd->h_cmd == DLM_RCOM) + dlm_receive_rcom(ls, &p->rcom, nodeid); + else + log_error(ls, "invalid h_cmd %d from %d lockspace %x", + hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace)); + up_read(&ls->ls_recv_active); + + dlm_put_lockspace(ls); +} + +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_local) +{ + if (middle_conversion(lkb)) { + hold_lkb(lkb); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_convert_reply(lkb, ms_local, true); + + /* Same special case as in receive_rcom_lock_args() */ + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT); + unhold_lkb(lkb); + + } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); + } + + /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down + conversions are async; there's no reply from the remote master */ +} + +/* A waiting lkb needs recovery if the master node has failed, or + the master node is changing (only when no directory is used) */ + +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, + int dir_nodeid) +{ + if (dlm_no_directory(ls)) + return 1; + + if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) + return 1; + + return 0; +} + +/* Recovery for locks that are waiting for replies from nodes that are now + gone. We can just complete unlocks and cancels by faking a reply from the + dead node. Requests and up-conversions we flag to be resent after + recovery. Down-conversions can just be completed with a fake reply like + unlocks. Conversions between PR and CW need special attention. 
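+   The fake replies are built in a locally allocated dlm_message
+   (ms_local below) and run through the normal _receive_*_reply()
+   paths with local set to true.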
*/ + +void dlm_recover_waiters_pre(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_local; + int wait_type, local_unlock_result, local_cancel_result; + int dir_nodeid; + + ms_local = kmalloc(sizeof(*ms_local), GFP_KERNEL); + if (!ms_local) + return; + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + + dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_wait_type, + lkb->lkb_resource->res_nodeid, + lkb->lkb_nodeid, + lkb->lkb_wait_nodeid, + dir_nodeid); + } + + /* all outstanding lookups, regardless of destination will be + resent after recovery is done */ + + if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) { + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); + continue; + } + + if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) + continue; + + wait_type = lkb->lkb_wait_type; + local_unlock_result = -DLM_EUNLOCK; + local_cancel_result = -DLM_ECANCEL; + + /* Main reply may have been received leaving a zero wait_type, + but a reply for the overlapping op may not have been + received. In that case we need to fake the appropriate + reply for the overlap op. */ + + if (!wait_type) { + if (is_overlap_cancel(lkb)) { + wait_type = DLM_MSG_CANCEL; + if (lkb->lkb_grmode == DLM_LOCK_IV) + local_cancel_result = 0; + } + if (is_overlap_unlock(lkb)) { + wait_type = DLM_MSG_UNLOCK; + if (lkb->lkb_grmode == DLM_LOCK_IV) + local_unlock_result = -ENOENT; + } + + log_debug(ls, "rwpre overlap %x %x %d %d %d", + lkb->lkb_id, dlm_iflags_val(lkb), wait_type, + local_cancel_result, local_unlock_result); + } + + switch (wait_type) { + + case DLM_MSG_REQUEST: + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); + break; + + case DLM_MSG_CONVERT: + recover_convert_waiter(ls, lkb, ms_local); + break; + + case DLM_MSG_UNLOCK: + hold_lkb(lkb); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(local_unlock_result)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_unlock_reply(lkb, ms_local, true); + dlm_put_lkb(lkb); + break; + + case DLM_MSG_CANCEL: + hold_lkb(lkb); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(local_cancel_result)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_cancel_reply(lkb, ms_local, true); + dlm_put_lkb(lkb); + break; + + default: + log_error(ls, "invalid lkb wait_type %d %d", + lkb->lkb_wait_type, wait_type); + } + schedule(); + } + mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_local); +} + +static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb = NULL, *iter; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) { + if (test_bit(DLM_IFL_RESEND_BIT, &iter->lkb_iflags)) { + hold_lkb(iter); + lkb = iter; + break; + } + } + mutex_unlock(&ls->ls_waiters_mutex); + + return lkb; +} + +/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the + master or dir-node for r. Processing the lkb may result in it being placed + back on waiters. 
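+   For each such lkb we first drop the stale wait state (wait_type,
+   wait_count, waiters list entry) and then either resend the
+   operation or, when an overlapping unlock/cancel was requested,
+   complete the unlock/cancel instead of resending.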
*/ + +/* We do this after normal locking has been enabled and any saved messages + (in requestqueue) have been processed. We should be confident that at + this point we won't get or process a reply to any of these waiting + operations. But, new ops may be coming in on the rsbs/locks here from + userspace or remotely. */ + +/* there may have been an overlap unlock/cancel prior to recovery or after + recovery. if before, the lkb may still have a pos wait_count; if after, the + overlap flag would just have been set and nothing new sent. we can be + confident here than any replies to either the initial op or overlap ops + prior to recovery have been received. */ + +int dlm_recover_waiters_post(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error = 0, mstype, err, oc, ou; + + while (1) { + if (dlm_locking_stopped(ls)) { + log_debug(ls, "recover_waiters_post aborted"); + error = -EINTR; + break; + } + + lkb = find_resend_waiter(ls); + if (!lkb) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + mstype = lkb->lkb_wait_type; + oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, + &lkb->lkb_iflags); + ou = test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, + &lkb->lkb_iflags); + err = 0; + + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " + "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype, + r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid, + dlm_dir_nodeid(r), oc, ou); + + /* At this point we assume that we won't get a reply to any + previous op or overlap op on this lock. First, do a big + remove_from_waiters() for all previous ops. */ + + clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); + lkb->lkb_wait_type = 0; + /* drop all wait_count references we still + * hold a reference for this iteration. + */ + while (!atomic_dec_and_test(&lkb->lkb_wait_count)) + unhold_lkb(lkb); + + mutex_lock(&ls->ls_waiters_mutex); + list_del_init(&lkb->lkb_wait_reply); + mutex_unlock(&ls->ls_waiters_mutex); + + if (oc || ou) { + /* do an unlock or cancel instead of resending */ + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + queue_cast(r, lkb, ou ? 
-DLM_EUNLOCK : + -DLM_ECANCEL); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + case DLM_MSG_CONVERT: + if (oc) { + queue_cast(r, lkb, -DLM_ECANCEL); + } else { + lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK; + _unlock_lock(r, lkb); + } + break; + default: + err = 1; + } + } else { + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + _request_lock(r, lkb); + if (is_master(r)) + confirm_master(r, 0); + break; + case DLM_MSG_CONVERT: + _convert_lock(r, lkb); + break; + default: + err = 1; + } + } + + if (err) { + log_error(ls, "waiter %x msg %d r_nodeid %d " + "dir_nodeid %d overlap %d %d", + lkb->lkb_id, mstype, r->res_nodeid, + dlm_dir_nodeid(r), oc, ou); + } + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + } + + return error; +} + +static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + /* don't purge lkbs we've added in recover_master_copy for + the current recovery seq */ + + if (lkb->lkb_recover_seq == ls->ls_recover_seq) + continue; + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged mstcpy lkb not released"); + } +} + +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + purge_mstcpy_list(ls, r, &r->res_grantqueue); + purge_mstcpy_list(ls, r, &r->res_convertqueue); + purge_mstcpy_list(ls, r, &r->res_waitqueue); +} + +static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list, + int nodeid_gone, unsigned int *count) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + if ((lkb->lkb_nodeid == nodeid_gone) || + dlm_is_removed(ls, lkb->lkb_nodeid)) { + + /* tell recover_lvb to invalidate the lvb + because a node holding EX/PW failed */ + if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && + (lkb->lkb_grmode >= DLM_LOCK_PW)) { + rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); + } + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged dead lkb not released"); + + rsb_set_flag(r, RSB_RECOVER_GRANT); + + (*count)++; + } + } +} + +/* Get rid of locks held by nodes that are gone. 
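+   Only master-copy lkbs on rsbs we master are purged.  Purging a lock
+   held in PW or EX also marks the rsb so recover_lvb invalidates the
+   LVB, and RSB_RECOVER_GRANT is set so dlm_recover_grant() can retry
+   the locks that remain.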
*/ + +void dlm_recover_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_member *memb; + int nodes_count = 0; + int nodeid_gone = 0; + unsigned int lkb_count = 0; + + /* cache one removed nodeid to optimize the common + case of a single node removed */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + nodes_count++; + nodeid_gone = memb->nodeid; + } + + if (!nodes_count) + return; + + down_write(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + hold_rsb(r); + lock_rsb(r); + if (is_master(r)) { + purge_dead_list(ls, r, &r->res_grantqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_convertqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_waitqueue, + nodeid_gone, &lkb_count); + } + unlock_rsb(r); + unhold_rsb(r); + cond_resched(); + } + up_write(&ls->ls_root_sem); + + if (lkb_count) + log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", + lkb_count, nodes_count); +} + +static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) +{ + struct rb_node *n; + struct dlm_rsb *r; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + + if (!rsb_flag(r, RSB_RECOVER_GRANT)) + continue; + if (!is_master(r)) { + rsb_clear_flag(r, RSB_RECOVER_GRANT); + continue; + } + hold_rsb(r); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return NULL; +} + +/* + * Attempt to grant locks on resources that we are the master of. + * Locks may have become grantable during recovery because locks + * from departed nodes have been purged (or not rebuilt), allowing + * previously blocked locks to now be granted. The subset of rsb's + * we are interested in are those with lkb's on either the convert or + * waiting queues. + * + * Simplest would be to go through each master rsb and check for non-empty + * convert or waiting queues, and attempt to grant on those rsbs. + * Checking the queues requires lock_rsb, though, for which we'd need + * to release the rsbtbl lock. This would make iterating through all + * rsb's very inefficient. So, we rely on earlier recovery routines + * to set RECOVER_GRANT on any rsb's that we should attempt to grant + * locks for. 
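+ *
+ * find_grant_rsb() below scans each hash bucket for rsbs with that
+ * flag set, and grant_pending_locks() is then run on each with the
+ * rsb locked.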
+ */ + +void dlm_recover_grant(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int bucket = 0; + unsigned int count = 0; + unsigned int rsb_count = 0; + unsigned int lkb_count = 0; + + while (1) { + r = find_grant_rsb(ls, bucket); + if (!r) { + if (bucket == ls->ls_rsbtbl_size - 1) + break; + bucket++; + continue; + } + rsb_count++; + count = 0; + lock_rsb(r); + /* the RECOVER_GRANT flag is checked in the grant path */ + grant_pending_locks(r, &count); + rsb_clear_flag(r, RSB_RECOVER_GRANT); + lkb_count += count; + confirm_master(r, 0); + unlock_rsb(r); + put_rsb(r); + cond_resched(); + } + + if (lkb_count) + log_rinfo(ls, "dlm_recover_grant %u locks on %u resources", + lkb_count, rsb_count); +} + +static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) + return lkb; + } + return NULL; +} + +static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid); + if (lkb) + return lkb; + return NULL; +} + +/* needs at least dlm_rcom + rcom_lock */ +static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_rsb *r, const struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + + lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); + lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); + lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); + lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); + dlm_set_dflags_val(lkb, le32_to_cpu(rl->rl_flags)); + set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); + lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq); + lkb->lkb_rqmode = rl->rl_rqmode; + lkb->lkb_grmode = rl->rl_grmode; + /* don't set lkb_status because add_lkb wants to itself */ + + lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + int lvblen = le16_to_cpu(rc->rc_header.h_length) - + sizeof(struct dlm_rcom) - sizeof(struct rcom_lock); + if (lvblen > ls->ls_lvblen) + return -EINVAL; + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen); + } + + /* Conversions between PR and CW (middle modes) need special handling. + The real granted mode of these converting locks cannot be determined + until all locks have been rebuilt on the rsb (recover_conversion) */ + + if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) && + middle_conversion(lkb)) { + rl->rl_status = DLM_LKSTS_CONVERT; + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(r, RSB_RECOVER_CONVERT); + } + + return 0; +} + +/* This lkb may have been recovered in a previous aborted recovery so we need + to check if the rsb already has an lkb with the given remote nodeid/lkid. + If so we just send back a standard reply. If not, we create a new lkb with + the given values and send back our lkid. We send back our lkid by sending + back the rcom_lock struct we got but with the remid field filled in. 
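+   The duplicate case is reported as -EEXIST, which the sending node
+   treats like success and only uses to pick up the returned remid.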
*/ + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + uint32_t remid = 0; + int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); + int error; + + /* init rl_remid with rcom lock rl_remid */ + *rl_remid = rl->rl_remid; + + if (rl->rl_parent_lkid) { + error = -EOPNOTSUPP; + goto out; + } + + remid = le32_to_cpu(rl->rl_lkid); + + /* In general we expect the rsb returned to be R_MASTER, but we don't + have to require it. Recovery of masters on one node can overlap + recovery of locks on another node, so one node can send us MSTCPY + locks before we've made ourselves master of this rsb. We can still + add new MSTCPY locks that we receive here without any harm; when + we make ourselves master, dlm_recover_masters() won't touch the + MSTCPY locks we've received early. */ + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), + from_nodeid, R_RECEIVE_RECOVER, &r); + if (error) + goto out; + + lock_rsb(r); + + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", + from_nodeid, remid); + error = -EBADR; + goto out_unlock; + } + + lkb = search_remid(r, from_nodeid, remid); + if (lkb) { + error = -EEXIST; + goto out_remid; + } + + error = create_lkb(ls, &lkb); + if (error) + goto out_unlock; + + error = receive_rcom_lock_args(ls, lkb, r, rc); + if (error) { + __put_lkb(ls, lkb); + goto out_unlock; + } + + attach_lkb(r, lkb); + add_lkb(r, lkb, rl->rl_status); + ls->ls_recover_locks_in++; + + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); + + out_remid: + /* this is the new value returned to the lock holder for + saving in its process-copy lkb */ + *rl_remid = cpu_to_le32(lkb->lkb_id); + + lkb->lkb_recover_seq = ls->ls_recover_seq; + + out_unlock: + unlock_rsb(r); + put_rsb(r); + out: + if (error && error != -EEXIST) + log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", + from_nodeid, remid, error); + *rl_result = cpu_to_le32(error); + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + uint32_t lkid, remid; + int error, result; + + lkid = le32_to_cpu(rl->rl_lkid); + remid = le32_to_cpu(rl->rl_remid); + result = le32_to_cpu(rl->rl_result); + + error = find_lkb(ls, lkid, &lkb); + if (error) { + log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); + return error; + } + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (!is_process_copy(lkb)) { + log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); + dlm_dump_rsb(r); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return -EINVAL; + } + + switch (result) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. 
*/ + + log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); + + dlm_send_rcom_lock(r, lkb, seq); + goto out; + case -EEXIST: + case 0: + lkb->lkb_remid = remid; + break; + default: + log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); + } + + /* an ack for dlm_recover_locks() which waits for replies from + all the locks it sends to new masters */ + dlm_recovered_lock(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + + return 0; +} + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + bool do_put = true; + int error; + + dlm_lock_recovery(ls); + + error = create_lkb(ls, &lkb); + if (error) { + kfree(ua); + goto out; + } + + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); + + if (flags & DLM_LKF_VALBLK) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + kfree(ua); + error = -ENOMEM; + goto out_put; + } + } + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); + if (error) { + kfree(ua->lksb.sb_lvbptr); + ua->lksb.sb_lvbptr = NULL; + kfree(ua); + goto out_put; + } + + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_DFL_USER_BIT is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + set_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags); + error = request_lock(ls, lkb, name, namelen, &args); + + switch (error) { + case 0: + break; + case -EINPROGRESS: + error = 0; + break; + case -EAGAIN: + error = 0; + fallthrough; + default: + goto out_put; + } + + /* add this new lkb to the per-process list of locks */ + spin_lock(&ua->proc->locks_spin); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + do_put = false; + out_put: + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false); + if (do_put) + __put_lkb(ls, lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags); + + /* user can change the params on its lock when it converts it, or + add an lvb that didn't exist before */ + + ua = lkb->lkb_ua; + + if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + error = -ENOMEM; + goto out_put; + } + } + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); + if (error) + goto out_put; + + error = convert_lock(ls, lkb, &args); + + if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) + error = 0; + out_put: + trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false); + 
dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +/* + * The caller asks for an orphan lock on a given resource with a given mode. + * If a matching lock exists, it's moved to the owner's list of locks and + * the lkid is returned. + */ + +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + uint32_t *lkid) +{ + struct dlm_lkb *lkb = NULL, *iter; + struct dlm_user_args *ua; + int found_other_mode = 0; + int rv = 0; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) { + if (iter->lkb_resource->res_length != namelen) + continue; + if (memcmp(iter->lkb_resource->res_name, name, namelen)) + continue; + if (iter->lkb_grmode != mode) { + found_other_mode = 1; + continue; + } + + lkb = iter; + list_del_init(&iter->lkb_ownqueue); + clear_bit(DLM_DFL_ORPHAN_BIT, &iter->lkb_dflags); + *lkid = iter->lkb_id; + break; + } + mutex_unlock(&ls->ls_orphans_mutex); + + if (!lkb && found_other_mode) { + rv = -EAGAIN; + goto out; + } + + if (!lkb) { + rv = -ENOENT; + goto out; + } + + lkb->lkb_exflags = flags; + lkb->lkb_ownpid = (int) current->pid; + + ua = lkb->lkb_ua; + + ua->proc = ua_tmp->proc; + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + /* + * The lkb reference from the ls_orphans list was not + * removed above, and is now considered the reference + * for the proc locks list. + */ + + spin_lock(&ua->proc->locks_spin); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + kfree(ua_tmp); + return rv; +} + +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + trace_dlm_unlock_start(ls, lkb, flags); + + ua = lkb->lkb_ua; + + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK)) + error = 0; + if (error) + goto out_put; + + spin_lock(&ua->proc->locks_spin); + /* dlm_user_add_cb() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); + spin_unlock(&ua->proc->locks_spin); + out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + trace_dlm_unlock_start(ls, lkb, flags); + + ua = lkb->lkb_ua; + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + 
goto out_put; + + error = cancel_lock(ls, lkb, &args); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + struct dlm_rsb *r; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + trace_dlm_unlock_start(ls, lkb, flags); + + ua = lkb->lkb_ua; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, &args); + if (error) + goto out_r; + set_bit(DLM_IFL_DEADLOCK_CANCEL_BIT, &lkb->lkb_iflags); + + error = _cancel_lock(r, lkb); + out_r: + unlock_rsb(r); + put_rsb(r); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + +/* lkb's that are removed from the waiters list by revert are just left on the + orphans list with the granted orphan locks, to be freed by purge */ + +static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + hold_lkb(lkb); /* reference for the ls_orphans list */ + mutex_lock(&ls->ls_orphans_mutex); + list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); + mutex_unlock(&ls->ls_orphans_mutex); + + set_unlock_args(0, lkb->lkb_ua, &args); + + error = cancel_lock(ls, lkb, &args); + if (error == -DLM_ECANCEL) + error = 0; + return error; +} + +/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't + granted. Regardless of what rsb queue the lock is on, it's removed and + freed. The IVVALBLK flag causes the lvb on the resource to be invalidated + if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ + +static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, + lkb->lkb_ua, &args); + + error = unlock_lock(ls, lkb, &args); + if (error == -DLM_EUNLOCK) + error = 0; + return error; +} + +/* We have to release clear_proc_locks mutex before calling unlock_proc_lock() + (which does lock_rsb) due to deadlock with receiving a message that does + lock_rsb followed by dlm_user_add_cb() */ + +static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, + struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb = NULL; + + spin_lock(&ls->ls_clear_proc_locks); + if (list_empty(&proc->locks)) + goto out; + + lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + set_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags); + else + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); + out: + spin_unlock(&ls->ls_clear_proc_locks); + return lkb; +} + +/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which + 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, + which we clear here. 
*/ + +/* proc CLOSING flag is set so no more device_reads should look at proc->asts + list, and no more device_writes should add lkb's to proc->locks list; so we + shouldn't need to take asts_spin or locks_spin here. this assumes that + device reads/writes/closes are serialized -- FIXME: we may need to serialize + them ourself. */ + +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + dlm_lock_recovery(ls); + + while (1) { + lkb = del_proc_lock(ls, proc); + if (!lkb) + break; + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + orphan_proc_lock(ls, lkb); + else + unlock_proc_lock(ls, lkb); + + /* this removes the reference for the proc->locks list + added by dlm_user_request, it may result in the lkb + being freed */ + + dlm_put_lkb(lkb); + } + + spin_lock(&ls->ls_clear_proc_locks); + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { + dlm_purge_lkb_callbacks(lkb); + list_del_init(&lkb->lkb_cb_list); + dlm_put_lkb(lkb); + } + + spin_unlock(&ls->ls_clear_proc_locks); + dlm_unlock_recovery(ls); +} + +static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + while (1) { + lkb = NULL; + spin_lock(&proc->locks_spin); + if (!list_empty(&proc->locks)) { + lkb = list_entry(proc->locks.next, struct dlm_lkb, + lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + } + spin_unlock(&proc->locks_spin); + + if (!lkb) + break; + + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); + unlock_proc_lock(ls, lkb); + dlm_put_lkb(lkb); /* ref from proc->locks list */ + } + + spin_lock(&proc->locks_spin); + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + + spin_lock(&proc->asts_spin); + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { + dlm_purge_lkb_callbacks(lkb); + list_del_init(&lkb->lkb_cb_list); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->asts_spin); +} + +/* pid of 0 means purge all orphans */ + +static void do_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_lkb *lkb, *safe; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) { + if (pid && lkb->lkb_ownpid != pid) + continue; + unlock_proc_lock(ls, lkb); + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_orphans_mutex); +} + +static int send_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error; + + error = _create_message(ls, sizeof(struct dlm_message), nodeid, + DLM_MSG_PURGE, &ms, &mh, GFP_NOFS); + if (error) + return error; + ms->m_nodeid = cpu_to_le32(nodeid); + ms->m_pid = cpu_to_le32(pid); + + return send_message(mh, ms, NULL, 0); +} + +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid) +{ + int error = 0; + + if (nodeid && (nodeid != dlm_our_nodeid())) { + error = send_purge(ls, nodeid, pid); + } else { + dlm_lock_recovery(ls); + if (pid == current->pid) + purge_proc_locks(ls, proc); + else + do_purge(ls, nodeid, pid); + dlm_unlock_recovery(ls); + } + return error; +} + +/* debug functionality */ +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t 
lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_dflags, int lkb_status) +{ + struct dlm_lksb *lksb; + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + /* we currently can't set a valid user lock */ + if (lkb_dflags & BIT(DLM_DFL_USER_BIT)) + return -EOPNOTSUPP; + + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) + return -ENOMEM; + + error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1); + if (error) { + kfree(lksb); + return error; + } + + dlm_set_dflags_val(lkb, lkb_dflags); + lkb->lkb_nodeid = lkb_nodeid; + lkb->lkb_lksb = lksb; + /* user specific pointer, just don't have it NULL for kernel locks */ + if (~lkb_dflags & BIT(DLM_DFL_USER_BIT)) + lkb->lkb_astparam = (void *)0xDEADBEEF; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) { + kfree(lksb); + __put_lkb(ls, lkb); + return error; + } + + lock_rsb(r); + attach_lkb(r, lkb); + add_lkb(r, lkb, lkb_status); + unlock_rsb(r); + put_rsb(r); + + return 0; +} + +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, lkb_id, &lkb); + if (error) + return error; + + error = add_to_waiters(lkb, mstype, to_nodeid); + dlm_put_lkb(lkb); + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h new file mode 100644 index 0000000000..b54e2cbbe6 --- /dev/null +++ b/fs/dlm/lock.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCK_DOT_H__ +#define __LOCK_DOT_H__ + +void dlm_dump_rsb(struct dlm_rsb *r); +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len); +void dlm_print_lkb(struct dlm_lkb *lkb); +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, + uint32_t saved_seq); +void dlm_receive_buffer(const union dlm_packet *p, int nodeid); +int dlm_modes_compat(int mode1, int mode2); +void dlm_put_rsb(struct dlm_rsb *r); +void dlm_hold_rsb(struct dlm_rsb *r); +int dlm_put_lkb(struct dlm_lkb *lkb); +void dlm_scan_rsbs(struct dlm_ls *ls); +int dlm_lock_recovery_try(struct dlm_ls *ls); +void dlm_unlock_recovery(struct dlm_ls *ls); + +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result); + +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, + struct dlm_rsb **r_ret); + +void dlm_recover_purge(struct dlm_ls *ls); +void dlm_purge_mstcpy_locks(struct dlm_rsb *r); +void dlm_recover_grant(struct dlm_ls *ls); +int dlm_recover_waiters_post(struct dlm_ls *ls); +void dlm_recover_waiters_pre(struct dlm_ls *ls); +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result); +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq); + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + 
int mode, uint32_t flags, void *name, unsigned int namelen, + uint32_t *lkid); +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in); +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid); +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid); +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_flags, int lkb_status); +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid); + +static inline int is_master(struct dlm_rsb *r) +{ + return !r->res_nodeid; +} + +static inline void lock_rsb(struct dlm_rsb *r) +{ + mutex_lock(&r->res_mutex); +} + +static inline void unlock_rsb(struct dlm_rsb *r) +{ + mutex_unlock(&r->res_mutex); +} + +#endif + diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c new file mode 100644 index 0000000000..0455dddb07 --- /dev/null +++ b/fs/dlm/lockspace.c @@ -0,0 +1,937 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include <linux/module.h> + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "dir.h" +#include "midcomms.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "recover.h" +#include "requestqueue.h" +#include "user.h" +#include "ast.h" + +static int ls_count; +static struct mutex ls_lock; +static struct list_head lslist; +static spinlock_t lslist_lock; +static struct task_struct * scand_task; + + +static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int n; + int rc = kstrtoint(buf, 0, &n); + + if (rc) + return rc; + ls = dlm_find_lockspace_local(ls->ls_local_handle); + if (!ls) + return -EINVAL; + + switch (n) { + case 0: + dlm_ls_stop(ls); + break; + case 1: + dlm_ls_start(ls); + break; + default: + ret = -EINVAL; + } + dlm_put_lockspace(ls); + return ret; +} + +static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int rc = kstrtoint(buf, 0, &ls->ls_uevent_result); + + if (rc) + return rc; + set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); + wake_up(&ls->ls_uevent_wait); + return len; +} + +static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id); +} + +static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int rc = kstrtouint(buf, 0, &ls->ls_global_id); + + if (rc) + return rc; + return len; +} + +static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls)); +} + +static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int val; + int rc = kstrtoint(buf, 0, &val); + + if (rc) + return rc; + if (val == 1) + set_bit(LSFL_NODIR, &ls->ls_flags); + return len; +} + +static 
ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) +{ + uint32_t status = dlm_recover_status(ls); + return snprintf(buf, PAGE_SIZE, "%x\n", status); +} + +static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid); +} + +struct dlm_attr { + struct attribute attr; + ssize_t (*show)(struct dlm_ls *, char *); + ssize_t (*store)(struct dlm_ls *, const char *, size_t); +}; + +static struct dlm_attr dlm_attr_control = { + .attr = {.name = "control", .mode = S_IWUSR}, + .store = dlm_control_store +}; + +static struct dlm_attr dlm_attr_event = { + .attr = {.name = "event_done", .mode = S_IWUSR}, + .store = dlm_event_store +}; + +static struct dlm_attr dlm_attr_id = { + .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_id_show, + .store = dlm_id_store +}; + +static struct dlm_attr dlm_attr_nodir = { + .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_nodir_show, + .store = dlm_nodir_store +}; + +static struct dlm_attr dlm_attr_recover_status = { + .attr = {.name = "recover_status", .mode = S_IRUGO}, + .show = dlm_recover_status_show +}; + +static struct dlm_attr dlm_attr_recover_nodeid = { + .attr = {.name = "recover_nodeid", .mode = S_IRUGO}, + .show = dlm_recover_nodeid_show +}; + +static struct attribute *dlm_attrs[] = { + &dlm_attr_control.attr, + &dlm_attr_event.attr, + &dlm_attr_id.attr, + &dlm_attr_nodir.attr, + &dlm_attr_recover_status.attr, + &dlm_attr_recover_nodeid.attr, + NULL, +}; +ATTRIBUTE_GROUPS(dlm); + +static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static void lockspace_kobj_release(struct kobject *k) +{ + struct dlm_ls *ls = container_of(k, struct dlm_ls, ls_kobj); + kfree(ls); +} + +static const struct sysfs_ops dlm_attr_ops = { + .show = dlm_attr_show, + .store = dlm_attr_store, +}; + +static struct kobj_type dlm_ktype = { + .default_groups = dlm_groups, + .sysfs_ops = &dlm_attr_ops, + .release = lockspace_kobj_release, +}; + +static struct kset *dlm_kset; + +static int do_uevent(struct dlm_ls *ls, int in) +{ + if (in) + kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); + else + kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + + log_rinfo(ls, "%s the lockspace group...", in ? 
"joining" : "leaving"); + + /* dlm_controld will see the uevent, do the necessary group management + and then write to sysfs to wake us */ + + wait_event(ls->ls_uevent_wait, + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + + log_rinfo(ls, "group event done %d", ls->ls_uevent_result); + + return ls->ls_uevent_result; +} + +static int dlm_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + + add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); + return 0; +} + +static const struct kset_uevent_ops dlm_uevent_ops = { + .uevent = dlm_uevent, +}; + +int __init dlm_lockspace_init(void) +{ + ls_count = 0; + mutex_init(&ls_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); + + dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj); + if (!dlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __func__); + return -ENOMEM; + } + return 0; +} + +void dlm_lockspace_exit(void) +{ + kset_unregister(dlm_kset); +} + +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + +static int dlm_scand(void *data) +{ + struct dlm_ls *ls; + + while (!kthread_should_stop()) { + ls = find_ls_to_scan(); + if (ls) { + if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; + dlm_scan_rsbs(ls); + dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; + } + continue; + } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); + } + return 0; +} + +static int dlm_scand_start(void) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_scand, NULL, "dlm_scand"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + scand_task = p; + return error; +} + +static void dlm_scand_stop(void) +{ + kthread_stop(scand_task); +} + +struct dlm_ls *dlm_find_lockspace_global(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + atomic_inc(&ls->ls_count); + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_handle == lockspace) { + atomic_inc(&ls->ls_count); + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_device(int minor) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_device.minor == minor) { + atomic_inc(&ls->ls_count); + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +void dlm_put_lockspace(struct dlm_ls *ls) +{ + if (atomic_dec_and_test(&ls->ls_count)) + wake_up(&ls->ls_count_wait); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ +retry: + wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0); + + spin_lock(&lslist_lock); + if (atomic_read(&ls->ls_count) != 0) { + spin_unlock(&lslist_lock); + goto retry; + } + + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); +} + +static int threads_start(void) +{ + int error; + + /* Thread for sending/receiving messages for all 
lockspace's */ + error = dlm_midcomms_start(); + if (error) { + log_print("cannot start dlm midcomms %d", error); + goto fail; + } + + error = dlm_scand_start(); + if (error) { + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; + } + + return 0; + + midcomms_fail: + dlm_midcomms_stop(); + fail: + return error; +} + +static int new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) +{ + struct dlm_ls *ls; + int i, size, error; + int do_unreg = 0; + int namelen = strlen(name); + + if (namelen > DLM_LOCKSPACE_LEN || namelen == 0) + return -EINVAL; + + if (lvblen % 8) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (!dlm_user_daemon_available()) { + log_print("dlm user daemon not available"); + error = -EUNATCH; + goto out; + } + + if (ops && ops_result) { + if (!dlm_config.ci_recover_callbacks) + *ops_result = -EOPNOTSUPP; + else + *ops_result = 0; + } + + if (!cluster) + log_print("dlm cluster name '%s' is being used without an application provided cluster name", + dlm_config.ci_cluster_name); + + if (dlm_config.ci_recover_callbacks && cluster && + strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { + log_print("dlm cluster name '%s' does not match " + "the application cluster name '%s'", + dlm_config.ci_cluster_name, cluster); + error = -EBADR; + goto out; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; + *lockspace = ls; + error = 1; + break; + } + spin_unlock(&lslist_lock); + + if (error) + goto out; + + error = -ENOMEM; + + ls = kzalloc(sizeof(*ls), GFP_NOFS); + if (!ls) + goto out; + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + ls->ls_lvblen = lvblen; + atomic_set(&ls->ls_count, 0); + init_waitqueue_head(&ls->ls_count_wait); + ls->ls_flags = 0; + ls->ls_scan_time = jiffies; + + if (ops && dlm_config.ci_recover_callbacks) { + ls->ls_ops = ops; + ls->ls_ops_arg = ops_arg; + } + + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); + + size = READ_ONCE(dlm_config.ci_rsbtbl_size); + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable))); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + ls->ls_rsbtbl[i].keep.rb_node = NULL; + ls->ls_rsbtbl[i].toss.rb_node = NULL; + spin_lock_init(&ls->ls_rsbtbl[i].lock); + } + + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { + ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, + GFP_KERNEL); + if (!ls->ls_remove_names[i]) + goto out_rsbtbl; + } + + idr_init(&ls->ls_lkbidr); + spin_lock_init(&ls->ls_lkbidr_spin); + + INIT_LIST_HEAD(&ls->ls_waiters); + mutex_init(&ls->ls_waiters_mutex); + INIT_LIST_HEAD(&ls->ls_orphans); + mutex_init(&ls->ls_orphans_mutex); + + INIT_LIST_HEAD(&ls->ls_new_rsb); + spin_lock_init(&ls->ls_new_rsb_spin); + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_low_nodeid = 0; + ls->ls_total_weight = 0; + ls->ls_node_array = NULL; + + memset(&ls->ls_local_rsb, 0, sizeof(struct dlm_rsb)); + 
ls->ls_local_rsb.res_ls = ls; + + ls->ls_debug_rsb_dentry = NULL; + ls->ls_debug_waiters_dentry = NULL; + + init_waitqueue_head(&ls->ls_uevent_wait); + ls->ls_uevent_result = 0; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; + + spin_lock_init(&ls->ls_cb_lock); + INIT_LIST_HEAD(&ls->ls_cb_delay); + + ls->ls_recoverd_task = NULL; + mutex_init(&ls->ls_recoverd_active); + spin_lock_init(&ls->ls_recover_lock); + spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); + ls->ls_recover_status = 0; + ls->ls_recover_seq = get_random_u64(); + ls->ls_recover_args = NULL; + init_rwsem(&ls->ls_in_recovery); + init_rwsem(&ls->ls_recv_active); + INIT_LIST_HEAD(&ls->ls_requestqueue); + atomic_set(&ls->ls_requestqueue_cnt, 0); + init_waitqueue_head(&ls->ls_requestqueue_wait); + mutex_init(&ls->ls_requestqueue_mutex); + spin_lock_init(&ls->ls_clear_proc_locks); + + /* Due backwards compatibility with 3.1 we need to use maximum + * possible dlm message size to be sure the message will fit and + * not having out of bounds issues. However on sending side 3.2 + * might send less. + */ + ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS); + if (!ls->ls_recover_buf) + goto out_lkbidr; + + ls->ls_slot = 0; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_slots = NULL; + + INIT_LIST_HEAD(&ls->ls_recover_list); + spin_lock_init(&ls->ls_recover_list_lock); + idr_init(&ls->ls_recover_idr); + spin_lock_init(&ls->ls_recover_idr_lock); + ls->ls_recover_list_count = 0; + ls->ls_local_handle = ls; + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_root_list); + init_rwsem(&ls->ls_root_sem); + + spin_lock(&lslist_lock); + ls->ls_create_count = 1; + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + if (flags & DLM_LSFL_FS) { + error = dlm_callback_start(ls); + if (error) { + log_error(ls, "can't start dlm_callback %d", error); + goto out_delist; + } + } + + init_waitqueue_head(&ls->ls_recover_lock_wait); + + /* + * Once started, dlm_recoverd first looks for ls in lslist, then + * initializes ls_in_recovery as locked in "down" mode. We need + * to wait for the wakeup from dlm_recoverd because in_recovery + * has to start out in down mode. + */ + + error = dlm_recoverd_start(ls); + if (error) { + log_error(ls, "can't start dlm_recoverd %d", error); + goto out_callback; + } + + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + + ls->ls_kobj.kset = dlm_kset; + error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, + "%s", ls->ls_name); + if (error) + goto out_recoverd; + kobject_uevent(&ls->ls_kobj, KOBJ_ADD); + + /* This uevent triggers dlm_controld in userspace to add us to the + group of nodes that are members of this lockspace (managed by the + cluster infrastructure.) Once it's done that, it tells us who the + current lockspace members are (via configfs) and then tells the + lockspace to start running (via sysfs) in dlm_ls_start(). 
*/ + + error = do_uevent(ls, 1); + if (error) + goto out_recoverd; + + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; + if (error) + goto out_members; + + dlm_create_debug_file(ls); + + log_rinfo(ls, "join complete"); + *lockspace = ls; + return 0; + + out_members: + do_uevent(ls, 0); + dlm_clear_members(ls); + kfree(ls->ls_node_array); + out_recoverd: + dlm_recoverd_stop(ls); + out_callback: + dlm_callback_stop(ls); + out_delist: + spin_lock(&lslist_lock); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + idr_destroy(&ls->ls_recover_idr); + kfree(ls->ls_recover_buf); + out_lkbidr: + idr_destroy(&ls->ls_lkbidr); + out_rsbtbl: + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) + kfree(ls->ls_remove_names[i]); + vfree(ls->ls_rsbtbl); + out_lsfree: + if (do_unreg) + kobject_put(&ls->ls_kobj); + else + kfree(ls); + out: + module_put(THIS_MODULE); + return error; +} + +static int __dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + int error = 0; + + mutex_lock(&ls_lock); + if (!ls_count) + error = threads_start(); + if (error) + goto out; + + error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, + ops_result, lockspace); + if (!error) + ls_count++; + if (error > 0) + error = 0; + if (!ls_count) { + dlm_scand_stop(); + dlm_midcomms_shutdown(); + dlm_midcomms_stop(); + } + out: + mutex_unlock(&ls_lock); + return error; +} + +int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags, + int lvblen, const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen, + ops, ops_arg, ops_result, lockspace); +} + +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags, lvblen, ops, + ops_arg, ops_result, lockspace); +} + +static int lkb_idr_is_local(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV; +} + +static int lkb_idr_is_any(int id, void *p, void *data) +{ + return 1; +} + +static int lkb_idr_free(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (lkb->lkb_lvbptr && test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + return 0; +} + +/* NOTE: We check the lkbidr here rather than the resource table. 
+ This is because there may be LKBs queued as ASTs that have been unlinked + from their RSBs and are pending deletion once the AST has been delivered */ + +static int lockspace_busy(struct dlm_ls *ls, int force) +{ + int rv; + + spin_lock(&ls->ls_lkbidr_spin); + if (force == 0) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls); + } else if (force == 1) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls); + } else { + rv = 0; + } + spin_unlock(&ls->ls_lkbidr_spin); + return rv; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_rsb *rsb; + struct rb_node *n; + int i, busy, rv; + + busy = lockspace_busy(ls, force); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy) { + rv = -EBUSY; + } else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); + + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + if (ls_count == 1) + dlm_midcomms_version_wait(); + + dlm_device_deregister(ls); + + if (force < 3 && dlm_user_daemon_available()) + do_uevent(ls, 0); + + dlm_recoverd_stop(ls); + + if (ls_count == 1) { + dlm_scand_stop(); + dlm_clear_members(ls); + dlm_midcomms_shutdown(); + } + + dlm_callback_stop(ls); + + remove_lockspace(ls); + + dlm_delete_debug_file(ls); + + idr_destroy(&ls->ls_recover_idr); + kfree(ls->ls_recover_buf); + + /* + * Free all lkb's in idr + */ + + idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); + idr_destroy(&ls->ls_lkbidr); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].keep); + dlm_free_rsb(rsb); + } + + while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(rsb); + } + } + + vfree(ls->ls_rsbtbl); + + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) + kfree(ls->ls_remove_names[i]); + + while (!list_empty(&ls->ls_new_rsb)) { + rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + + /* + * Free structures on any other lists + */ + + dlm_purge_requestqueue(ls); + kfree(ls->ls_recover_args); + dlm_clear_members(ls); + dlm_clear_members_gone(ls); + kfree(ls->ls_node_array); + log_rinfo(ls, "release_lockspace final free"); + kobject_put(&ls->ls_kobj); + /* The ls structure will be freed when the kobject is done with */ + + module_put(THIS_MODULE); + return 0; +} + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We free everything we're managing for this lockspace. + * Remaining nodes will go through the recovery process as if we'd died. The + * lockspace must continue to function as usual, participating in recoveries, + * until this returns. 
+ * + * Force has 4 possible values: + * 0 - don't destroy lockspace if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + dlm_put_lockspace(ls); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + if (!ls_count) + dlm_midcomms_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + int count; + + restart: + count = 0; + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { + count++; + continue; + } + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); + + if (count) + log_print("dlm user daemon left %d lockspaces", count); +} + diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h new file mode 100644 index 0000000000..47ebd44119 --- /dev/null +++ b/fs/dlm/lockspace.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +/* DLM_LSFL_FS + * The lockspace user is in the kernel (i.e. filesystem). Enables + * direct bast/cast callbacks. + * + * internal lockspace flag - will be removed in future + */ +#define DLM_LSFL_FS 0x00000004 + +int dlm_lockspace_init(void); +void dlm_lockspace_exit(void); +struct dlm_ls *dlm_find_lockspace_global(uint32_t id); +struct dlm_ls *dlm_find_lockspace_local(void *id); +struct dlm_ls *dlm_find_lockspace_device(int minor); +void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace); + +#endif /* __LOCKSPACE_DOT_H__ */ + diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c new file mode 100644 index 0000000000..32dbd1a828 --- /dev/null +++ b/fs/dlm/lowcomms.c @@ -0,0 +1,1997 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. 
nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is its + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * lowcomms will choose to use either TCP or SCTP as its transport layer + * depending on the configuration variable 'protocol'. This should be set + * to 0 (default) for TCP or 1 for SCTP. It should be configured using a + * cluster-wide mechanism as it must be the same on all nodes of the cluster + * for the DLM to function. + * + */ + +#include <asm/ioctls.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/mutex.h> +#include <linux/sctp.h> +#include <linux/slab.h> +#include <net/sctp/sctp.h> +#include <net/ipv6.h> + +#include <trace/events/dlm.h> +#include <trace/events/sock.h> + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "memory.h" +#include "config.h" + +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) +#define NEEDED_RMEM (4*1024*1024) + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + /* this semaphore is used to allow parallel recv/send in read + * lock mode. When we release a sock we need to held the write lock. + * + * However this is locking code and not nice. When we remove the + * othercon handling we can look into other mechanism to synchronize + * io handling to call sock_release() at the right time. + */ + struct rw_semaphore sock_lock; + unsigned long flags; +#define CF_APP_LIMITED 0 +#define CF_RECV_PENDING 1 +#define CF_SEND_PENDING 2 +#define CF_RECV_INTR 3 +#define CF_IO_STOP 4 +#define CF_IS_OTHERCON 5 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + spinlock_t writequeue_lock; + int retries; + struct hlist_node list; + /* due some connect()/accept() races we currently have this cross over + * connection attempt second connection for one node. + * + * There is a solution to avoid the race by introducing a connect + * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to + * connect. Otherside can connect but will only be considered that + * the other side wants to have a reconnect. + * + * However changing to this behaviour will break backwards compatible. + * In a DLM protocol major version upgrade we should remove this! 
+ */ + struct connection *othercon; + struct work_struct rwork; /* receive worker */ + struct work_struct swork; /* send worker */ + wait_queue_head_t shutdown_wait; + unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; + int rx_leftover; + int mark; + int addr_count; + int curr_addr_index; + struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; + spinlock_t addrs_lock; + struct rcu_head rcu; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +struct listen_connection { + struct socket *sock; + struct work_struct rwork; +}; + +#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) +#define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + bool dirty; + struct connection *con; + struct list_head msgs; + struct kref ref; +}; + +struct dlm_msg { + struct writequeue_entry *entry; + struct dlm_msg *orig_msg; + bool retransmit; + void *ppc; + int len; + int idx; /* new()/commit() idx exchange */ + + struct list_head list; + struct kref ref; +}; + +struct processqueue_entry { + unsigned char *buf; + int nodeid; + int buflen; + + struct list_head list; +}; + +struct dlm_proto_ops { + bool try_new_addr; + const char *name; + int proto; + + int (*connect)(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len); + void (*sockopts)(struct socket *sock); + int (*bind)(struct socket *sock); + int (*listen_validate)(void); + void (*listen_sockopts)(struct socket *sock); + int (*listen_bind)(struct socket *sock); +}; + +static struct listen_sock_callbacks { + void (*sk_error_report)(struct sock *); + void (*sk_data_ready)(struct sock *); + void (*sk_state_change)(struct sock *); + void (*sk_write_space)(struct sock *); +} listen_sock; + +static struct listen_connection listen_con; +static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static int dlm_local_count; + +/* Work queues */ +static struct workqueue_struct *io_workqueue; +static struct workqueue_struct *process_workqueue; + +static struct hlist_head connection_hash[CONN_HASH_SIZE]; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); + +static const struct dlm_proto_ops *dlm_proto_ops; + +#define DLM_IO_SUCCESS 0 +#define DLM_IO_END 1 +#define DLM_IO_EOF 2 +#define DLM_IO_RESCHED 3 + +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); +static void process_dlm_messages(struct work_struct *work); + +static DECLARE_WORK(process_work, process_dlm_messages); +static DEFINE_SPINLOCK(processqueue_lock); +static bool process_dlm_messages_pending; +static LIST_HEAD(processqueue); + +bool dlm_lowcomms_is_running(void) +{ + return !!listen_con.sock; +} + +static void lowcomms_queue_swork(struct connection *con) +{ + assert_spin_locked(&con->writequeue_lock); + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_bit(CF_APP_LIMITED, &con->flags) && + !test_and_set_bit(CF_SEND_PENDING, &con->flags)) + queue_work(io_workqueue, &con->swork); +} + +static void lowcomms_queue_rwork(struct connection *con) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); +#endif + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_and_set_bit(CF_RECV_PENDING, &con->flags)) + queue_work(io_workqueue, &con->rwork); +} + +static void writequeue_entry_ctor(void *data) +{ + struct writequeue_entry *entry = data; + + INIT_LIST_HEAD(&entry->msgs); +} + 
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) +{ + return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), + 0, 0, writequeue_entry_ctor); +} + +struct kmem_cache *dlm_lowcomms_msg_cache_create(void) +{ + return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL); +} + +/* need to held writequeue_lock */ +static struct writequeue_entry *con_next_wq(struct connection *con) +{ + struct writequeue_entry *e; + + e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, + list); + /* if len is zero nothing is to send, if there are users filling + * buffers we wait until the users are done so we can send more. + */ + if (!e || e->users || e->len == 0) + return NULL; + + return e; +} + +static struct connection *__find_con(int nodeid, int r) +{ + struct connection *con; + + hlist_for_each_entry_rcu(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) + return con; + } + + return NULL; +} + +static void dlm_con_init(struct connection *con, int nodeid) +{ + con->nodeid = nodeid; + init_rwsem(&con->sock_lock); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); + spin_lock_init(&con->addrs_lock); + init_waitqueue_head(&con->shutdown_wait); +} + +/* + * If 'allocation' is zero then we don't attempt to create a new + * connection structure for this node. + */ +static struct connection *nodeid2con(int nodeid, gfp_t alloc) +{ + struct connection *con, *tmp; + int r; + + r = nodeid_hash(nodeid); + con = __find_con(nodeid, r); + if (con || !alloc) + return con; + + con = kzalloc(sizeof(*con), alloc); + if (!con) + return NULL; + + dlm_con_init(con, nodeid); + + spin_lock(&connections_lock); + /* Because multiple workqueues/threads calls this function it can + * race on multiple cpu's. Instead of locking hot path __find_con() + * we just check in rare cases of recently added nodes again + * under protection of connections_lock. If this is the case we + * abort our connection creation and return the existing connection. 
+ */ + tmp = __find_con(nodeid, r); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con); + return tmp; + } + + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + + return con; +} + +static int addr_compare(const struct sockaddr_storage *x, + const struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, + struct sockaddr *sa_out, bool try_new_addr, + unsigned int *mark) +{ + struct sockaddr_storage sas; + struct connection *con; + int idx; + + if (!dlm_local_count) + return -1; + + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + memcpy(&sas, &con->addr[con->curr_addr_index], + sizeof(struct sockaddr_storage)); + + if (try_new_addr) { + con->curr_addr_index++; + if (con->curr_addr_index == con->addr_count) + con->curr_addr_index = 0; + } + + *mark = con->mark; + spin_unlock(&con->addrs_lock); + + if (sas_out) + memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); + + if (!sa_out) { + srcu_read_unlock(&connections_srcu, idx); + return 0; + } + + if (dlm_local_addr[0].ss_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; + struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; + ret6->sin6_addr = in6->sin6_addr; + } + + srcu_read_unlock(&connections_srcu, idx); + return 0; +} + +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, + unsigned int *mark) +{ + struct connection *con; + int i, idx, addr_i; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + WARN_ON_ONCE(!con->addr_count); + + spin_lock(&con->addrs_lock); + for (addr_i = 0; addr_i < con->addr_count; addr_i++) { + if (addr_compare(&con->addr[addr_i], addr)) { + *nodeid = con->nodeid; + *mark = con->mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } + } + spin_unlock(&con->addrs_lock); + } + } + srcu_read_unlock(&connections_srcu, idx); + + return -ENOENT; +} + +static bool dlm_lowcomms_con_has_addr(const struct connection *con, + const struct sockaddr_storage *addr) +{ + int i; + + for (i = 0; i < con->addr_count; i++) { + if (addr_compare(&con->addr[i], addr)) + return true; + } + + return false; +} + +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + struct connection *con; + bool ret, idx; + + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) { + 
srcu_read_unlock(&connections_srcu, idx); + return -ENOMEM; + } + + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + memcpy(&con->addr[0], addr, sizeof(*addr)); + con->addr_count = 1; + con->mark = dlm_config.ci_mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } + + ret = dlm_lowcomms_con_has_addr(con, addr); + if (ret) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -EEXIST; + } + + if (con->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOSPC; + } + + memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); + srcu_read_unlock(&connections_srcu, idx); + spin_unlock(&con->addrs_lock); + return 0; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + trace_sk_data_ready(sk); + + set_bit(CF_RECV_INTR, &con->flags); + lowcomms_queue_rwork(con); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + clear_bit(SOCK_NOSPACE, &con->sock->flags); + + spin_lock_bh(&con->writequeue_lock); + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + con->sock->sk->sk_write_pending--; + clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); + } + + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); +} + +static void lowcomms_state_change(struct sock *sk) +{ + /* SCTP layer is not calling sk_data_ready when the connection + * is done, so we catch the signal through here. + */ + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); +} + +static void lowcomms_listen_data_ready(struct sock *sk) +{ + trace_sk_data_ready(sk); + + queue_work(io_workqueue, &listen_con.rwork); +} + +int dlm_lowcomms_connect_node(int nodeid) +{ + struct connection *con; + int idx; + + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + down_read(&con->sock_lock); + if (!con->sock) { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + up_read(&con->sock_lock); + srcu_read_unlock(&connections_srcu, idx); + + cond_resched(); + return 0; +} + +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) +{ + struct connection *con; + int idx; + + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + spin_lock(&con->addrs_lock); + con->mark = mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; +} + +static void lowcomms_error_report(struct sock *sk) +{ + struct connection *con = sock2con(sk); + struct inet_sock *inet; + + inet = inet_sk(sk); + switch (sk->sk_family) { + case AF_INET: + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %pI4, dport %d, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &inet->inet_daddr, + ntohs(inet->inet_dport), sk->sk_err, + READ_ONCE(sk->sk_err_soft)); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %pI6c, " + "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sk->sk_v6_daddr, + ntohs(inet->inet_dport), sk->sk_err, + READ_ONCE(sk->sk_err_soft)); + break; +#endif + default: + 
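+		/* unexpected address family on this socket; report the generic error state */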
printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "invalid socket family %d set, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + sk->sk_family, sk->sk_err, + READ_ONCE(sk->sk_err_soft)); + break; + } + + dlm_midcomms_unack_msg_resend(con->nodeid); + + listen_sock.sk_error_report(sk); +} + +static void restore_callbacks(struct sock *sk) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(sk)); +#endif + + sk->sk_user_data = NULL; + sk->sk_data_ready = listen_sock.sk_data_ready; + sk->sk_state_change = listen_sock.sk_state_change; + sk->sk_write_space = listen_sock.sk_write_space; + sk->sk_error_report = listen_sock.sk_error_report; +} + +/* Make a socket active */ +static void add_sock(struct socket *sock, struct connection *con) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + con->sock = sock; + + sk->sk_user_data = con; + sk->sk_data_ready = lowcomms_data_ready; + sk->sk_write_space = lowcomms_write_space; + if (dlm_config.ci_protocol == DLM_PROTO_SCTP) + sk->sk_state_change = lowcomms_state_change; + sk->sk_allocation = GFP_NOFS; + sk->sk_use_task_frag = false; + sk->sk_error_report = lowcomms_error_report; + release_sock(sk); +} + +/* Add the port number to an IPv6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr[0].ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } + memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); +} + +static void dlm_page_release(struct kref *kref) +{ + struct writequeue_entry *e = container_of(kref, struct writequeue_entry, + ref); + + __free_page(e->page); + dlm_free_writequeue(e); +} + +static void dlm_msg_release(struct kref *kref) +{ + struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); + + kref_put(&msg->entry->ref, dlm_page_release); + dlm_free_msg(msg); +} + +static void free_entry(struct writequeue_entry *e) +{ + struct dlm_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &e->msgs, list) { + if (msg->orig_msg) { + msg->orig_msg->retransmit = false; + kref_put(&msg->orig_msg->ref, dlm_msg_release); + } + + list_del(&msg->list); + kref_put(&msg->ref, dlm_msg_release); + } + + list_del(&e->list); + kref_put(&e->ref, dlm_page_release); +} + +static void dlm_close_sock(struct socket **sock) +{ + lock_sock((*sock)->sk); + restore_callbacks((*sock)->sk); + release_sock((*sock)->sk); + + sock_release(*sock); + *sock = NULL; +} + +static void allow_connection_io(struct connection *con) +{ + if (con->othercon) + clear_bit(CF_IO_STOP, &con->othercon->flags); + clear_bit(CF_IO_STOP, &con->flags); +} + +static void stop_connection_io(struct connection *con) +{ + if (con->othercon) + stop_connection_io(con->othercon); + + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + + down_write(&con->sock_lock); + if (con->sock) { + lock_sock(con->sock->sk); + restore_callbacks(con->sock->sk); + release_sock(con->sock->sk); + } + up_write(&con->sock_lock); + + cancel_work_sync(&con->swork); + cancel_work_sync(&con->rwork); +} + +/* Close a remote 
connection and tidy up */ +static void close_connection(struct connection *con, bool and_other) +{ + struct writequeue_entry *e; + + if (con->othercon && and_other) + close_connection(con->othercon, false); + + down_write(&con->sock_lock); + if (!con->sock) { + up_write(&con->sock_lock); + return; + } + + dlm_close_sock(&con->sock); + + /* if we send a writequeue entry only a half way, we drop the + * whole entry because reconnection and that we not start of the + * middle of a msg which will confuse the other end. + * + * we can always drop messages because retransmits, but what we + * cannot allow is to transmit half messages which may be processed + * at the other side. + * + * our policy is to start on a clean state when disconnects, we don't + * know what's send/received on transport layer in this case. + */ + spin_lock_bh(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->dirty) + free_entry(e); + } + spin_unlock_bh(&con->writequeue_lock); + + con->rx_leftover = 0; + con->retries = 0; + clear_bit(CF_APP_LIMITED, &con->flags); + clear_bit(CF_RECV_PENDING, &con->flags); + clear_bit(CF_SEND_PENDING, &con->flags); + up_write(&con->sock_lock); +} + +static void shutdown_connection(struct connection *con, bool and_other) +{ + int ret; + + if (con->othercon && and_other) + shutdown_connection(con->othercon, false); + + flush_workqueue(io_workqueue); + down_read(&con->sock_lock); + /* nothing to shutdown */ + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + up_read(&con->sock_lock); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, !con->sock, + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + close_connection(con, false); +} + +static struct processqueue_entry *new_processqueue_entry(int nodeid, + int buflen) +{ + struct processqueue_entry *pentry; + + pentry = kmalloc(sizeof(*pentry), GFP_NOFS); + if (!pentry) + return NULL; + + pentry->buf = kmalloc(buflen, GFP_NOFS); + if (!pentry->buf) { + kfree(pentry); + return NULL; + } + + pentry->nodeid = nodeid; + return pentry; +} + +static void free_processqueue_entry(struct processqueue_entry *pentry) +{ + kfree(pentry->buf); + kfree(pentry); +} + +struct dlm_processed_nodes { + int nodeid; + + struct list_head list; +}; + +static void process_dlm_messages(struct work_struct *work) +{ + struct processqueue_entry *pentry; + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (WARN_ON_ONCE(!pentry)) { + process_dlm_messages_pending = false; + spin_unlock(&processqueue_lock); + return; + } + + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + + for (;;) { + dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, + pentry->buflen); + free_processqueue_entry(pentry); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (!pentry) { + process_dlm_messages_pending = false; + spin_unlock(&processqueue_lock); + break; + } + + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con, int 
buflen) +{ + struct processqueue_entry *pentry; + int ret, buflen_real; + struct msghdr msg; + struct kvec iov; + + pentry = new_processqueue_entry(con->nodeid, buflen); + if (!pentry) + return DLM_IO_RESCHED; + + memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover); + + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = pentry->buf + con->rx_leftover; + iov.iov_len = buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + clear_bit(CF_RECV_INTR, &con->flags); +again: + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); + if (ret == -EAGAIN) { + lock_sock(con->sock->sk); + if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) { + release_sock(con->sock->sk); + goto again; + } + + clear_bit(CF_RECV_PENDING, &con->flags); + release_sock(con->sock->sk); + free_processqueue_entry(pentry); + return DLM_IO_END; + } else if (ret == 0) { + /* close will clear CF_RECV_PENDING */ + free_processqueue_entry(pentry); + return DLM_IO_EOF; + } else if (ret < 0) { + free_processqueue_entry(pentry); + return ret; + } + + /* new buflen according readed bytes and leftover from last receive */ + buflen_real = ret + con->rx_leftover; + ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf, + buflen_real); + if (ret < 0) { + free_processqueue_entry(pentry); + return ret; + } + + pentry->buflen = ret; + + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. + */ + con->rx_leftover = buflen_real - ret; + memmove(con->rx_leftover_buf, pentry->buf + ret, + con->rx_leftover); + + spin_lock(&processqueue_lock); + list_add_tail(&pentry->list, &processqueue); + if (!process_dlm_messages_pending) { + process_dlm_messages_pending = true; + queue_work(process_workqueue, &process_work); + } + spin_unlock(&processqueue_lock); + + return DLM_IO_SUCCESS; +} + +/* Listening socket is busy, accept a connection */ +static int accept_from_sock(void) +{ + struct sockaddr_storage peeraddr; + int len, idx, result, nodeid; + struct connection *newcon; + struct socket *newsock; + unsigned int mark; + + result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); + if (result == -EAGAIN) + return DLM_IO_END; + else if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); + if (len < 0) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { + switch (peeraddr.ss_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; + + log_print("connect from non cluster IPv4 node %pI4", + &sin->sin_addr); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; + + log_print("connect from non cluster IPv6 node %pI6c", + &sin6->sin6_addr); + break; + } +#endif + default: + log_print("invalid family from non cluster node"); + break; + } + + sock_release(newsock); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. 
This
+ * could happen if the two nodes initiate a connection at roughly
+ * the same time and the connections cross on the wire.
+ * In this case we store the incoming one in "othercon"
+ */
+	idx = srcu_read_lock(&connections_srcu);
+	newcon = nodeid2con(nodeid, 0);
+	if (WARN_ON_ONCE(!newcon)) {
+		srcu_read_unlock(&connections_srcu, idx);
+		result = -ENOENT;
+		goto accept_err;
+	}
+
+	sock_set_mark(newsock->sk, mark);
+
+	down_write(&newcon->sock_lock);
+	if (newcon->sock) {
+		struct connection *othercon = newcon->othercon;
+
+		if (!othercon) {
+			othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
+			if (!othercon) {
+				log_print("failed to allocate incoming socket");
+				up_write(&newcon->sock_lock);
+				srcu_read_unlock(&connections_srcu, idx);
+				result = -ENOMEM;
+				goto accept_err;
+			}
+
+			dlm_con_init(othercon, nodeid);
+			lockdep_set_subclass(&othercon->sock_lock, 1);
+			newcon->othercon = othercon;
+			set_bit(CF_IS_OTHERCON, &othercon->flags);
+		} else {
+			/* close other sock con if we have something new */
+			close_connection(othercon, false);
+		}
+
+		down_write(&othercon->sock_lock);
+		add_sock(newsock, othercon);
+
+		/* check if we received something while adding */
+		lock_sock(othercon->sock->sk);
+		lowcomms_queue_rwork(othercon);
+		release_sock(othercon->sock->sk);
+		up_write(&othercon->sock_lock);
+	} else {
+		/* accept copies the sk after we've saved the callbacks, so we
+		 * don't want to save them a second time or comm errors will
+		 * result in calling sk_error_report recursively.
+		 */
+		add_sock(newsock, newcon);
+
+		/* check if we received something while adding */
+		lock_sock(newcon->sock->sk);
+		lowcomms_queue_rwork(newcon);
+		release_sock(newcon->sock->sk);
+	}
+	up_write(&newcon->sock_lock);
+	srcu_read_unlock(&connections_srcu, idx);
+
+	return DLM_IO_SUCCESS;
+
+accept_err:
+	if (newsock)
+		sock_release(newsock);
+
+	return result;
+}
+
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */ +static void writequeue_entry_complete(struct writequeue_entry *e, int completed) +{ + e->offset += completed; + e->len -= completed; + /* signal that page was half way transmitted */ + e->dirty = true; + + if (e->len == 0 && e->users == 0) + free_entry(e); +} + +/* + * sctp_bind_addrs - bind a SCTP socket to all our addresses + */ +static int sctp_bind_addrs(struct socket *sock, uint16_t port) +{ + struct sockaddr_storage localaddr; + struct sockaddr *addr = (struct sockaddr *)&localaddr; + int i, addr_len, result = 0; + + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, port, &addr_len); + + if (!i) + result = kernel_bind(sock, addr, addr_len); + else + result = sock_bind_add(sock->sk, addr, addr_len); + + if (result < 0) { + log_print("Can't bind to %d addr number %d, %d.\n", + port, i + 1, result); + break; + } + } + return result; +} + +/* Get local addresses */ +static void init_local(void) +{ + struct sockaddr_storage sas; + int i; + + dlm_local_count = 0; + for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { + if (dlm_our_addr(&sas, i)) + break; + + memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); + } +} + +static struct writequeue_entry *new_writequeue_entry(struct connection *con) +{ + struct writequeue_entry *entry; + + entry = dlm_allocate_writequeue(); + if (!entry) + return NULL; + + entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!entry->page) { + dlm_free_writequeue(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->dirty = false; + entry->con = con; + entry->users = 1; + kref_init(&entry->ref); + return entry; +} + +static struct writequeue_entry *new_wq_entry(struct connection *con, int len, + char **ppc, void (*cb)(void *data), + void *data) +{ + struct writequeue_entry *e; + + spin_lock_bh(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_last_entry(&con->writequeue, struct writequeue_entry, list); + if (DLM_WQ_REMAIN_BYTES(e) >= len) { + kref_get(&e->ref); + + *ppc = page_address(e->page) + e->end; + if (cb) + cb(data); + + e->end += len; + e->users++; + goto out; + } + } + + e = new_writequeue_entry(con); + if (!e) + goto out; + + kref_get(&e->ref); + *ppc = page_address(e->page); + e->end += len; + if (cb) + cb(data); + + list_add_tail(&e->list, &con->writequeue); + +out: + spin_unlock_bh(&con->writequeue_lock); + return e; +}; + +static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, + gfp_t allocation, char **ppc, + void (*cb)(void *data), + void *data) +{ + struct writequeue_entry *e; + struct dlm_msg *msg; + + msg = dlm_allocate_msg(allocation); + if (!msg) + return NULL; + + kref_init(&msg->ref); + + e = new_wq_entry(con, len, ppc, cb, data); + if (!e) { + dlm_free_msg(msg); + return NULL; + } + + msg->retransmit = false; + msg->orig_msg = NULL; + msg->ppc = *ppc; + msg->len = len; + msg->entry = e; + + return msg; +} + +/* avoid false positive for nodes_srcu, unlock happens in + * dlm_lowcomms_commit_msg which is a must call if success + */ +#ifndef __CHECKER__ +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(void *data), + void *data) +{ + struct connection *con; + struct dlm_msg *msg; + int idx; + + if (len > DLM_MAX_SOCKET_BUFSIZE || + len < sizeof(struct dlm_header)) { + BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); + log_print("failed to allocate a buffer of size %d", len); + WARN_ON_ONCE(1); + return NULL; + } 
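+	/* the srcu read lock taken below is held until dlm_lowcomms_commit_msg()
+	 * on success, or dropped on the error paths in this function
+	 */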
+ + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return NULL; + } + + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data); + if (!msg) { + srcu_read_unlock(&connections_srcu, idx); + return NULL; + } + + /* for dlm_lowcomms_commit_msg() */ + kref_get(&msg->ref); + /* we assume if successful commit must called */ + msg->idx = idx; + return msg; +} +#endif + +static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) +{ + struct writequeue_entry *e = msg->entry; + struct connection *con = e->con; + int users; + + spin_lock_bh(&con->writequeue_lock); + kref_get(&msg->ref); + list_add(&msg->list, &e->msgs); + + users = --e->users; + if (users) + goto out; + + e->len = DLM_WQ_LENGTH_BYTES(e); + + lowcomms_queue_swork(con); + +out: + spin_unlock_bh(&con->writequeue_lock); + return; +} + +/* avoid false positive for nodes_srcu, lock was happen in + * dlm_lowcomms_new_msg + */ +#ifndef __CHECKER__ +void dlm_lowcomms_commit_msg(struct dlm_msg *msg) +{ + _dlm_lowcomms_commit_msg(msg); + srcu_read_unlock(&connections_srcu, msg->idx); + /* because dlm_lowcomms_new_msg() */ + kref_put(&msg->ref, dlm_msg_release); +} +#endif + +void dlm_lowcomms_put_msg(struct dlm_msg *msg) +{ + kref_put(&msg->ref, dlm_msg_release); +} + +/* does not held connections_srcu, usage lowcomms_error_report only */ +int dlm_lowcomms_resend_msg(struct dlm_msg *msg) +{ + struct dlm_msg *msg_resend; + char *ppc; + + if (msg->retransmit) + return 1; + + msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len, + GFP_ATOMIC, &ppc, NULL, NULL); + if (!msg_resend) + return -ENOMEM; + + msg->retransmit = true; + kref_get(&msg->ref); + msg_resend->orig_msg = msg; + + memcpy(ppc, msg->ppc, msg->len); + _dlm_lowcomms_commit_msg(msg_resend); + dlm_lowcomms_put_msg(msg_resend); + + return 0; +} + +/* Send a message */ +static int send_to_sock(struct connection *con) +{ + struct writequeue_entry *e; + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL, + }; + int len, offset, ret; + + spin_lock_bh(&con->writequeue_lock); + e = con_next_wq(con); + if (!e) { + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + return DLM_IO_END; + } + + len = e->len; + offset = e->offset; + WARN_ON_ONCE(len == 0 && e->users == 0); + spin_unlock_bh(&con->writequeue_lock); + + bvec_set_page(&bvec, e->page, len, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(con->sock, &msg); + trace_dlm_send(con->nodeid, ret); + if (ret == -EAGAIN || ret == 0) { + lock_sock(con->sock->sk); + spin_lock_bh(&con->writequeue_lock); + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. 
+ */ + set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); + con->sock->sk->sk_write_pending++; + + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + /* wait for write_space() event */ + return DLM_IO_END; + } + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + return DLM_IO_RESCHED; + } else if (ret < 0) { + return ret; + } + + spin_lock_bh(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock_bh(&con->writequeue_lock); + + return DLM_IO_SUCCESS; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct writequeue_entry *e, *safe; + + spin_lock_bh(&con->writequeue_lock); + list_for_each_entry_safe(e, safe, &con->writequeue, list) { + free_entry(e); + } + spin_unlock_bh(&con->writequeue_lock); +} + +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + WARN_ON_ONCE(!list_empty(&con->writequeue)); + WARN_ON_ONCE(con->sock); + kfree(con); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + int idx; + + log_print("closing connection to node %d", nodeid); + + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + stop_connection_io(con); + log_print("io handling for node: %d stopped", nodeid); + close_connection(con, true); + + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + clean_one_writequeue(con); + call_srcu(&connections_srcu, &con->rcu, connection_release); + if (con->othercon) { + clean_one_writequeue(con->othercon); + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); + } + srcu_read_unlock(&connections_srcu, idx); + + /* for debugging we print when we are done to compare with other + * messages in between. 
This function need to be correctly synchronized + * with io handling + */ + log_print("closing connection to node %d done", nodeid); + + return 0; +} + +/* Receive worker function */ +static void process_recv_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, rwork); + int ret, buflen; + + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + buflen = READ_ONCE(dlm_config.ci_buffer_size); + do { + ret = receive_from_sock(con, buflen); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); + + switch (ret) { + case DLM_IO_END: + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_EOF: + close_connection(con, false); + wake_up(&con->shutdown_wait); + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_RESCHED: + cond_resched(); + queue_work(io_workqueue, &con->rwork); + /* CF_RECV_PENDING not cleared */ + break; + default: + if (ret < 0) { + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, false); + } else { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + + /* CF_RECV_PENDING cleared for othercon + * we trigger send queue if not already done + * and process_send_sockets will handle it + */ + break; + } + + WARN_ON_ONCE(1); + break; + } +} + +static void process_listen_recv_socket(struct work_struct *work) +{ + int ret; + + if (WARN_ON_ONCE(!listen_con.sock)) + return; + + do { + ret = accept_from_sock(); + } while (ret == DLM_IO_SUCCESS); + + if (ret < 0) + log_print("critical error accepting connection: %d", ret); +} + +static int dlm_connect(struct connection *con) +{ + struct sockaddr_storage addr; + int result, addr_len; + struct socket *sock; + unsigned int mark; + + memset(&addr, 0, sizeof(addr)); + result = nodeid_to_addr(con->nodeid, &addr, NULL, + dlm_proto_ops->try_new_addr, &mark); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); + return result; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) + return result; + + sock_set_mark(sock->sk, mark); + dlm_proto_ops->sockopts(sock); + + result = dlm_proto_ops->bind(sock); + if (result < 0) { + sock_release(sock); + return result; + } + + add_sock(sock, con); + + log_print_ratelimited("connecting to %d", con->nodeid); + make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); + result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, + addr_len); + switch (result) { + case -EINPROGRESS: + /* not an error */ + fallthrough; + case 0: + break; + default: + if (result < 0) + dlm_close_sock(&con->sock); + + break; + } + + return result; +} + +/* Send worker function */ +static void process_send_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, swork); + int ret; + + WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags)); + + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + down_write(&con->sock_lock); + if (!con->sock) { + ret = dlm_connect(con); + switch (ret) { + case 0: + break; + case -EINPROGRESS: + /* avoid spamming resched on connection + * we might can switch to a state_change + * event based mechanism if established + */ + msleep(100); + break; + default: + /* CF_SEND_PENDING not cleared */ + up_write(&con->sock_lock); + log_print("connect to node %d try %d error %d", + con->nodeid, con->retries++, ret); + 
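+				/* back off briefly before requeueing another
+				 * connect attempt
+				 */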
msleep(1000); + /* For now we try forever to reconnect. In + * future we should send a event to cluster + * manager to fence itself after certain amount + * of retries. + */ + queue_work(io_workqueue, &con->swork); + return; + } + } + downgrade_write(&con->sock_lock); + } + + do { + ret = send_to_sock(con); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); + + switch (ret) { + case DLM_IO_END: + /* CF_SEND_PENDING cleared */ + break; + case DLM_IO_RESCHED: + /* CF_SEND_PENDING not cleared */ + cond_resched(); + queue_work(io_workqueue, &con->swork); + break; + default: + if (ret < 0) { + close_connection(con, false); + + /* CF_SEND_PENDING cleared */ + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + break; + } + + WARN_ON_ONCE(1); + break; + } +} + +static void work_stop(void) +{ + if (io_workqueue) { + destroy_workqueue(io_workqueue); + io_workqueue = NULL; + } + + if (process_workqueue) { + destroy_workqueue(process_workqueue); + process_workqueue = NULL; + } +} + +static int work_start(void) +{ + io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_UNBOUND, 0); + if (!io_workqueue) { + log_print("can't start dlm_io"); + return -ENOMEM; + } + + /* ordered dlm message process queue, + * should be converted to a tasklet + */ + process_workqueue = alloc_ordered_workqueue("dlm_process", + WQ_HIGHPRI | WQ_MEM_RECLAIM); + if (!process_workqueue) { + log_print("can't start dlm_process"); + destroy_workqueue(io_workqueue); + io_workqueue = NULL; + return -ENOMEM; + } + + return 0; +} + +void dlm_lowcomms_shutdown(void) +{ + struct connection *con; + int i, idx; + + /* stop lowcomms_listen_data_ready calls */ + lock_sock(listen_con.sock->sk); + listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; + release_sock(listen_con.sock->sk); + + cancel_work_sync(&listen_con.rwork); + dlm_close_sock(&listen_con.sock); + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + shutdown_connection(con, true); + stop_connection_io(con); + flush_workqueue(process_workqueue); + close_connection(con, true); + + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + } + } + srcu_read_unlock(&connections_srcu, idx); +} + +void dlm_lowcomms_stop(void) +{ + work_stop(); + dlm_proto_ops = NULL; +} + +static int dlm_listen_for_all(void) +{ + struct socket *sock; + int result; + + log_print("Using %s for communications", + dlm_proto_ops->name); + + result = dlm_proto_ops->listen_validate(); + if (result < 0) + return result; + + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) { + log_print("Can't create comms socket: %d", result); + return result; + } + + sock_set_mark(sock->sk, dlm_config.ci_mark); + dlm_proto_ops->listen_sockopts(sock); + + result = dlm_proto_ops->listen_bind(sock); + if (result < 0) + goto out; + + lock_sock(sock->sk); + listen_sock.sk_data_ready = sock->sk->sk_data_ready; + listen_sock.sk_write_space = sock->sk->sk_write_space; + listen_sock.sk_error_report = sock->sk->sk_error_report; + listen_sock.sk_state_change = sock->sk->sk_state_change; + + listen_con.sock = sock; + + sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; + sock->sk->sk_data_ready = lowcomms_listen_data_ready; + release_sock(sock->sk); + + result = sock->ops->listen(sock, 
128); + if (result < 0) { + dlm_close_sock(&listen_con.sock); + return result; + } + + return 0; + +out: + sock_release(sock); + return result; +} + +static int dlm_tcp_bind(struct socket *sock) +{ + struct sockaddr_storage src_addr; + int result, addr_len; + + /* Bind to our cluster-known address connecting to avoid + * routing problems. + */ + memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + + result = kernel_bind(sock, (struct sockaddr *)&src_addr, + addr_len); + if (result < 0) { + /* This *may* not indicate a critical error */ + log_print("could not bind for connect: %d", result); + } + + return 0; +} + +static int dlm_tcp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + return kernel_connect(sock, addr, addr_len, O_NONBLOCK); +} + +static int dlm_tcp_listen_validate(void) +{ + /* We don't support multi-homed hosts */ + if (dlm_local_count > 1) { + log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); + return -EINVAL; + } + + return 0; +} + +static void dlm_tcp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + tcp_sock_set_nodelay(sock->sk); +} + +static void dlm_tcp_listen_sockopts(struct socket *sock) +{ + dlm_tcp_sockopts(sock); + sock_set_reuseaddr(sock->sk); +} + +static int dlm_tcp_listen_bind(struct socket *sock) +{ + int addr_len; + + /* Bind to our port */ + make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], + addr_len); +} + +static const struct dlm_proto_ops dlm_tcp_ops = { + .name = "TCP", + .proto = IPPROTO_TCP, + .connect = dlm_tcp_connect, + .sockopts = dlm_tcp_sockopts, + .bind = dlm_tcp_bind, + .listen_validate = dlm_tcp_listen_validate, + .listen_sockopts = dlm_tcp_listen_sockopts, + .listen_bind = dlm_tcp_listen_bind, +}; + +static int dlm_sctp_bind(struct socket *sock) +{ + return sctp_bind_addrs(sock, 0); +} + +static int dlm_sctp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + /* + * Make kernel_connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. 
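+	 * (sock_set_sndtimeo(sk, 0) restores the default, i.e. an unbounded
+	 * send timeout.)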
+ */ + sock_set_sndtimeo(sock->sk, 5); + ret = kernel_connect(sock, addr, addr_len, 0); + sock_set_sndtimeo(sock->sk, 0); + return ret; +} + +static int dlm_sctp_listen_validate(void) +{ + if (!IS_ENABLED(CONFIG_IP_SCTP)) { + log_print("SCTP is not enabled by this kernel"); + return -EOPNOTSUPP; + } + + request_module("sctp"); + return 0; +} + +static int dlm_sctp_bind_listen(struct socket *sock) +{ + return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); +} + +static void dlm_sctp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + sctp_sock_set_nodelay(sock->sk); + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); +} + +static const struct dlm_proto_ops dlm_sctp_ops = { + .name = "SCTP", + .proto = IPPROTO_SCTP, + .try_new_addr = true, + .connect = dlm_sctp_connect, + .sockopts = dlm_sctp_sockopts, + .bind = dlm_sctp_bind, + .listen_validate = dlm_sctp_listen_validate, + .listen_sockopts = dlm_sctp_sockopts, + .listen_bind = dlm_sctp_bind_listen, +}; + +int dlm_lowcomms_start(void) +{ + int error; + + init_local(); + if (!dlm_local_count) { + error = -ENOTCONN; + log_print("no local IP address has been set"); + goto fail; + } + + error = work_start(); + if (error) + goto fail; + + /* Start listening */ + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: + dlm_proto_ops = &dlm_tcp_ops; + break; + case DLM_PROTO_SCTP: + dlm_proto_ops = &dlm_sctp_ops; + break; + default: + log_print("Invalid protocol identifier %d set", + dlm_config.ci_protocol); + error = -EINVAL; + goto fail_proto_ops; + } + + error = dlm_listen_for_all(); + if (error) + goto fail_listen; + + return 0; + +fail_listen: + dlm_proto_ops = NULL; +fail_proto_ops: + work_stop(); +fail: + return error; +} + +void dlm_lowcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); +} + +void dlm_lowcomms_exit(void) +{ + struct connection *con; + int i, idx; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); + } + } + srcu_read_unlock(&connections_srcu, idx); +} diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h new file mode 100644 index 0000000000..3e8dca6618 --- /dev/null +++ b/fs/dlm/lowcomms.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +#include "dlm_internal.h" + +#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts) +#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \ + DLM_MIDCOMMS_OPT_LEN) + +#define CONN_HASH_SIZE 32 + +/* This is deliberately very simple because most clusters have simple + * sequential nodeids, so we should be able to go straight to a connection + * struct in the array + */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} + +/* check if dlm is running */ +bool dlm_lowcomms_is_running(void); + +int dlm_lowcomms_start(void); +void dlm_lowcomms_shutdown(void); +void dlm_lowcomms_shutdown_node(int nodeid, bool force); +void dlm_lowcomms_stop(void); +void dlm_lowcomms_init(void); +void dlm_lowcomms_exit(void); +int dlm_lowcomms_close(int nodeid); +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(void *data), + void *data); +void dlm_lowcomms_commit_msg(struct dlm_msg *msg); +void dlm_lowcomms_put_msg(struct dlm_msg *msg); +int dlm_lowcomms_resend_msg(struct dlm_msg *msg); +int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); +void dlm_midcomms_receive_done(int nodeid); +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void); +struct kmem_cache *dlm_lowcomms_msg_cache_create(void); + +#endif /* __LOWCOMMS_DOT_H__ */ + diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h new file mode 100644 index 0000000000..09052d9671 --- /dev/null +++ b/fs/dlm/lvb_table.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LVB_TABLE_DOT_H__ +#define __LVB_TABLE_DOT_H__ + +extern const int dlm_lvb_operations[8][8]; + +#endif diff --git a/fs/dlm/main.c b/fs/dlm/main.c new file mode 100644 index 0000000000..6ca28299c9 --- /dev/null +++ b/fs/dlm/main.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#include <linux/module.h> + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "user.h" +#include "memory.h" +#include "config.h" +#include "midcomms.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/dlm.h> + +static int __init init_dlm(void) +{ + int error; + + error = dlm_memory_init(); + if (error) + goto out; + + dlm_midcomms_init(); + + error = dlm_lockspace_init(); + if (error) + goto out_mem; + + error = dlm_config_init(); + if (error) + goto out_lockspace; + + dlm_register_debugfs(); + + error = dlm_user_init(); + if (error) + goto out_debug; + + error = dlm_plock_init(); + if (error) + goto out_user; + + printk("DLM installed\n"); + + return 0; + + out_user: + dlm_user_exit(); + out_debug: + dlm_unregister_debugfs(); + dlm_config_exit(); + out_lockspace: + dlm_lockspace_exit(); + out_mem: + dlm_midcomms_exit(); + dlm_memory_exit(); + out: + return error; +} + +static void __exit exit_dlm(void) +{ + dlm_plock_exit(); + dlm_user_exit(); + dlm_config_exit(); + dlm_lockspace_exit(); + dlm_midcomms_exit(); + dlm_unregister_debugfs(); + dlm_memory_exit(); +} + +module_init(init_dlm); +module_exit(exit_dlm); + +MODULE_DESCRIPTION("Distributed Lock Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(dlm_new_lockspace); +EXPORT_SYMBOL_GPL(dlm_release_lockspace); +EXPORT_SYMBOL_GPL(dlm_lock); +EXPORT_SYMBOL_GPL(dlm_unlock); + diff --git a/fs/dlm/member.c b/fs/dlm/member.c new file mode 100644 index 0000000000..be7909ead7 --- /dev/null +++ b/fs/dlm/member.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "recover.h" +#include "rcom.h" +#include "config.h" +#include "midcomms.h" +#include "lowcomms.h" + +int dlm_slots_version(const struct dlm_header *h) +{ + if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) + return 0; + return 1; +} + +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb) +{ + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + + if (!dlm_slots_version(&rc->rc_header)) + return; + + memb->slot = le16_to_cpu(rf->rf_our_slot); + memb->generation = le32_to_cpu(rf->rf_generation); +} + +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_slot *slot; + struct rcom_slot *ro; + int i; + + ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + /* ls_slots array is sparse, but not rcom_slots */ + + for (i = 0; i < ls->ls_slots_size; i++) { + slot = &ls->ls_slots[i]; + if (!slot->nodeid) + continue; + ro->ro_nodeid = cpu_to_le32(slot->nodeid); + ro->ro_slot = cpu_to_le16(slot->slot); + ro++; + } +} + +#define SLOT_DEBUG_LINE 128 + +static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) +{ + char line[SLOT_DEBUG_LINE]; + int len = SLOT_DEBUG_LINE - 1; + int pos = 0; + int ret, i; + + memset(line, 0, sizeof(line)); + + if (array) { + for (i = 0; i < array_size; i++) { + if (!array[i].nodeid) + continue; + + ret = snprintf(line + pos, len - pos, " %d:%d", + array[i].slot, array[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } else if (ro0) { + for (i = 0; i < num_slots; i++) { + ret = snprintf(line + pos, len - pos, " %d:%d", + ro0[i].ro_slot, ro0[i].ro_nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } + + log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line); +} + +int dlm_slots_copy_in(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_rcom *rc = ls->ls_recover_buf; + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + struct rcom_slot *ro0, *ro; + int our_nodeid = dlm_our_nodeid(); + int i, num_slots; + uint32_t gen; + + if (!dlm_slots_version(&rc->rc_header)) + return -1; + + gen = le32_to_cpu(rf->rf_generation); + if (gen <= ls->ls_generation) { + log_error(ls, "dlm_slots_copy_in gen %u old %u", + gen, ls->ls_generation); + } + ls->ls_generation = gen; + + num_slots = le16_to_cpu(rf->rf_num_slots); + if (!num_slots) + return -1; + + ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + log_slots(ls, gen, num_slots, ro0, NULL, 0); + + list_for_each_entry(memb, &ls->ls_nodes, list) { + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid) + continue; + memb->slot = le16_to_cpu(ro->ro_slot); + memb->slot_prev = memb->slot; + break; + } + + if (memb->nodeid == our_nodeid) { + if (ls->ls_slot && ls->ls_slot != memb->slot) { + log_error(ls, "dlm_slots_copy_in our slot " + "changed %d %d", ls->ls_slot, + memb->slot); + return -1; + } + + if (!ls->ls_slot) + ls->ls_slot = memb->slot; + } + + if (!memb->slot) { + log_error(ls, "dlm_slots_copy_in nodeid %d no slot", + memb->nodeid); + return -1; + } + } + + return 0; +} + +/* for any nodes that do not support slots, we will not have set memb->slot + in 
wait_status_all(), so memb->slot will remain -1, and we will not + assign slots or set ls_num_slots here */ + +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out) +{ + struct dlm_member *memb; + struct dlm_slot *array; + int our_nodeid = dlm_our_nodeid(); + int array_size, max_slots, i; + int need = 0; + int max = 0; + int num = 0; + uint32_t gen = 0; + + /* our own memb struct will have slot -1 gen 0 */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == our_nodeid) { + memb->slot = ls->ls_slot; + memb->generation = ls->ls_generation; + break; + } + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->generation > gen) + gen = memb->generation; + + /* node doesn't support slots */ + + if (memb->slot == -1) + return -1; + + /* node needs a slot assigned */ + + if (!memb->slot) + need++; + + /* node has a slot assigned */ + + num++; + + if (!max || max < memb->slot) + max = memb->slot; + + /* sanity check, once slot is assigned it shouldn't change */ + + if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { + log_error(ls, "nodeid %d slot changed %d %d", + memb->nodeid, memb->slot_prev, memb->slot); + return -1; + } + memb->slot_prev = memb->slot; + } + + array_size = max + need; + array = kcalloc(array_size, sizeof(*array), GFP_NOFS); + if (!array) + return -ENOMEM; + + num = 0; + + /* fill in slots (offsets) that are used */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!memb->slot) + continue; + + if (memb->slot > array_size) { + log_error(ls, "invalid slot number %d", memb->slot); + kfree(array); + return -1; + } + + array[memb->slot - 1].nodeid = memb->nodeid; + array[memb->slot - 1].slot = memb->slot; + num++; + } + + /* assign new slots from unused offsets */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->slot) + continue; + + for (i = 0; i < array_size; i++) { + if (array[i].nodeid) + continue; + + memb->slot = i + 1; + memb->slot_prev = memb->slot; + array[i].nodeid = memb->nodeid; + array[i].slot = memb->slot; + num++; + + if (!ls->ls_slot && memb->nodeid == our_nodeid) + ls->ls_slot = memb->slot; + break; + } + + if (!memb->slot) { + log_error(ls, "no free slot found"); + kfree(array); + return -1; + } + } + + gen++; + + log_slots(ls, gen, num, NULL, array, array_size); + + max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) - + sizeof(struct rcom_config)) / sizeof(struct rcom_slot); + + if (num > max_slots) { + log_error(ls, "num_slots %d exceeds max_slots %d", + num, max_slots); + kfree(array); + return -1; + } + + *gen_out = gen; + *slots_out = array; + *slots_size = array_size; + *num_slots = num; + return 0; +} + +static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) +{ + struct dlm_member *memb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + memb = list_entry(tmp, struct dlm_member, list); + if (new->nodeid < memb->nodeid) + break; + } + + if (!memb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +static int add_remote_member(int nodeid) +{ + int error; + + if (nodeid == dlm_our_nodeid()) + return 0; + + error = dlm_lowcomms_connect_node(nodeid); + if (error < 0) + return error; + + dlm_midcomms_add_member(nodeid); + return 0; +} + +static int 
dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) +{ + struct dlm_member *memb; + int error; + + memb = kzalloc(sizeof(*memb), GFP_NOFS); + if (!memb) + return -ENOMEM; + + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; + + error = add_remote_member(node->nodeid); + if (error < 0) { + kfree(memb); + return error; + } + + add_ordered_member(ls, memb); + ls->ls_num_nodes++; + return 0; +} + +static struct dlm_member *find_memb(struct list_head *head, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, head, list) { + if (memb->nodeid == nodeid) + return memb; + } + return NULL; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes, nodeid)) + return 1; + return 0; +} + +int dlm_is_removed(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes_gone, nodeid)) + return 1; + return 0; +} + +static void clear_memb_list(struct list_head *head, + void (*after_del)(int nodeid)) +{ + struct dlm_member *memb; + + while (!list_empty(head)) { + memb = list_entry(head->next, struct dlm_member, list); + list_del(&memb->list); + if (after_del) + after_del(memb->nodeid); + kfree(memb); + } +} + +static void remove_remote_member(int nodeid) +{ + if (nodeid == dlm_our_nodeid()) + return; + + dlm_midcomms_remove_member(nodeid); +} + +void dlm_clear_members(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes, remove_remote_member); + ls->ls_num_nodes = 0; +} + +void dlm_clear_members_gone(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes_gone, NULL); +} + +static void make_member_array(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int i, w, x = 0, total = 0, all_zero = 0, *array; + + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->weight) + total += memb->weight; + } + + /* all nodes revert to weight of 1 if all have weight 0 */ + + if (!total) { + total = ls->ls_num_nodes; + all_zero = 1; + } + + ls->ls_total_weight = total; + array = kmalloc_array(total, sizeof(*array), GFP_NOFS); + if (!array) + return; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!all_zero && !memb->weight) + continue; + + if (all_zero) + w = 1; + else + w = memb->weight; + + DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); + + for (i = 0; i < w; i++) + array[x++] = memb->nodeid; + } + + ls->ls_node_array = array; +} + +/* send a status request to all members just to establish comms connections */ + +static int ping_members(struct dlm_ls *ls, uint64_t seq) +{ + struct dlm_member *memb; + int error = 0; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + break; + } + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); + if (error) + break; + } + if (error) + log_rinfo(ls, "ping_members aborted %d last nodeid %d", + error, ls->ls_recover_nodeid); + return error; +} + +static void dlm_lsop_recover_prep(struct dlm_ls *ls) +{ + if (!ls->ls_ops || !ls->ls_ops->recover_prep) + return; + ls->ls_ops->recover_prep(ls->ls_ops_arg); +} + +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +{ + struct dlm_slot slot; + uint32_t seq; + int error; + + if (!ls->ls_ops || !ls->ls_ops->recover_slot) + return; + + /* if there is no comms connection with this node + or the present comms connection is newer + than the one when this member was added, then + we consider the node to have failed (versus + being removed due to dlm_release_lockspace) */ + + 
error = dlm_comm_seq(memb->nodeid, &seq); + + if (!error && seq == memb->comm_seq) + return; + + slot.nodeid = memb->nodeid; + slot.slot = memb->slot; + + ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); +} + +void dlm_lsop_recover_done(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int i, num; + + if (!ls->ls_ops || !ls->ls_ops->recover_done) + return; + + num = ls->ls_num_nodes; + slots = kcalloc(num, sizeof(*slots), GFP_KERNEL); + if (!slots) + return; + + i = 0; + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (i == num) { + log_error(ls, "dlm_lsop_recover_done bad num %d", num); + goto out; + } + slots[i].nodeid = memb->nodeid; + slots[i].slot = memb->slot; + i++; + } + + ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, + ls->ls_slot, ls->ls_generation); + out: + kfree(slots); +} + +static struct dlm_config_node *find_config_node(struct dlm_recover *rv, + int nodeid) +{ + int i; + + for (i = 0; i < rv->nodes_count; i++) { + if (rv->nodes[i].nodeid == nodeid) + return &rv->nodes[i]; + } + return NULL; +} + +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_member *memb, *safe; + struct dlm_config_node *node; + int i, error, neg = 0, low = -1; + + /* previously removed members that we've not finished removing need to + * count as a negative change so the "neg" recovery steps will happen + * + * This functionality must report all member changes to lsops or + * midcomms layer and must never return before. + */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_rinfo(ls, "prev removed member %d", memb->nodeid); + neg++; + } + + /* move departed members from ls_nodes to ls_nodes_gone */ + + list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { + node = find_config_node(rv, memb->nodeid); + if (node && !node->new) + continue; + + if (!node) { + log_rinfo(ls, "remove member %d", memb->nodeid); + } else { + /* removed and re-added */ + log_rinfo(ls, "remove member %d comm_seq %u %u", + memb->nodeid, memb->comm_seq, node->comm_seq); + } + + neg++; + list_move(&memb->list, &ls->ls_nodes_gone); + remove_remote_member(memb->nodeid); + ls->ls_num_nodes--; + dlm_lsop_recover_slot(ls, memb); + } + + /* add new members to ls_nodes */ + + for (i = 0; i < rv->nodes_count; i++) { + node = &rv->nodes[i]; + if (dlm_is_member(ls, node->nodeid)) + continue; + error = dlm_add_member(ls, node); + if (error) + return error; + + log_rinfo(ls, "add member %d", node->nodeid); + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (low == -1 || memb->nodeid < low) + low = memb->nodeid; + } + ls->ls_low_nodeid = low; + + make_member_array(ls); + *neg_out = neg; + + error = ping_members(ls, rv->seq); + log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); + return error; +} + +/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before + dlm_ls_start() is called on any of them to start the new recovery. */ + +int dlm_ls_stop(struct dlm_ls *ls) +{ + int new; + + /* + * Prevent dlm_recv from being in the middle of something when we do + * the stop. This includes ensuring dlm_recv isn't processing a + * recovery message (rcom), while dlm_recoverd is aborting and + * resetting things from an in-progress recovery. i.e. we want + * dlm_recoverd to abort its recovery without worrying about dlm_recv + * processing an rcom at the same time. Stopping dlm_recv also makes + * it easy for dlm_receive_message() to check locking stopped and add a + * message to the requestqueue without races. 
+ */ + + down_write(&ls->ls_recv_active); + + /* + * Abort any recovery that's in progress (see RECOVER_STOP, + * dlm_recovery_stopped()) and tell any other threads running in the + * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). + */ + + spin_lock(&ls->ls_recover_lock); + set_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); + ls->ls_recover_seq++; + spin_unlock(&ls->ls_recover_lock); + + /* + * Let dlm_recv run again, now any normal messages will be saved on the + * requestqueue for later. + */ + + up_write(&ls->ls_recv_active); + + /* + * This in_recovery lock does two things: + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truly stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) { + set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + } + + /* + * The recoverd suspend/resume makes sure that dlm_recoverd (if + * running) has noticed RECOVER_STOP above and quit processing the + * previous recovery. + */ + + dlm_recoverd_suspend(ls); + + spin_lock(&ls->ls_recover_lock); + kfree(ls->ls_slots); + ls->ls_slots = NULL; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_recover_status = 0; + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_resume(ls); + + if (!ls->ls_recover_begin) + ls->ls_recover_begin = jiffies; + + /* call recover_prep ops only once and not multiple times + * for each possible dlm_ls_stop() when recovery is already + * stopped. + * + * If we successful was able to clear LSFL_RUNNING bit and + * it was set we know it is the first dlm_ls_stop() call. + */ + if (new) + dlm_lsop_recover_prep(ls); + + return 0; +} + +int dlm_ls_start(struct dlm_ls *ls) +{ + struct dlm_recover *rv, *rv_old; + struct dlm_config_node *nodes = NULL; + int error, count; + + rv = kzalloc(sizeof(*rv), GFP_NOFS); + if (!rv) + return -ENOMEM; + + error = dlm_config_nodes(ls->ls_name, &nodes, &count); + if (error < 0) + goto fail_rv; + + spin_lock(&ls->ls_recover_lock); + + /* the lockspace needs to be stopped before it can be started */ + + if (!dlm_locking_stopped(ls)) { + spin_unlock(&ls->ls_recover_lock); + log_error(ls, "start ignored: lockspace running"); + error = -EINVAL; + goto fail; + } + + rv->nodes = nodes; + rv->nodes_count = count; + rv->seq = ++ls->ls_recover_seq; + rv_old = ls->ls_recover_args; + ls->ls_recover_args = rv; + spin_unlock(&ls->ls_recover_lock); + + if (rv_old) { + log_error(ls, "unused recovery %llx %d", + (unsigned long long)rv_old->seq, rv_old->nodes_count); + kfree(rv_old->nodes); + kfree(rv_old); + } + + set_bit(LSFL_RECOVER_WORK, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); + return 0; + + fail: + kfree(nodes); + fail_rv: + kfree(rv); + return error; +} + diff --git a/fs/dlm/member.h b/fs/dlm/member.h new file mode 100644 index 0000000000..f61cfde463 --- /dev/null +++ b/fs/dlm/member.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMBER_DOT_H__ +#define __MEMBER_DOT_H__ + +int dlm_ls_stop(struct dlm_ls *ls); +int dlm_ls_start(struct dlm_ls *ls); +void dlm_clear_members(struct dlm_ls *ls); +void dlm_clear_members_gone(struct dlm_ls *ls); +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); +int dlm_is_removed(struct dlm_ls *ls, int nodeid); +int dlm_is_member(struct dlm_ls *ls, int nodeid); +int dlm_slots_version(const struct dlm_header *h); +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb); +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_slots_copy_in(struct dlm_ls *ls); +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out); +void dlm_lsop_recover_done(struct dlm_ls *ls); + +#endif /* __MEMBER_DOT_H__ */ + diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c new file mode 100644 index 0000000000..64f212a066 --- /dev/null +++ b/fs/dlm/memory.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "midcomms.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "ast.h" + +static struct kmem_cache *writequeue_cache; +static struct kmem_cache *mhandle_cache; +static struct kmem_cache *msg_cache; +static struct kmem_cache *lkb_cache; +static struct kmem_cache *rsb_cache; +static struct kmem_cache *cb_cache; + + +int __init dlm_memory_init(void) +{ + writequeue_cache = dlm_lowcomms_writequeue_cache_create(); + if (!writequeue_cache) + goto out; + + mhandle_cache = dlm_midcomms_cache_create(); + if (!mhandle_cache) + goto mhandle; + + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL); + if (!lkb_cache) + goto lkb; + + msg_cache = dlm_lowcomms_msg_cache_create(); + if (!msg_cache) + goto msg; + + rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), + __alignof__(struct dlm_rsb), 0, NULL); + if (!rsb_cache) + goto rsb; + + cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), + __alignof__(struct dlm_callback), 0, + NULL); + if (!cb_cache) + goto cb; + + return 0; + +cb: + kmem_cache_destroy(rsb_cache); +rsb: + kmem_cache_destroy(msg_cache); +msg: + kmem_cache_destroy(lkb_cache); +lkb: + kmem_cache_destroy(mhandle_cache); +mhandle: + kmem_cache_destroy(writequeue_cache); +out: + return -ENOMEM; +} + +void dlm_memory_exit(void) +{ + kmem_cache_destroy(writequeue_cache); + kmem_cache_destroy(mhandle_cache); + kmem_cache_destroy(msg_cache); + kmem_cache_destroy(lkb_cache); + kmem_cache_destroy(rsb_cache); + kmem_cache_destroy(cb_cache); +} + +char *dlm_allocate_lvb(struct dlm_ls *ls) +{ + char *p; + + p = kzalloc(ls->ls_lvblen, GFP_NOFS); + return p; +} + +void dlm_free_lvb(char *p) +{ + kfree(p); +} + +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + r = 
kmem_cache_zalloc(rsb_cache, GFP_NOFS); + return r; +} + +void dlm_free_rsb(struct dlm_rsb *r) +{ + if (r->res_lvbptr) + dlm_free_lvb(r->res_lvbptr); + kmem_cache_free(rsb_cache, r); +} + +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); + return lkb; +} + +void dlm_free_lkb(struct dlm_lkb *lkb) +{ + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { + struct dlm_user_args *ua; + ua = lkb->lkb_ua; + if (ua) { + kfree(ua->lksb.sb_lvbptr); + kfree(ua); + } + } + + /* drop references if they are set */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + + kmem_cache_free(lkb_cache, lkb); +} + +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation) +{ + return kmem_cache_alloc(mhandle_cache, allocation); +} + +void dlm_free_mhandle(struct dlm_mhandle *mhandle) +{ + kmem_cache_free(mhandle_cache, mhandle); +} + +struct writequeue_entry *dlm_allocate_writequeue(void) +{ + return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC); +} + +void dlm_free_writequeue(struct writequeue_entry *writequeue) +{ + kmem_cache_free(writequeue_cache, writequeue); +} + +struct dlm_msg *dlm_allocate_msg(gfp_t allocation) +{ + return kmem_cache_alloc(msg_cache, allocation); +} + +void dlm_free_msg(struct dlm_msg *msg) +{ + kmem_cache_free(msg_cache, msg); +} + +struct dlm_callback *dlm_allocate_cb(void) +{ + return kmem_cache_alloc(cb_cache, GFP_ATOMIC); +} + +void dlm_free_cb(struct dlm_callback *cb) +{ + kmem_cache_free(cb_cache, cb); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h new file mode 100644 index 0000000000..6b29563d24 --- /dev/null +++ b/fs/dlm/memory.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls); +void dlm_free_rsb(struct dlm_rsb *r); +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); +void dlm_free_lkb(struct dlm_lkb *l); +char *dlm_allocate_lvb(struct dlm_ls *ls); +void dlm_free_lvb(char *l); +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation); +void dlm_free_mhandle(struct dlm_mhandle *mhandle); +struct writequeue_entry *dlm_allocate_writequeue(void); +void dlm_free_writequeue(struct writequeue_entry *writequeue); +struct dlm_msg *dlm_allocate_msg(gfp_t allocation); +void dlm_free_msg(struct dlm_msg *msg); +struct dlm_callback *dlm_allocate_cb(void); +void dlm_free_cb(struct dlm_callback *cb); + +#endif /* __MEMORY_DOT_H__ */ + diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c new file mode 100644 index 0000000000..2247ebb61b --- /dev/null +++ b/fs/dlm/midcomms.c @@ -0,0 +1,1514 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2021 Red Hat, Inc. 
All rights reserved.
+**
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer. It provides an
+ * application-layer "reliable" delivery service on top of the lowcomms
+ * transport layer.
+ *
+ * How it works:
+ *
+ * Each node keeps track of all sent DLM messages in send_queue, tagged with a
+ * sequence number. The receiving side sends a DLM_ACK message back for every
+ * DLM message it receives. If a reconnect happens in lowcomms we send all
+ * unacknowledged dlm messages again; the receiving side drops any message it
+ * has already seen by comparing sequence numbers.
+ *
+ * How version detection works:
+ *
+ * Because dlm has pre-configured node addresses on every side, it is in its
+ * nature that both sides connect at start and begin to transmit dlm messages,
+ * which ends in a race. However, DLM_RCOM_NAMES, DLM_RCOM_STATUS and their
+ * replies are the first messages that are exchanged. For backwards
+ * compatibility these messages are not covered by the midcomms re-transmission
+ * layer; they have their own re-transmission handling in the dlm application
+ * layer. The version field of every node is set from these RCOM messages as
+ * soon as they arrive and the node isn't yet part of the nodes hash. There is
+ * also logic to detect a version mismatch if something weird is going on or
+ * the first message isn't an expected one.
+ *
+ * Termination:
+ *
+ * The midcomms layer does a 4-way handshake for termination at the DLM
+ * protocol level, like TCP does with its half-closed socket support. SCTP
+ * doesn't support half-closed sockets, so we do it at the DLM layer. A socket
+ * shutdown() can also be interrupted by e.g. a tcp reset itself. Additionally
+ * there is the othercon paradigm in lowcomms, which cannot easily be changed
+ * without breaking backwards compatibility. A node cannot send anything to
+ * another node once a DLM_FIN message has been sent; there is additional
+ * logic to print a warning if DLM tries to do so. There is state handling
+ * like RFC 793, but reduced to termination only. The "member removal event"
+ * means the cluster manager removed the node from its internal lists; at this
+ * point DLM does not send any message to the other node. There are two cases:
+ *
+ * 1. The cluster member was removed and we received a FIN
+ * OR
+ * 2. We received a FIN but the member was not removed yet
+ *
+ * One of these cases will do the CLOSE_WAIT to LAST_ACK change.
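+ *
+ * A rough sketch of the retransmit/ack behaviour described above
+ * (sequence numbers are illustrative only):
+ *
+ *   node A                                  node B
+ *   send seq=7  -------------------------->  delivered, seq_next = 8
+ *   send seq=8  ------x  (connection lost)
+ *   reconnect, resend unacked 7 and 8 ---->  7 < 8: duplicate, dropped
+ *                                            8 delivered, seq_next = 9
+ *   frees all mhandles    <----------------  DLM_ACK with seq 9
+ *   with seq before 9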
+ * + * + * +---------+ + * | CLOSED | + * +---------+ + * | add member/receive RCOM version + * | detection msg + * V + * +---------+ + * | ESTAB | + * +---------+ + * CLOSE | | rcv FIN + * ------- | | ------- + * +---------+ snd FIN / \ snd ACK +---------+ + * | FIN |<----------------- ------------------>| CLOSE | + * | WAIT-1 |------------------ | WAIT | + * +---------+ rcv FIN \ +---------+ + * | rcv ACK of FIN ------- | CLOSE | member + * | -------------- snd ACK | ------- | removal + * V x V snd FIN V event + * +---------+ +---------+ +---------+ + * |FINWAIT-2| | CLOSING | | LAST-ACK| + * +---------+ +---------+ +---------+ + * | rcv ACK of FIN | rcv ACK of FIN | + * | rcv FIN -------------- | -------------- | + * | ------- x V x V + * \ snd ACK +---------+ +---------+ + * ------------------------>| CLOSED | | CLOSED | + * +---------+ +---------+ + * + * NOTE: any state can interrupted by midcomms_close() and state will be + * switched to CLOSED in case of fencing. There exists also some timeout + * handling when we receive the version detection RCOM messages which is + * made by observation. + * + * Future improvements: + * + * There exists some known issues/improvements of the dlm handling. Some + * of them should be done in a next major dlm version bump which makes + * it incompatible with previous versions. + * + * Unaligned memory access: + * + * There exists cases when the dlm message buffer length is not aligned + * to 8 byte. However seems nobody detected any problem with it. This + * can be fixed in the next major version bump of dlm. + * + * Version detection: + * + * The version detection and how it's done is related to backwards + * compatibility. There exists better ways to make a better handling. + * However this should be changed in the next major version bump of dlm. + * + * Tail Size checking: + * + * There exists a message tail payload in e.g. DLM_MSG however we don't + * check it against the message length yet regarding to the receive buffer + * length. That need to be validated. + * + * Fencing bad nodes: + * + * At timeout places or weird sequence number behaviours we should send + * a fencing request to the cluster manager. + */ + +/* Debug switch to enable a 5 seconds sleep waiting of a termination. + * This can be useful to test fencing while termination is running. + * This requires a setup with only gfs2 as dlm user, so that the + * last umount will terminate the connection. + * + * However it became useful to test, while the 5 seconds block in umount + * just press the reset button. In a lot of dropping the termination + * process can could take several seconds. + */ +#define DLM_DEBUG_FENCE_TERMINATION 0 + +#include <trace/events/dlm.h> +#include <net/tcp.h> + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "util.h" +#include "midcomms.h" + +/* init value for sequence numbers for testing purpose only e.g. overflows */ +#define DLM_SEQ_INIT 0 +/* 5 seconds wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) +#define DLM_VERSION_NOT_SET 0 +#define DLM_SEND_ACK_BACK_MSG_THRESHOLD 32 +#define DLM_RECV_ACK_BACK_MSG_THRESHOLD (DLM_SEND_ACK_BACK_MSG_THRESHOLD * 8) + +struct midcomms_node { + int nodeid; + uint32_t version; + atomic_t seq_send; + atomic_t seq_next; + /* These queues are unbound because we cannot drop any message in dlm. 
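+ * (The entries are struct dlm_mhandle; they stay here until acked by the
+ * peer or flushed on reset, see dlm_receive_ack() and
+ * dlm_send_queue_flush().)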
+ * We could send a fence signal for a specific node to the cluster + * manager if queues hits some maximum value, however this handling + * not supported yet. + */ + struct list_head send_queue; + spinlock_t send_queue_lock; + atomic_t send_queue_cnt; +#define DLM_NODE_FLAG_CLOSE 1 +#define DLM_NODE_FLAG_STOP_TX 2 +#define DLM_NODE_FLAG_STOP_RX 3 + atomic_t ulp_delivered; + unsigned long flags; + wait_queue_head_t shutdown_wait; + + /* dlm tcp termination state */ +#define DLM_CLOSED 1 +#define DLM_ESTABLISHED 2 +#define DLM_FIN_WAIT1 3 +#define DLM_FIN_WAIT2 4 +#define DLM_CLOSE_WAIT 5 +#define DLM_LAST_ACK 6 +#define DLM_CLOSING 7 + int state; + spinlock_t state_lock; + + /* counts how many lockspaces are using this node + * this refcount is necessary to determine if the + * node wants to disconnect. + */ + int users; + + /* not protected by srcu, node_hash lifetime */ + void *debugfs; + + struct hlist_node hlist; + struct rcu_head rcu; +}; + +struct dlm_mhandle { + const union dlm_packet *inner_p; + struct midcomms_node *node; + struct dlm_opts *opts; + struct dlm_msg *msg; + bool committed; + uint32_t seq; + + void (*ack_rcv)(struct midcomms_node *node); + + /* get_mhandle/commit srcu idx exchange */ + int idx; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct hlist_head node_hash[CONN_HASH_SIZE]; +static DEFINE_SPINLOCK(nodes_lock); +DEFINE_STATIC_SRCU(nodes_srcu); + +/* This mutex prevents that midcomms_close() is running while + * stop() or remove(). As I experienced invalid memory access + * behaviours when DLM_DEBUG_FENCE_TERMINATION is enabled and + * resetting machines. I will end in some double deletion in nodes + * datastructure. + */ +static DEFINE_MUTEX(close_lock); + +struct kmem_cache *dlm_midcomms_cache_create(void) +{ + return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle), + 0, 0, NULL); +} + +static inline const char *dlm_state_str(int state) +{ + switch (state) { + case DLM_CLOSED: + return "CLOSED"; + case DLM_ESTABLISHED: + return "ESTABLISHED"; + case DLM_FIN_WAIT1: + return "FIN_WAIT1"; + case DLM_FIN_WAIT2: + return "FIN_WAIT2"; + case DLM_CLOSE_WAIT: + return "CLOSE_WAIT"; + case DLM_LAST_ACK: + return "LAST_ACK"; + case DLM_CLOSING: + return "CLOSING"; + default: + return "UNKNOWN"; + } +} + +const char *dlm_midcomms_state(struct midcomms_node *node) +{ + return dlm_state_str(node->state); +} + +unsigned long dlm_midcomms_flags(struct midcomms_node *node) +{ + return node->flags; +} + +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) +{ + return atomic_read(&node->send_queue_cnt); +} + +uint32_t dlm_midcomms_version(struct midcomms_node *node) +{ + return node->version; +} + +static struct midcomms_node *__find_node(int nodeid, int r) +{ + struct midcomms_node *node; + + hlist_for_each_entry_rcu(node, &node_hash[r], hlist) { + if (node->nodeid == nodeid) + return node; + } + + return NULL; +} + +static void dlm_mhandle_release(struct rcu_head *rcu) +{ + struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); + + dlm_lowcomms_put_msg(mh->msg); + dlm_free_mhandle(mh); +} + +static void dlm_mhandle_delete(struct midcomms_node *node, + struct dlm_mhandle *mh) +{ + list_del_rcu(&mh->list); + atomic_dec(&node->send_queue_cnt); + call_rcu(&mh->rcu, dlm_mhandle_release); +} + +static void dlm_send_queue_flush(struct midcomms_node *node) +{ + struct dlm_mhandle *mh; + + pr_debug("flush midcomms send queue of node %d\n", node->nodeid); + + rcu_read_lock(); + spin_lock_bh(&node->send_queue_lock); + 
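+ /* deletion needs send_queue_lock; concurrent lockless readers are
+  * still protected by the rcu_read_lock() taken above.
+  */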
list_for_each_entry_rcu(mh, &node->send_queue, list) { + dlm_mhandle_delete(node, mh); + } + spin_unlock_bh(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void midcomms_node_reset(struct midcomms_node *node) +{ + pr_debug("reset node %d\n", node->nodeid); + + atomic_set(&node->seq_next, DLM_SEQ_INIT); + atomic_set(&node->seq_send, DLM_SEQ_INIT); + atomic_set(&node->ulp_delivered, 0); + node->version = DLM_VERSION_NOT_SET; + node->flags = 0; + + dlm_send_queue_flush(node); + node->state = DLM_CLOSED; + wake_up(&node->shutdown_wait); +} + +static struct midcomms_node *nodeid2node(int nodeid) +{ + return __find_node(nodeid, nodeid_hash(nodeid)); +} + +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + int ret, idx, r = nodeid_hash(nodeid); + struct midcomms_node *node; + + ret = dlm_lowcomms_addr(nodeid, addr, len); + if (ret) + return ret; + + idx = srcu_read_lock(&nodes_srcu); + node = __find_node(nodeid, r); + if (node) { + srcu_read_unlock(&nodes_srcu, idx); + return 0; + } + srcu_read_unlock(&nodes_srcu, idx); + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) + return -ENOMEM; + + node->nodeid = nodeid; + spin_lock_init(&node->state_lock); + spin_lock_init(&node->send_queue_lock); + atomic_set(&node->send_queue_cnt, 0); + INIT_LIST_HEAD(&node->send_queue); + init_waitqueue_head(&node->shutdown_wait); + node->users = 0; + midcomms_node_reset(node); + + spin_lock(&nodes_lock); + hlist_add_head_rcu(&node->hlist, &node_hash[r]); + spin_unlock(&nodes_lock); + + node->debugfs = dlm_create_debug_comms_file(nodeid, node); + return 0; +} + +static int dlm_send_ack(int nodeid, uint32_t seq) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_msg *msg; + char *ppc; + + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, + NULL, NULL); + if (!msg) + return -ENOMEM; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); + m_header->h_length = cpu_to_le16(mb_len); + m_header->h_cmd = DLM_ACK; + m_header->u.h_seq = cpu_to_le32(seq); + + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); + + return 0; +} + +static void dlm_send_ack_threshold(struct midcomms_node *node, + uint32_t threshold) +{ + uint32_t oval, nval; + bool send_ack; + + /* let only send one user trigger threshold to send ack back */ + do { + oval = atomic_read(&node->ulp_delivered); + send_ack = (oval > threshold); + /* abort if threshold is not reached */ + if (!send_ack) + break; + + nval = 0; + /* try to reset ulp_delivered counter */ + } while (atomic_cmpxchg(&node->ulp_delivered, oval, nval) != oval); + + if (send_ack) + dlm_send_ack(node->nodeid, atomic_read(&node->seq_next)); +} + +static int dlm_send_fin(struct midcomms_node *node, + void (*ack_rcv)(struct midcomms_node *node)) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_mhandle *mh; + char *ppc; + + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); + if (!mh) + return -ENOMEM; + + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); + mh->ack_rcv = ack_rcv; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); + m_header->h_length = cpu_to_le16(mb_len); + m_header->h_cmd = DLM_FIN; + + pr_debug("sending fin msg to node %d\n", node->nodeid); + dlm_midcomms_commit_mhandle(mh, NULL, 0); + + 
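+ /* like any other committed message, the FIN stays on send_queue and
+  * is resent on reconnect until the peer acks it; ack_rcv() then
+  * drives the termination state machine.
+  */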
return 0; +} + +static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) +{ + struct dlm_mhandle *mh; + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + if (mh->ack_rcv) + mh->ack_rcv(node); + } else { + /* send queue should be ordered */ + break; + } + } + + spin_lock_bh(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + dlm_mhandle_delete(node, mh); + } else { + /* send queue should be ordered */ + break; + } + } + spin_unlock_bh(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive passive fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_LAST_ACK: + /* DLM_CLOSED */ + midcomms_node_reset(node); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d", + __func__, node->state); + WARN_ON_ONCE(1); + return; + } + spin_unlock(&node->state_lock); +} + +static void dlm_receive_buffer_3_2_trace(uint32_t seq, + const union dlm_packet *p) +{ + switch (p->header.h_cmd) { + case DLM_MSG: + trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); + break; + case DLM_RCOM: + trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); + break; + default: + break; + } +} + +static void dlm_midcomms_receive_buffer(const union dlm_packet *p, + struct midcomms_node *node, + uint32_t seq) +{ + bool is_expected_seq; + uint32_t oval, nval; + + do { + oval = atomic_read(&node->seq_next); + is_expected_seq = (oval == seq); + if (!is_expected_seq) + break; + + nval = oval + 1; + } while (atomic_cmpxchg(&node->seq_next, oval, nval) != oval); + + if (is_expected_seq) { + switch (p->header.h_cmd) { + case DLM_FIN: + spin_lock(&node->state_lock); + pr_debug("receive fin msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_ESTABLISHED: + dlm_send_ack(node->nodeid, nval); + + /* passive shutdown DLM_LAST_ACK case 1 + * additional we check if the node is used by + * cluster manager events at all. 
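+ * (node->users == 0 means no lockspace has this node as a member
+ * anymore, so we answer the FIN with our own FIN right away instead
+ * of waiting in CLOSE_WAIT for the member removal event.)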
+ */ + if (node->users == 0) { + node->state = DLM_LAST_ACK; + pr_debug("switch node %d to state %s case 1\n", + node->nodeid, dlm_state_str(node->state)); + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + } else { + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + break; + case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, nval); + node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, nval); + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_LAST_ACK: + /* probably remove_member caught it, do nothing */ + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d", + __func__, node->state); + WARN_ON_ONCE(1); + return; + } + spin_unlock(&node->state_lock); + break; + default: + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer_3_2_trace(seq, p); + dlm_receive_buffer(p, node->nodeid); + atomic_inc(&node->ulp_delivered); + /* unlikely case to send ack back when we don't transmit */ + dlm_send_ack_threshold(node, DLM_RECV_ACK_BACK_MSG_THRESHOLD); + break; + } + } else { + /* retry to ack message which we already have by sending back + * current node->seq_next number as ack. + */ + if (seq < oval) + dlm_send_ack(node->nodeid, oval); + + log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", + seq, oval, node->nodeid); + } +} + +static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, + int nodeid) +{ + int len = msglen; + + /* we only trust outer header msglen because + * it's checked against receive buffer length. 
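+ * The checks below strip the dlm_opts header and the option payload
+ * from the outer length and verify that what remains is large enough
+ * for the inner message type announced in o_nextcmd.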
+ */ + if (len < sizeof(struct dlm_opts)) + return -1; + len -= sizeof(struct dlm_opts); + + if (len < le16_to_cpu(p->opts.o_optlen)) + return -1; + len -= le16_to_cpu(p->opts.o_optlen); + + switch (p->opts.o_nextcmd) { + case DLM_FIN: + if (len < sizeof(struct dlm_header)) { + log_print("fin too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + case DLM_MSG: + if (len < sizeof(struct dlm_message)) { + log_print("msg too small: %d, will skip this message from node %d", + msglen, nodeid); + return -1; + } + + break; + case DLM_RCOM: + if (len < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + default: + log_print("unsupported o_nextcmd received: %u, will skip this message from node %d", + p->opts.o_nextcmd, nodeid); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + uint32_t seq; + int ret, idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) + goto out; + + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + + spin_lock(&node->state_lock); + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + break; + } + spin_unlock(&node->state_lock); + + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); + goto out; + } + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* these rcom message we use to determine version. + * they have their own retransmission handling and + * are the first messages of dlm. + * + * length already checked. 
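+ *
+ * For reference, everything else in a 3.2 stream is either a DLM_ACK
+ * or a DLM_OPTS wrapper laid out as:
+ *
+ *   struct dlm_opts header (h_cmd = DLM_OPTS, u.h_seq = sequence number)
+ *   o_optlen bytes of options
+ *   inner packet (DLM_MSG, DLM_RCOM or DLM_FIN, selected by o_nextcmd)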
+ */ + switch (p->rcom.rc_type) { + case cpu_to_le32(DLM_RCOM_NAMES): + fallthrough; + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): + fallthrough; + case cpu_to_le32(DLM_RCOM_STATUS): + fallthrough; + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): + break; + default: + log_print("unsupported rcom type received: %u, will skip this message from node %d", + le32_to_cpu(p->rcom.rc_type), nodeid); + goto out; + } + + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, nodeid); + break; + case DLM_OPTS: + seq = le32_to_cpu(p->header.u.h_seq); + + ret = dlm_opts_check_msglen(p, msglen, nodeid); + if (ret < 0) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + + le16_to_cpu(p->opts.o_optlen)); + + /* recheck inner msglen just if it's not garbage */ + msglen = le16_to_cpu(p->header.h_length); + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("inner rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("inner msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_FIN: + if (msglen < sizeof(struct dlm_header)) { + log_print("inner fin too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + default: + log_print("unsupported inner h_cmd received: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + dlm_midcomms_receive_buffer(p, node, seq); + break; + case DLM_ACK: + seq = le32_to_cpu(p->header.u.h_seq); + dlm_receive_ack(node, seq); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + break; + } + +out: + srcu_read_unlock(&nodes_srcu, idx); +} + +static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_1; + wake_up(&node->shutdown_wait); + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, + node->nodeid); + break; + case DLM_VERSION_3_1: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_1, node->nodeid, node->version); + srcu_read_unlock(&nodes_srcu, idx); + return; + } + srcu_read_unlock(&nodes_srcu, idx); + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* length already checked */ + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + return; + } + + dlm_receive_buffer(p, nodeid); +} + +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + /* no message should be more than 
DLM_MAX_SOCKET_BUFSIZE or + * less than dlm_header size. + * + * Some messages does not have a 8 byte length boundary yet + * which can occur in a unaligned memory access of some dlm + * messages. However this problem need to be fixed at the + * sending side, for now it seems nobody run into architecture + * related issues yet but it slows down some processing. + * Fixing this issue should be scheduled in future by doing + * the next major version bump. + */ + msglen = le16_to_cpu(hd->h_length); + if (msglen > DLM_MAX_SOCKET_BUFSIZE || + msglen < sizeof(struct dlm_header)) { + log_print("received invalid length header: %u from node %d, will abort message parsing", + msglen, nodeid); + return -EBADMSG; + } + + /* caller will take care that leftover + * will be parsed next call with more data + */ + if (msglen > len) + break; + + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + */ +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + msglen = le16_to_cpu(hd->h_length); + if (msglen > len) + break; + + switch (hd->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): + dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid); + break; + case cpu_to_le32(DLM_VERSION_3_2): + dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid); + break; + default: + log_print("received invalid version header: %u from node %d, will skip this message", + le32_to_cpu(hd->h_version), nodeid); + break; + } + + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +void dlm_midcomms_unack_msg_resend(int nodeid) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + int idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we don't support to retransmit on failure */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (!mh->committed) + continue; + + ret = dlm_lowcomms_resend_msg(mh->msg); + if (!ret) + log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d", + mh->seq, node->nodeid); + } + rcu_read_unlock(); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, + uint32_t seq) +{ + opts->o_header.h_cmd = DLM_OPTS; + opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len); + opts->o_header.u.h_seq = cpu_to_le32(seq); +} + +static void midcomms_new_msg_cb(void *data) +{ + struct dlm_mhandle *mh = data; + + atomic_inc(&mh->node->send_queue_cnt); + + spin_lock_bh(&mh->node->send_queue_lock); + list_add_tail_rcu(&mh->list, &mh->node->send_queue); + spin_unlock_bh(&mh->node->send_queue_lock); + + mh->seq = atomic_fetch_inc(&mh->node->seq_send); +} + +static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, + int len, gfp_t allocation, char **ppc) +{ + struct dlm_opts *opts; + struct dlm_msg *msg; + + msg = dlm_lowcomms_new_msg(nodeid, 
len + DLM_MIDCOMMS_OPT_LEN, + allocation, ppc, midcomms_new_msg_cb, mh); + if (!msg) + return NULL; + + opts = (struct dlm_opts *)*ppc; + mh->opts = opts; + + /* add possible options here */ + dlm_fill_opts_header(opts, len, mh->seq); + + *ppc += sizeof(*opts); + mh->inner_p = (const union dlm_packet *)*ppc; + return msg; +} + +/* avoid false positive for nodes_srcu, unlock happens in + * dlm_midcomms_commit_mhandle which is a must call if success + */ +#ifndef __CHECKER__ +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + struct dlm_msg *msg; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) + goto err; + + /* this is a bug, however we going on and hope it will be resolved */ + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + + mh = dlm_allocate_mhandle(allocation); + if (!mh) + goto err; + + mh->committed = false; + mh->ack_rcv = NULL; + mh->idx = idx; + mh->node = node; + + switch (node->version) { + case DLM_VERSION_3_1: + msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, + NULL, NULL); + if (!msg) { + dlm_free_mhandle(mh); + goto err; + } + + break; + case DLM_VERSION_3_2: + /* send ack back if necessary */ + dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); + + msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, + ppc); + if (!msg) { + dlm_free_mhandle(mh); + goto err; + } + break; + default: + dlm_free_mhandle(mh); + WARN_ON_ONCE(1); + goto err; + } + + mh->msg = msg; + + /* keep in mind that is a must to call + * dlm_midcomms_commit_msg() which releases + * nodes_srcu using mh->idx which is assumed + * here that the application will call it. + */ + return mh; + +err: + srcu_read_unlock(&nodes_srcu, idx); + return NULL; +} +#endif + +static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, + const void *name, int namelen) +{ + switch (mh->inner_p->header.h_cmd) { + case DLM_MSG: + trace_dlm_send_message(mh->node->nodeid, mh->seq, + &mh->inner_p->message, + name, namelen); + break; + case DLM_RCOM: + trace_dlm_send_rcom(mh->node->nodeid, mh->seq, + &mh->inner_p->rcom); + break; + default: + /* nothing to trace */ + break; + } +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, + const void *name, int namelen) +{ + /* nexthdr chain for fast lookup */ + mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; + mh->committed = true; + dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); + dlm_lowcomms_commit_msg(mh->msg); +} + +/* avoid false positive for nodes_srcu, lock was happen in + * dlm_midcomms_get_mhandle + */ +#ifndef __CHECKER__ +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, + const void *name, int namelen) +{ + + switch (mh->node->version) { + case DLM_VERSION_3_1: + srcu_read_unlock(&nodes_srcu, mh->idx); + + dlm_lowcomms_commit_msg(mh->msg); + dlm_lowcomms_put_msg(mh->msg); + /* mh is not part of rcu list in this case */ + dlm_free_mhandle(mh); + break; + case DLM_VERSION_3_2: + /* held rcu read lock here, because we sending the + * dlm message out, when we do that we could receive + * an ack back which releases the mhandle and we + * get a use after free. 
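+ * (the mhandle is freed via call_rcu(), so the rcu_read_lock() below
+ * keeps it alive until dlm_midcomms_commit_msg_3_2() has finished
+ * with it.)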
+ */ + rcu_read_lock(); + dlm_midcomms_commit_msg_3_2(mh, name, namelen); + srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); + break; + default: + srcu_read_unlock(&nodes_srcu, mh->idx); + WARN_ON_ONCE(1); + break; + } +} +#endif + +int dlm_midcomms_start(void) +{ + return dlm_lowcomms_start(); +} + +void dlm_midcomms_stop(void) +{ + dlm_lowcomms_stop(); +} + +void dlm_midcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&node_hash[i]); + + dlm_lowcomms_init(); +} + +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); + kfree(node); +} + +void dlm_midcomms_exit(void) +{ + struct midcomms_node *node; + int i, idx; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + + dlm_lowcomms_exit(); +} + +static void dlm_act_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive active fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_FIN_WAIT1: + node->state = DLM_FIN_WAIT2; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSING: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d", + __func__, node->state); + WARN_ON_ONCE(1); + return; + } + spin_unlock(&node->state_lock); +} + +void dlm_midcomms_add_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + if (!node->users) { + pr_debug("receive add member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. 
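+ * (e.g. a passive shutdown that got stuck in CLOSE_WAIT or LAST_ACK
+ * before the node was added back as a member.)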
+ */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + } + + node->users++; + pr_debug("node %d users inc count %d\n", nodeid, node->users); + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); +} + +void dlm_midcomms_remove_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + /* in case of dlm_midcomms_close() removes node */ + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + /* case of dlm_midcomms_addr() created node but + * was not added before because dlm_midcomms_close() + * removed the node + */ + if (!node->users) { + spin_unlock(&node->state_lock); + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + node->users--; + pr_debug("node %d users dec count %d\n", nodeid, node->users); + + /* hitting users count to zero means the + * other side is running dlm_midcomms_stop() + * we meet us to have a clean disconnect. + */ + if (node->users == 0) { + pr_debug("receive remove member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSE_WAIT: + /* passive shutdown DLM_LAST_ACK case 2 */ + node->state = DLM_LAST_ACK; + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; + case DLM_LAST_ACK: + /* probably receive fin caught it, do nothing */ + break; + case DLM_CLOSED: + /* already gone, do nothing */ + break; + default: + log_print("%s: unexpected state: %d", + __func__, node->state); + break; + } + } + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); +} + +void dlm_midcomms_version_wait(void) +{ + struct midcomms_node *node; + int i, idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + ret = wait_event_timeout(node->shutdown_wait, + node->version != DLM_VERSION_NOT_SET || + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + pr_debug("version wait timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + } + srcu_read_unlock(&nodes_srcu, idx); +} + +static void midcomms_shutdown(struct midcomms_node *node) +{ + int ret; + + /* old protocol, we don't wait for pending operations */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + return; + } + + spin_lock(&node->state_lock); + pr_debug("receive active shutdown for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_FIN_WAIT1; + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); + break; + case DLM_CLOSED: + /* we have what we want */ + break; + default: + /* busy to enter DLM_FIN_WAIT1, wait until passive + * done in shutdown_wait to enter DLM_CLOSED. 
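+ * Whatever state we started in, the wait_event_timeout() below waits
+ * up to DLM_SHUTDOWN_TIMEOUT for DLM_CLOSED, or for
+ * dlm_midcomms_close() to set DLM_NODE_FLAG_CLOSE.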
+ */ + break; + } + spin_unlock(&node->state_lock); + + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); + + /* wait for other side dlm + fin */ + ret = wait_event_timeout(node->shutdown_wait, + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret) + pr_debug("active shutdown timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + else + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); +} + +void dlm_midcomms_shutdown(void) +{ + struct midcomms_node *node; + int i, idx; + + mutex_lock(&close_lock); + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_shutdown(node); + } + } + + dlm_lowcomms_shutdown(); + + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_node_reset(node); + } + } + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); +} + +int dlm_midcomms_close(int nodeid) +{ + struct midcomms_node *node; + int idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + /* Abort pending close/remove operation */ + node = nodeid2node(nodeid); + if (node) { + /* let shutdown waiters leave */ + set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); + wake_up(&node->shutdown_wait); + } + srcu_read_unlock(&nodes_srcu, idx); + + synchronize_srcu(&nodes_srcu); + + mutex_lock(&close_lock); + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + return dlm_lowcomms_close(nodeid); + } + + ret = dlm_lowcomms_close(nodeid); + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + srcu_read_unlock(&nodes_srcu, idx); + + /* wait that all readers left until flush send queue */ + synchronize_srcu(&nodes_srcu); + + /* drop all pending dlm messages, this is fine as + * this function get called when the node is fenced + */ + dlm_send_queue_flush(node); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + mutex_unlock(&close_lock); + + return ret; +} + +/* debug functionality to send raw dlm msg from user space */ +struct dlm_rawmsg_data { + struct midcomms_node *node; + void *buf; +}; + +static void midcomms_new_rawmsg_cb(void *data) +{ + struct dlm_rawmsg_data *rd = data; + struct dlm_header *h = rd->buf; + + switch (h->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): + break; + default: + switch (h->h_cmd) { + case DLM_OPTS: + if (!h->u.h_seq) + h->u.h_seq = cpu_to_le32(atomic_fetch_inc(&rd->node->seq_send)); + break; + default: + break; + } + break; + } +} + +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen) +{ + struct dlm_rawmsg_data rd; + struct dlm_msg *msg; + char *msgbuf; + + rd.node = node; + rd.buf = buf; + + msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS, + &msgbuf, midcomms_new_rawmsg_cb, &rd); + if (!msg) + return -ENOMEM; + + memcpy(msgbuf, buf, buflen); + dlm_lowcomms_commit_msg(msg); + return 0; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h new file mode 100644 index 0000000000..e7246fb3ef --- /dev/null +++ b/fs/dlm/midcomms.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 
Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +struct midcomms_node; + +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, + int namelen); +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); +void dlm_midcomms_version_wait(void); +int dlm_midcomms_close(int nodeid); +int dlm_midcomms_start(void); +void dlm_midcomms_stop(void); +void dlm_midcomms_init(void); +void dlm_midcomms_exit(void); +void dlm_midcomms_shutdown(void); +void dlm_midcomms_add_member(int nodeid); +void dlm_midcomms_remove_member(int nodeid); +void dlm_midcomms_unack_msg_resend(int nodeid); +const char *dlm_midcomms_state(struct midcomms_node *node); +unsigned long dlm_midcomms_flags(struct midcomms_node *node); +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); +uint32_t dlm_midcomms_version(struct midcomms_node *node); +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen); +struct kmem_cache *dlm_midcomms_cache_create(void); + +#endif /* __MIDCOMMS_DOT_H__ */ + diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c new file mode 100644 index 0000000000..e6b4c1a214 --- /dev/null +++ b/fs/dlm/plock.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/fs.h> +#include <linux/filelock.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> +#include <linux/slab.h> + +#include <trace/events/dlm.h> + +#include "dlm_internal.h" +#include "lockspace.h" + +static DEFINE_SPINLOCK(ops_lock); +static LIST_HEAD(send_list); +static LIST_HEAD(recv_list); +static DECLARE_WAIT_QUEUE_HEAD(send_wq); +static DECLARE_WAIT_QUEUE_HEAD(recv_wq); + +struct plock_async_data { + void *fl; + void *file; + struct file_lock flc; + int (*callback)(struct file_lock *fl, int result); +}; + +struct plock_op { + struct list_head list; + int done; + struct dlm_plock_info info; + /* if set indicates async handling */ + struct plock_async_data *data; +}; + +static inline void set_version(struct dlm_plock_info *info) +{ + info->version[0] = DLM_PLOCK_VERSION_MAJOR; + info->version[1] = DLM_PLOCK_VERSION_MINOR; + info->version[2] = DLM_PLOCK_VERSION_PATCH; +} + +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) +{ + struct plock_op *op = NULL, *iter; + + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info->fsid && + iter->info.number == info->number && + iter->info.owner == info->owner && + iter->info.pid == info->pid && + iter->info.start == info->start && + iter->info.end == info->end && + iter->info.ex == info->ex && + iter->info.wait) { + op = iter; + break; + } + } + + return op; +} + +static int check_version(struct dlm_plock_info *info) +{ + if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (DLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_print("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + DLM_PLOCK_VERSION_MAJOR, + DLM_PLOCK_VERSION_MINOR, + DLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void dlm_release_plock_op(struct plock_op *op) +{ + kfree(op->data); + kfree(op); +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +static int do_lock_cancel(const struct dlm_plock_info *orig_info) +{ + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return -ENOMEM; + + op->info = *orig_info; + op->info.optype = DLM_PLOCK_OP_CANCEL; + op->info.wait = 0; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + rv = op->info.rv; + + dlm_release_plock_op(op); + return rv; +} + +int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + int cmd, struct file_lock *fl) +{ + struct plock_async_data *op_data; + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + op->info.optype = DLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + /* async handling */ + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + dlm_release_plock_op(op); + rv = -ENOMEM; + goto out; + } + + /* fl_owner is lockd which doesn't distinguish + processes on the nfs client */ + op->info.owner = (__u64) fl->fl_pid; + op_data->callback = 
fl->fl_lmops->lm_grant; + locks_init_lock(&op_data->flc); + locks_copy_lock(&op_data->flc, fl); + op_data->fl = fl; + op_data->file = file; + + op->data = op_data; + + send_op(op); + rv = FILE_LOCK_DEFERRED; + goto out; + } else { + op->info.owner = (__u64)(long) fl->fl_owner; + } + + send_op(op); + + if (op->info.wait) { + rv = wait_event_interruptible(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + spin_lock(&ops_lock); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + spin_unlock(&ops_lock); + + rv = do_lock_cancel(&op->info); + switch (rv) { + case 0: + /* waiter was deleted in user space, answer will never come + * remove original request. The original request must be + * on recv_list because the answer of do_lock_cancel() + * synchronized it. + */ + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + rv = -EINTR; + break; + case -ENOENT: + /* cancellation wasn't successful but op should be done */ + fallthrough; + default: + /* internal error doing cancel we need to wait */ + goto wait; + } + + log_debug(ls, "%s: wait interrupted %x %llx pid %d", + __func__, ls->ls_global_id, + (unsigned long long)number, op->info.pid); + dlm_release_plock_op(op); + goto out; + } + } else { +wait: + wait_event(recv_wq, (op->done != 0)); + } + +do_lock_wait: + + WARN_ON(!list_empty(&op->list)); + + rv = op->info.rv; + + if (!rv) { + if (locks_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_lock: vfs lock error %llx", + (unsigned long long)number); + } + + dlm_release_plock_op(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_lock); + +/* Returns failure iff a successful lock operation should be canceled */ +static int dlm_plock_callback(struct plock_op *op) +{ + struct plock_async_data *op_data = op->data; + struct file *file; + struct file_lock *fl; + struct file_lock *flc; + int (*notify)(struct file_lock *fl, int result) = NULL; + int rv = 0; + + WARN_ON(!list_empty(&op->list)); + + /* check if the following 2 are still valid or make a copy */ + file = op_data->file; + flc = &op_data->flc; + fl = op_data->fl; + notify = op_data->callback; + + if (op->info.rv) { + notify(fl, op->info.rv); + goto out; + } + + /* got fs lock; bookkeep locally as well: */ + flc->fl_flags &= ~FL_SLEEP; + if (posix_lock_file(file, flc, NULL)) { + /* + * This can only happen in the case of kmalloc() failure. + * The filesystem's own lock is the authoritative lock, + * so a failure to get the lock locally is not a disaster. + * As long as the fs cannot reliably cancel locks (especially + * in a low-memory situation), we're better off ignoring + * this failure than trying to recover. 
+ */ + log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p", + (unsigned long long)op->info.number, file, fl); + } + + rv = notify(fl, 0); + if (rv) { + /* XXX: We need to cancel the fs lock here: */ + log_print("%s: lock granted after lock request failed; dangling lock!", + __func__); + goto out; + } + +out: + dlm_release_plock_op(op); + return rv; +} + +int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + unsigned char fl_flags = fl->fl_flags; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + /* cause the vfs unlock to return ENOENT if lock is not found */ + fl->fl_flags |= FL_EXISTS; + + rv = locks_lock_file_wait(file, fl); + if (rv == -ENOENT) { + rv = 0; + goto out_free; + } + if (rv < 0) { + log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx", + rv, (unsigned long long)number); + } + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + WARN_ON(!list_empty(&op->list)); + + rv = op->info.rv; + + if (rv == -ENOENT) + rv = 0; + +out_free: + dlm_release_plock_op(op); +out: + dlm_put_lockspace(ls); + fl->fl_flags = fl_flags; + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_unlock); + +/* + * NOTE: This implementation can only handle async lock requests as nfs + * do it. It cannot handle cancellation of a pending lock request sitting + * in wait_event(), but for now only nfs is the only user local kernel + * user. + */ +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_plock_info info; + struct plock_op *op; + struct dlm_ls *ls; + int rv; + + /* this only works for async request for now and nfs is the only + * kernel user right now. + */ + if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) + return -EOPNOTSUPP; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + info.pid = fl->fl_pid; + info.ex = (fl->fl_type == F_WRLCK); + info.fsid = ls->ls_global_id; + dlm_put_lockspace(ls); + info.number = number; + info.start = fl->fl_start; + info.end = fl->fl_end; + info.owner = (__u64)fl->fl_pid; + + rv = do_lock_cancel(&info); + switch (rv) { + case 0: + spin_lock(&ops_lock); + /* lock request to cancel must be on recv_list because + * do_lock_cancel() synchronizes it. + */ + op = plock_lookup_waiter(&info); + if (WARN_ON_ONCE(!op)) { + spin_unlock(&ops_lock); + rv = -ENOLCK; + break; + } + + list_del(&op->list); + spin_unlock(&ops_lock); + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + op->data->callback(op->data->fl, -EINTR); + dlm_release_plock_op(op); + rv = -EINTR; + break; + case -ENOENT: + /* if cancel wasn't successful we probably were to late + * or it was a non-blocking lock request, so just unlock it. 
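+ * (if the lock was already granted in the meantime, the unlock below
+ * simply drops it.)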
+ */ + rv = dlm_posix_unlock(lockspace, number, file, fl); + break; + default: + break; + } + + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_cancel); + +int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + op->info.optype = DLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + WARN_ON(!list_empty(&op->list)); + + /* info.rv from userspace is 1 for conflict, 0 for no-conflict, + -ENOENT if there are no locks on the file */ + + rv = op->info.rv; + + fl->fl_type = F_UNLCK; + if (rv == -ENOENT) + rv = 0; + else if (rv > 0) { + locks_init_lock(fl); + fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; + fl->fl_pid = op->info.pid; + if (op->info.nodeid != dlm_our_nodeid()) + fl->fl_pid = -fl->fl_pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + rv = 0; + } + + dlm_release_plock_op(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_get); + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_first_entry(&send_list, struct plock_op, list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move_tail(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + trace_dlm_plock_read(&info); + + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + dlm_release_plock_op(op); + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct plock_op *op = NULL, *iter; + struct dlm_plock_info info; + int do_callback = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + trace_dlm_plock_write(&info); + + if (check_version(&info)) + return -EINVAL; + + /* + * The results for waiting ops (SETLKW) can be returned in any + * order, so match all fields to find the op. The results for + * non-waiting ops are returned in the order that they were sent + * to userspace, so match the result with the first non-waiting op. + */ + spin_lock(&ops_lock); + if (info.wait) { + op = plock_lookup_waiter(&info); + } else { + list_for_each_entry(iter, &recv_list, list) { + if (!iter->info.wait && + iter->info.fsid == info.fsid) { + op = iter; + break; + } + } + } + + if (op) { + /* Sanity check that op and info match. 
*/ + if (info.wait) + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + else + WARN_ON(op->info.number != info.number || + op->info.owner != info.owner || + op->info.optype != info.optype); + + list_del_init(&op->list); + memcpy(&op->info, &info, sizeof(info)); + if (op->data) + do_callback = 1; + else + op->done = 1; + } + spin_unlock(&ops_lock); + + if (op) { + if (do_callback) + dlm_plock_callback(op); + else + wake_up(&recv_wq); + } else + pr_debug("%s: no op %x %llx", __func__, + info.fsid, (unsigned long long)info.number); + return count; +} + +static __poll_t dev_poll(struct file *file, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&ops_lock); + + return mask; +} + +static const struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int dlm_plock_init(void) +{ + int rv; + + rv = misc_register(&plock_dev_misc); + if (rv) + log_print("dlm_plock_init: misc_register failed %d", rv); + return rv; +} + +void dlm_plock_exit(void) +{ + misc_deregister(&plock_dev_misc); + WARN_ON(!list_empty(&send_list)); + WARN_ON(!list_empty(&recv_list)); +} + diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c new file mode 100644 index 0000000000..3b734aed26 --- /dev/null +++ b/fs/dlm/rcom.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rcom.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "util.h" + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RCOM_READY, &ls->ls_flags); +} + +static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, char *mb, int mb_len, + uint64_t seq) +{ + struct dlm_rcom *rc; + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id); + rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + rc->rc_header.h_length = cpu_to_le16(mb_len); + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = cpu_to_le32(type); + rc->rc_seq = cpu_to_le64(seq); + + *rc_ret = rc; +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret, + uint64_t seq) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_mhandle *mh; + char *mb; + + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("%s to %d type %d len %d ENOBUFS", + __func__, to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); + *mh_ret = mh; + return 0; +} + +static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, + int len, struct dlm_rcom **rc_ret, + struct dlm_msg **msg_ret, uint64_t seq) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_msg *msg; + char *mb; + + msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb, + NULL, NULL); + if (!msg) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); + *msg_ret = msg; + return 0; +} + +static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) +{ + dlm_midcomms_commit_mhandle(mh, NULL, 0); +} + +static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) +{ + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); +} + +static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, + uint32_t flags) +{ + rs->rs_flags = cpu_to_le32(flags); +} + +/* When replying to a status request, a node also sends back its + configuration values. The requesting node then checks that the remote + node is configured the same way as itself. 
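+   A mismatch in lvblen or the lockspace flags makes check_rcom_config()
+   return -EPROTO, which fails the status exchange and aborts recovery on
+   the requesting node.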
*/ + +static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, + uint32_t num_slots) +{ + rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); + rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); + + rf->rf_our_slot = cpu_to_le16(ls->ls_slot); + rf->rf_num_slots = cpu_to_le16(num_slots); + rf->rf_generation = cpu_to_le32(ls->ls_generation); +} + +static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + + if ((le32_to_cpu(rc->rc_header.h_version) & 0xFFFF0000) != DLM_HEADER_MAJOR) { + log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + le32_to_cpu(rc->rc_header.h_version)); + return -EPROTO; + } + + if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || + le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { + log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", + ls->ls_lvblen, ls->ls_exflags, nodeid, + le32_to_cpu(rf->rf_lvblen), + le32_to_cpu(rf->rf_lsflags)); + return -EPROTO; + } + return 0; +} + +static void allow_sync_reply(struct dlm_ls *ls, __le64 *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = cpu_to_le64(++ls->ls_rcom_seq); + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +/* + * low nodeid gathers one slot value at a time from each node. + * it sets need_slots=0, and saves rf_our_slot returned from each + * rcom_config. + * + * other nodes gather all slot values at once from the low nodeid. + * they set need_slots=1, and ignore the rf_our_slot returned from each + * rcom_config. they use the rf_num_slots returned from the low + * node's rcom_config. 
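+ *
+ * e.g. the low nodeid sends DLM_RCOM_STATUS (need_slots=0) to every other
+ * member and saves the rf_our_slot from each reply; every other member
+ * sends DLM_RCOM_STATUS with DLM_RSF_NEED_SLOTS to the low nodeid and
+ * copies the full slot table from its STATUS_REPLY via dlm_slots_copy_in().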
+ */ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq) +{ + struct dlm_rcom *rc; + struct dlm_msg *msg; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + rc = ls->ls_recover_buf; + rc->rc_result = cpu_to_le32(dlm_recover_status(ls)); + goto out; + } + +retry: + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &msg, + seq); + if (error) + goto out; + + set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); + + send_rcom_stateless(msg, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; + if (error) + goto out; + + rc = ls->ls_recover_buf; + + if (rc->rc_result == cpu_to_le32(-ESRCH)) { + /* we pretend the remote lockspace exists with 0 status */ + log_debug(ls, "remote node %d not ready", nodeid); + rc->rc_result = 0; + error = 0; + } else { + error = check_rcom_config(ls, rc, nodeid); + } + + /* the caller looks at rc_result for the remote recovery status */ + out: + return error; +} + +static void receive_rcom_status(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, + uint64_t seq) +{ + struct dlm_rcom *rc; + struct rcom_status *rs; + uint32_t status; + int nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + int len = sizeof(struct rcom_config); + struct dlm_msg *msg; + int num_slots = 0; + int error; + + if (!dlm_slots_version(&rc_in->rc_header)) { + status = dlm_recover_status(ls); + goto do_create; + } + + rs = (struct rcom_status *)rc_in->rc_buf; + + if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) { + status = dlm_recover_status(ls); + goto do_create; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + num_slots = ls->ls_num_slots; + spin_unlock(&ls->ls_recover_lock); + len += num_slots * sizeof(struct rcom_slot); + + do_create: + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, + len, &rc, &msg, seq); + if (error) + return; + + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = cpu_to_le32(status); + + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); + + if (!num_slots) + goto do_send; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_num_slots != num_slots) { + spin_unlock(&ls->ls_recover_lock); + log_debug(ls, "receive_rcom_status num_slots %d to %d", + num_slots, ls->ls_num_slots); + rc->rc_result = 0; + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0); + goto do_send; + } + + dlm_slots_copy_out(ls, rc); + spin_unlock(&ls->ls_recover_lock); + + do_send: + send_rcom_stateless(msg, rc); +} + +static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in) +{ + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + le64_to_cpu(rc_in->rc_id) != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + le32_to_cpu(rc_in->rc_type), + le32_to_cpu(rc_in->rc_header.h_nodeid), + (unsigned long long)le64_to_cpu(rc_in->rc_id), + (unsigned long long)ls->ls_rcom_seq); + goto out; + } + memcpy(ls->ls_recover_buf, rc_in, + le16_to_cpu(rc_in->rc_header.h_length)); + set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + wake_up(&ls->ls_wait_general); + out: + spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq) 
+{ + struct dlm_mhandle *mh; + struct dlm_rcom *rc; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + +retry: + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &mh, seq); + if (error) + goto out; + memcpy(rc->rc_buf, last_name, last_len); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); + + send_rcom(mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; + out: + return error; +} + +static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) +{ + struct dlm_mhandle *mh; + struct dlm_rcom *rc; + int error, inlen, outlen, nodeid; + + nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + inlen = le16_to_cpu(rc_in->rc_header.h_length) - + sizeof(struct dlm_rcom); + outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &mh, seq); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, + nodeid); + send_rcom(mh, rc); +} + +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct dlm_ls *ls = r->res_ls; + int error; + + error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, + &rc, &mh, seq); + if (error) + goto out; + memcpy(rc->rc_buf, r->res_name, r->res_length); + rc->rc_id = cpu_to_le64(r->res_id); + + send_rcom(mh, rc); + out: + return error; +} + +static void receive_rcom_lookup(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, uint64_t seq) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, ret_nodeid, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + int len = le16_to_cpu(rc_in->rc_header.h_length) - + sizeof(struct dlm_rcom); + + /* Old code would send this special id to trigger a debug dump. */ + if (rc_in->rc_id == cpu_to_le64(0xFFFFFFFF)) { + log_error(ls, "receive_rcom_lookup dump from %d", nodeid); + dlm_dump_rsb_name(ls, rc_in->rc_buf, len); + return; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh, + seq); + if (error) + return; + + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, + DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); + if (error) + ret_nodeid = error; + rc->rc_result = cpu_to_le32(ret_nodeid); + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(mh, rc); +} + +static void receive_rcom_lookup_reply(struct dlm_ls *ls, + const struct dlm_rcom *rc_in) +{ + dlm_recover_master_reply(ls, rc_in); +} + +static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct rcom_lock *rl) +{ + memset(rl, 0, sizeof(*rl)); + + rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid); + rl->rl_lkid = cpu_to_le32(lkb->lkb_id); + rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags); + rl->rl_flags = cpu_to_le32(dlm_dflags_val(lkb)); + rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + rl->rl_rqmode = lkb->lkb_rqmode; + rl->rl_grmode = lkb->lkb_grmode; + rl->rl_status = lkb->lkb_status; + rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); + + if (lkb->lkb_bastfn) + rl->rl_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + rl->rl_asts |= DLM_CB_CAST; + + rl->rl_namelen = cpu_to_le16(r->res_length); + memcpy(rl->rl_name, r->res_name, r->res_length); + + /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ? + If so, receive_rcom_lock_args() won't take this copy. 
*/ + + if (lkb->lkb_lvbptr) + memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); +} + +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct rcom_lock *rl; + int error, len = sizeof(struct rcom_lock); + + if (lkb->lkb_lvbptr) + len += ls->ls_lvblen; + + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh, + seq); + if (error) + goto out; + + rl = (struct rcom_lock *) rc->rc_buf; + pack_rcom_lock(r, lkb, rl); + rc->rc_id = cpu_to_le64((uintptr_t)r); + + send_rcom(mh, rc); + out: + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) +{ + __le32 rl_remid, rl_result; + struct rcom_lock *rl; + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + + dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, + sizeof(struct rcom_lock), &rc, &mh, seq); + if (error) + return; + + memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rl = (struct rcom_lock *)rc->rc_buf; + /* set rl_remid and rl_result from dlm_recover_master_copy() */ + rl->rl_remid = rl_remid; + rl->rl_result = rl_result; + + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(mh, rc); +} + +/* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct rcom_config *rf; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace; + rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + rc->rc_header.h_length = cpu_to_le16(mb_len); + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = cpu_to_le32(DLM_RCOM_STATUS_REPLY); + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = cpu_to_le32(-ESRCH); + + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = cpu_to_le32(~0U); + + dlm_midcomms_commit_mhandle(mh, NULL, 0); + + return 0; +} + +/* + * Ignore messages for stage Y before we set + * recover_status bit for stage X: + * + * recover_status = 0 + * + * dlm_recover_members() + * - send nothing + * - recv nothing + * - ignore NAMES, NAMES_REPLY + * - ignore LOOKUP, LOOKUP_REPLY + * - ignore LOCK, LOCK_REPLY + * + * recover_status |= NODES + * + * dlm_recover_members_wait() + * + * dlm_recover_directory() + * - send NAMES + * - recv NAMES_REPLY + * - ignore LOOKUP, LOOKUP_REPLY + * - ignore LOCK, LOCK_REPLY + * + * recover_status |= DIR + * + * dlm_recover_directory_wait() + * + * dlm_recover_masters() + * - send LOOKUP + * - recv LOOKUP_REPLY + * + * dlm_recover_locks() + * - send LOCKS + * - recv LOCKS_REPLY + * + * recover_status |= LOCKS + * + * dlm_recover_locks_wait() + * + * recover_status |= DONE + */ + +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. 
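+   The ls_recover_status checks below implement the staging above: names,
+   lookup and lock messages are ignored until the recover_status bit for
+   the preceding stage (NODES, then DIR) has been set.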
*/ + +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid) +{ + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + int stop, reply = 0, names = 0, lookup = 0, lock = 0; + uint32_t status; + uint64_t seq; + + switch (rc->rc_type) { + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): + reply = 1; + break; + case cpu_to_le32(DLM_RCOM_NAMES): + names = 1; + break; + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): + names = 1; + reply = 1; + break; + case cpu_to_le32(DLM_RCOM_LOOKUP): + lookup = 1; + break; + case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY): + lookup = 1; + reply = 1; + break; + case cpu_to_le32(DLM_RCOM_LOCK): + lock = 1; + break; + case cpu_to_le32(DLM_RCOM_LOCK_REPLY): + lock = 1; + reply = 1; + break; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + stop = dlm_recovery_stopped(ls); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + + if (stop && (rc->rc_type != cpu_to_le32(DLM_RCOM_STATUS))) + goto ignore; + + if (reply && (le64_to_cpu(rc->rc_seq_reply) != seq)) + goto ignore; + + if (!(status & DLM_RS_NODES) && (names || lookup || lock)) + goto ignore; + + if (!(status & DLM_RS_DIR) && (lookup || lock)) + goto ignore; + + switch (rc->rc_type) { + case cpu_to_le32(DLM_RCOM_STATUS): + receive_rcom_status(ls, rc, seq); + break; + + case cpu_to_le32(DLM_RCOM_NAMES): + receive_rcom_names(ls, rc, seq); + break; + + case cpu_to_le32(DLM_RCOM_LOOKUP): + receive_rcom_lookup(ls, rc, seq); + break; + + case cpu_to_le32(DLM_RCOM_LOCK): + if (le16_to_cpu(rc->rc_header.h_length) < lock_size) + goto Eshort; + receive_rcom_lock(ls, rc, seq); + break; + + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): + receive_sync_reply(ls, rc); + break; + + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): + receive_sync_reply(ls, rc); + break; + + case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY): + receive_rcom_lookup_reply(ls, rc); + break; + + case cpu_to_le32(DLM_RCOM_LOCK_REPLY): + if (le16_to_cpu(rc->rc_header.h_length) < lock_size) + goto Eshort; + dlm_recover_process_copy(ls, rc, seq); + break; + + default: + log_error(ls, "receive_rcom bad type %d", + le32_to_cpu(rc->rc_type)); + } + return; + +ignore: + log_limit(ls, "dlm_receive_rcom ignore msg %d " + "from %d %llu %llu recover seq %llu sts %x gen %u", + le32_to_cpu(rc->rc_type), + nodeid, + (unsigned long long)le64_to_cpu(rc->rc_seq), + (unsigned long long)le64_to_cpu(rc->rc_seq_reply), + (unsigned long long)seq, + status, ls->ls_generation); + return; +Eshort: + log_error(ls, "recovery message %d from %d is too short", + le32_to_cpu(rc->rc_type), nodeid); +} + diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h new file mode 100644 index 0000000000..765926ae00 --- /dev/null +++ b/fs/dlm/rcom.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RCOM_DOT_H__ +#define __RCOM_DOT_H__ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq); +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, + int nodeid); +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in); + +#endif + diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c new file mode 100644 index 0000000000..53917c0aa3 --- /dev/null +++ b/fs/dlm/recover.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "dir.h" +#include "config.h" +#include "ast.h" +#include "memory.h" +#include "rcom.h" +#include "lock.h" +#include "lowcomms.h" +#include "member.h" +#include "recover.h" + + +/* + * Recovery waiting routines: these functions wait for a particular reply from + * a remote node, or for the remote node to report a certain status. They need + * to abort if the lockspace is stopped indicating a node has failed (perhaps + * the one being waited for). + */ + +/* + * Wait until given function returns non-zero or lockspace is stopped + * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another + * function thinks it could have completed the waited-on task, they should wake + * up ls_wait_general to get an immediate response rather than waiting for the + * timeout. This uses a timeout so it can check periodically if the wait + * should abort due to node failure (which doesn't cause a wake_up). + * This should only be called by the dlm_recoverd thread. + */ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + int error = 0; + int rv; + + while (1) { + rv = wait_event_timeout(ls->ls_wait_general, + testfn(ls) || dlm_recovery_stopped(ls), + dlm_config.ci_recover_timer * HZ); + if (rv) + break; + if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) { + log_debug(ls, "dlm_wait_function timed out"); + return -ETIMEDOUT; + } + } + + if (dlm_recovery_stopped(ls)) { + log_debug(ls, "dlm_wait_function aborted"); + error = -EINTR; + } + return error; +} + +/* + * An efficient way for all nodes to wait for all others to have a certain + * status. The node with the lowest nodeid polls all the others for their + * status (wait_status_all) and all the others poll the node with the low id + * for its accumulated result (wait_status_low). When all nodes have set + * status flag X, then status flag X_ALL will be set on the low nodeid. 
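+ *
+ * For example, for the directory barrier (wait_status with DLM_RS_DIR):
+ * each node sets DLM_RS_DIR when its own directory rebuild is done; the
+ * low nodeid polls the other members until every one reports DLM_RS_DIR
+ * and then sets DLM_RS_DIR_ALL; the other members poll the low nodeid
+ * until they see DLM_RS_DIR_ALL in its reply.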
+ */ + +uint32_t dlm_recover_status(struct dlm_ls *ls) +{ + uint32_t status; + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + spin_unlock(&ls->ls_recover_lock); + return status; +} + +static void _set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + ls->ls_recover_status |= status; +} + +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, status); + spin_unlock(&ls->ls_recover_lock); +} + +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, + int save_slots, uint64_t seq) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + struct dlm_member *memb; + int error = 0, delay; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + delay = 0; + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); + if (error) + goto out; + + if (save_slots) + dlm_slot_save(ls, rc, memb); + + if (le32_to_cpu(rc->rc_result) & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + } + out: + return error; +} + +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, + uint32_t status_flags, uint64_t seq) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; + + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, nodeid, status_flags, seq); + if (error) + break; + + if (le32_to_cpu(rc->rc_result) & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + out: + return error; +} + +static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq) +{ + uint32_t status_all = status << 1; + int error; + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, status, 0, seq); + if (!error) + dlm_set_recover_status(ls, status_all); + } else + error = wait_status_low(ls, status_all, 0, seq); + + return error; +} + +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int num_slots, slots_size; + int error, rv; + uint32_t gen; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memb->slot = -1; + memb->generation = 0; + } + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, DLM_RS_NODES, 1, seq); + if (error) + goto out; + + /* slots array is sparse, slots_size may be > num_slots */ + + rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen); + if (!rv) { + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, DLM_RS_NODES_ALL); + ls->ls_num_slots = num_slots; + ls->ls_slots_size = slots_size; + ls->ls_slots = slots; + ls->ls_generation = gen; + spin_unlock(&ls->ls_recover_lock); + } else { + dlm_set_recover_status(ls, DLM_RS_NODES_ALL); + } + } else { + error = wait_status_low(ls, DLM_RS_NODES_ALL, + DLM_RSF_NEED_SLOTS, seq); + if (error) + goto out; + + dlm_slots_copy_in(ls); + } + out: + return error; +} + +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq) +{ + return wait_status(ls, DLM_RS_DIR, seq); +} + +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq) +{ + return wait_status(ls, DLM_RS_LOCKS, seq); +} + +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq) +{ + return wait_status(ls, DLM_RS_DONE, seq); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. 
As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + * + * We use the address of the rsb struct as a simple local identifier for the + * rsb so we can match an rcom reply with the rsb it was sent for. + */ + +static int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +static void recover_list_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (list_empty(&r->res_recover_list)) { + list_add_tail(&r->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static void recover_list_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + list_del_init(&r->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + dlm_put_rsb(r); +} + +static void recover_list_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *s; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { + list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; + dlm_put_rsb(r); + ls->ls_recover_list_count--; + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static int recover_idr_empty(struct dlm_ls *ls) +{ + int empty = 1; + + spin_lock(&ls->ls_recover_idr_lock); + if (ls->ls_recover_list_count) + empty = 0; + spin_unlock(&ls->ls_recover_idr_lock); + + return empty; +} + +static int recover_idr_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + int rv; + + idr_preload(GFP_NOFS); + spin_lock(&ls->ls_recover_idr_lock); + if (r->res_id) { + rv = -1; + goto out_unlock; + } + rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT); + if (rv < 0) + goto out_unlock; + + r->res_id = rv; + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + rv = 0; +out_unlock: + spin_unlock(&ls->ls_recover_idr_lock); + idr_preload_end(); + return rv; +} + +static void recover_idr_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_idr_lock); + idr_remove(&ls->ls_recover_idr, r->res_id); + r->res_id = 0; + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_idr_lock); + + dlm_put_rsb(r); +} + +static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id) +{ + struct dlm_rsb *r; + + spin_lock(&ls->ls_recover_idr_lock); + r = idr_find(&ls->ls_recover_idr, (int)id); + spin_unlock(&ls->ls_recover_idr_lock); + return r; +} + +static void recover_idr_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int id; + + spin_lock(&ls->ls_recover_idr_lock); + + idr_for_each_entry(&ls->ls_recover_idr, r, id) { + idr_remove(&ls->ls_recover_idr, id); + r->res_id = 0; + r->res_recover_locks_count = 0; + ls->ls_recover_list_count--; + + dlm_put_rsb(r); + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_idr_lock); +} + + +/* Master recovery: find 
new master node for rsb's that were + mastered on nodes that have been removed. + + dlm_recover_masters + recover_master + dlm_send_rcom_lookup -> receive_rcom_lookup + dlm_dir_lookup + receive_rcom_lookup_reply <- + dlm_recover_master_reply + set_new_master + set_master_lkbs + set_lock_master +*/ + +/* + * Set the lock master for all LKBs in a lock queue + * If we are the new master of the rsb, we may have received new + * MSTCPY locks from other nodes already which we need to ignore + * when setting the new nodeid. + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (!test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) { + lkb->lkb_nodeid = nodeid; + lkb->lkb_remid = 0; + } + } +} + +static void set_master_lkbs(struct dlm_rsb *r) +{ + set_lock_master(&r->res_grantqueue, r->res_nodeid); + set_lock_master(&r->res_convertqueue, r->res_nodeid); + set_lock_master(&r->res_waitqueue, r->res_nodeid); +} + +/* + * Propagate the new master nodeid to locks + * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. + * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which + * rsb's to consider. + */ + +static void set_new_master(struct dlm_rsb *r) +{ + set_master_lkbs(r); + rsb_set_flag(r, RSB_NEW_MASTER); + rsb_set_flag(r, RSB_NEW_MASTER2); +} + +/* + * We do async lookups on rsb's that need new masters. The rsb's + * waiting for a lookup reply are kept on the recover_list. + * + * Another node recovering the master may have sent us a rcom lookup, + * and our dlm_master_lookup() set it as the new master, along with + * NEW_MASTER so that we'll recover it here (this implies dir_nodeid + * equals our_nodeid below). + */ + +static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq) +{ + struct dlm_ls *ls = r->res_ls; + int our_nodeid, dir_nodeid; + int is_removed = 0; + int error; + + if (is_master(r)) + return 0; + + is_removed = dlm_is_removed(ls, r->res_nodeid); + + if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER)) + return 0; + + our_nodeid = dlm_our_nodeid(); + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid == our_nodeid) { + if (is_removed) { + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } + + /* set master of lkbs to ourself when is_removed, or to + another new master which we set along with NEW_MASTER + in dlm_master_lookup */ + set_new_master(r); + error = 0; + } else { + recover_idr_add(r); + error = dlm_send_rcom_lookup(r, dir_nodeid, seq); + } + + (*count)++; + return error; +} + +/* + * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. + * This is necessary because recovery can be started, aborted and restarted, + * causing the master nodeid to briefly change during the aborted recovery, and + * change back to the original value in the second recovery. The MSTCPY locks + * may or may not have been purged during the aborted recovery. Another node + * with an outstanding request in waiters list and a request reply saved in the + * requestqueue, cannot know whether it should ignore the reply and resend the + * request, or accept the reply and complete the request. It must do the + * former if the remote node purged MSTCPY locks, and it must do the later if + * the remote node did not. This is solved by always purging MSTCPY locks, in + * which case, the request reply would always be ignored and the request + * resent. 
+ */ + +static int recover_master_static(struct dlm_rsb *r, unsigned int *count) +{ + int dir_nodeid = dlm_dir_nodeid(r); + int new_master = dir_nodeid; + + if (dir_nodeid == dlm_our_nodeid()) + new_master = 0; + + dlm_purge_mstcpy_locks(r); + r->res_master_nodeid = dir_nodeid; + r->res_nodeid = new_master; + set_new_master(r); + (*count)++; + return 0; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the directory. The dir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the dir lookup requests individually and asynchronously to + * the correct dir node. + */ + +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq) +{ + struct dlm_rsb *r; + unsigned int total = 0; + unsigned int count = 0; + int nodir = dlm_no_directory(ls); + int error; + + log_rinfo(ls, "dlm_recover_masters"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (dlm_recovery_stopped(ls)) { + up_read(&ls->ls_root_sem); + error = -EINTR; + goto out; + } + + lock_rsb(r); + if (nodir) + error = recover_master_static(r, &count); + else + error = recover_master(r, &count, seq); + unlock_rsb(r); + cond_resched(); + total++; + + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + } + up_read(&ls->ls_root_sem); + + log_rinfo(ls, "dlm_recover_masters %u of %u", count, total); + + error = dlm_wait_function(ls, &recover_idr_empty); + out: + if (error) + recover_idr_clear(ls); + return error; +} + +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc) +{ + struct dlm_rsb *r; + int ret_nodeid, new_master; + + r = recover_idr_find(ls, le64_to_cpu(rc->rc_id)); + if (!r) { + log_error(ls, "dlm_recover_master_reply no id %llx", + (unsigned long long)le64_to_cpu(rc->rc_id)); + goto out; + } + + ret_nodeid = le32_to_cpu(rc->rc_result); + + if (ret_nodeid == dlm_our_nodeid()) + new_master = 0; + else + new_master = ret_nodeid; + + lock_rsb(r); + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = new_master; + set_new_master(r); + unlock_rsb(r); + recover_idr_del(r); + + if (recover_idr_empty(ls)) + wake_up(&ls->ls_wait_general); + out: + return 0; +} + + +/* Lock recovery: rebuild the process-copy locks we hold on a + remastered rsb on the new rsb master. 
+ + dlm_recover_locks + recover_locks + recover_locks_queue + dlm_send_rcom_lock -> receive_rcom_lock + dlm_recover_master_copy + receive_rcom_lock_reply <- + dlm_recover_process_copy +*/ + + +/* + * keep a count of the number of lkb's we send to the new master; when we get + * an equal number of replies then recovery for the rsb is done + */ + +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head, + uint64_t seq) +{ + struct dlm_lkb *lkb; + int error = 0; + + list_for_each_entry(lkb, head, lkb_statequeue) { + error = dlm_send_rcom_lock(r, lkb, seq); + if (error) + break; + r->res_recover_locks_count++; + } + + return error; +} + +static int recover_locks(struct dlm_rsb *r, uint64_t seq) +{ + int error = 0; + + lock_rsb(r); + + DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); + + error = recover_locks_queue(r, &r->res_grantqueue, seq); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_convertqueue, seq); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_waitqueue, seq); + if (error) + goto out; + + if (r->res_recover_locks_count) + recover_list_add(r); + else + rsb_clear_flag(r, RSB_NEW_MASTER); + out: + unlock_rsb(r); + return error; +} + +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq) +{ + struct dlm_rsb *r; + int error, count = 0; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (is_master(r)) { + rsb_clear_flag(r, RSB_NEW_MASTER); + continue; + } + + if (!rsb_flag(r, RSB_NEW_MASTER)) + continue; + + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + up_read(&ls->ls_root_sem); + goto out; + } + + error = recover_locks(r, seq); + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + + count += r->res_recover_locks_count; + } + up_read(&ls->ls_root_sem); + + log_rinfo(ls, "dlm_recover_locks %d out", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + return error; +} + +void dlm_recovered_lock(struct dlm_rsb *r) +{ + DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r);); + + r->res_recover_locks_count--; + if (!r->res_recover_locks_count) { + rsb_clear_flag(r, RSB_NEW_MASTER); + recover_list_del(r); + } + + if (recover_list_empty(r->res_ls)) + wake_up(&r->res_ls->ls_wait_general); +} + +/* + * The lvb needs to be recovered on all master rsb's. This includes setting + * the VALNOTVALID flag if necessary, and determining the correct lvb contents + * based on the lvb's of the locks held on the rsb. + * + * RSB_VALNOTVALID is set in two cases: + * + * 1. we are master, but not new, and we purged an EX/PW lock held by a + * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL) + * + * 2. we are a new master, and there are only NL/CR locks left. + * (We could probably improve this by only invaliding in this way when + * the previous master left uncleanly. VMS docs mention that.) + * + * The LVB contents are only considered for changing when this is a new master + * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with + * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken + * from the lkb with the largest lvb sequence number. 
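+ *
+ * e.g. if the grant queue holds a PW lock that used DLM_LKF_VALBLK, the
+ * rsb takes that lkb's lvb and lvbseq; if only NL/CR locks with lvbs
+ * remain, RSB_VALNOTVALID is set and the lvb is taken from the lkb with
+ * the highest lvbseq.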
+ */ + +static void recover_lvb(struct dlm_rsb *r) +{ + struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL; + uint32_t high_seq = 0; + int lock_lvb_exists = 0; + int lvblen = r->res_ls->ls_lvblen; + + if (!rsb_flag(r, RSB_NEW_MASTER2) && + rsb_flag(r, RSB_RECOVER_LVB_INVAL)) { + /* case 1 above */ + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!rsb_flag(r, RSB_NEW_MASTER2)) + return; + + /* we are the new master, so figure out if VALNOTVALID should + be set, and set the rsb lvb from the best lkb available. */ + + list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) { + if (!(iter->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (iter->lkb_grmode > DLM_LOCK_CR) { + big_lkb = iter; + goto setflag; + } + + if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = iter; + high_seq = iter->lkb_lvbseq; + } + } + + list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) { + if (!(iter->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (iter->lkb_grmode > DLM_LOCK_CR) { + big_lkb = iter; + goto setflag; + } + + if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = iter; + high_seq = iter->lkb_lvbseq; + } + } + + setflag: + if (!lock_lvb_exists) + goto out; + + /* lvb is invalidated if only NL/CR locks remain */ + if (!big_lkb) + rsb_set_flag(r, RSB_VALNOTVALID); + + if (!r->res_lvbptr) { + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + if (!r->res_lvbptr) + goto out; + } + + if (big_lkb) { + r->res_lvbseq = big_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen); + } else if (high_lkb) { + r->res_lvbseq = high_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen); + } else { + r->res_lvbseq = 0; + memset(r->res_lvbptr, 0, lvblen); + } + out: + return; +} + +/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks + converting PR->CW or CW->PR need to have their lkb_grmode set. */ + +static void recover_conversion(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_lkb *lkb; + int grmode = -1; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_grmode == DLM_LOCK_PR || + lkb->lkb_grmode == DLM_LOCK_CW) { + grmode = lkb->lkb_grmode; + break; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (lkb->lkb_grmode != DLM_LOCK_IV) + continue; + if (grmode == -1) { + log_debug(ls, "recover_conversion %x set gr to rq %d", + lkb->lkb_id, lkb->lkb_rqmode); + lkb->lkb_grmode = lkb->lkb_rqmode; + } else { + log_debug(ls, "recover_conversion %x set gr %d", + lkb->lkb_id, grmode); + lkb->lkb_grmode = grmode; + } + } +} + +/* We've become the new master for this rsb and waiting/converting locks may + need to be granted in dlm_recover_grant() due to locks that may have + existed from a removed node. 
*/ + +static void recover_grant(struct dlm_rsb *r) +{ + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); +} + +void dlm_recover_rsbs(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + unsigned int count = 0; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + lock_rsb(r); + if (is_master(r)) { + if (rsb_flag(r, RSB_RECOVER_CONVERT)) + recover_conversion(r); + + /* recover lvb before granting locks so the updated + lvb/VALNOTVALID is presented in the completion */ + recover_lvb(r); + + if (rsb_flag(r, RSB_NEW_MASTER2)) + recover_grant(r); + count++; + } else { + rsb_clear_flag(r, RSB_VALNOTVALID); + } + rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL); + rsb_clear_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); + } + up_read(&ls->ls_root_sem); + + if (count) + log_rinfo(ls, "dlm_recover_rsbs %d done", count); +} + +/* Create a single list of all root rsb's to be used during recovery */ + +int dlm_create_root_list(struct dlm_ls *ls) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i, error = 0; + + down_write(&ls->ls_root_sem); + if (!list_empty(&ls->ls_root_list)) { + log_error(ls, "root list not empty"); + error = -EINVAL; + goto out; + } + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss)) + log_error(ls, "dlm_create_root_list toss not empty"); + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + out: + up_write(&ls->ls_root_sem); + return error; +} + +void dlm_release_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + + down_write(&ls->ls_root_sem); + list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) { + list_del_init(&r->res_root_list); + dlm_put_rsb(r); + } + up_write(&ls->ls_root_sem); +} + +void dlm_clear_toss(struct dlm_ls *ls) +{ + struct rb_node *n, *next; + struct dlm_rsb *r; + unsigned int count = 0; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { + next = rb_next(n); + r = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(r); + count++; + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + + if (count) + log_rinfo(ls, "dlm_clear_toss %u done", count); +} + diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h new file mode 100644 index 0000000000..dbc51013ec --- /dev/null +++ b/fs/dlm/recover.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); +uint32_t dlm_recover_status(struct dlm_ls *ls); +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq); +void dlm_recovered_lock(struct dlm_rsb *r); +int dlm_create_root_list(struct dlm_ls *ls); +void dlm_release_root_list(struct dlm_ls *ls); +void dlm_clear_toss(struct dlm_ls *ls); +void dlm_recover_rsbs(struct dlm_ls *ls); + +#endif /* __RECOVER_DOT_H__ */ + diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c new file mode 100644 index 0000000000..4d17491dea --- /dev/null +++ b/fs/dlm/recoverd.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lowcomms.h" +#include "lock.h" +#include "requestqueue.h" +#include "recoverd.h" + + +/* If the start for which we're re-enabling locking (seq) has been superseded + by a newer stop (ls_recover_seq), we need to leave locking disabled. + + We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees + locking stopped and b) adds a message to the requestqueue, but dlm_recoverd + enables locking and clears the requestqueue between a and b. */ + +static int enable_locking(struct dlm_ls *ls, uint64_t seq) +{ + int error = -EINTR; + + down_write(&ls->ls_recv_active); + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_recover_seq == seq) { + set_bit(LSFL_RUNNING, &ls->ls_flags); + /* unblocks processes waiting to enter the dlm */ + up_write(&ls->ls_in_recovery); + clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + error = 0; + } + spin_unlock(&ls->ls_recover_lock); + + up_write(&ls->ls_recv_active); + return error; +} + +static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) +{ + unsigned long start; + int error, neg = 0; + + log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq); + + mutex_lock(&ls->ls_recoverd_active); + + dlm_callback_suspend(ls); + + dlm_clear_toss(ls); + + /* + * This list of root rsb's will be the basis of most of the recovery + * routines. + */ + + dlm_create_root_list(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + * + * Due to the fact that we must report all membership changes to lsops + * or midcomms layer, it is not permitted to abort ls_recover() until + * this is done. 
+ */ + + error = dlm_recover_members(ls, rv, &neg); + if (error) { + log_rinfo(ls, "dlm_recover_members error %d", error); + goto fail; + } + + dlm_recover_dir_nodeid(ls); + + ls->ls_recover_dir_sent_res = 0; + ls->ls_recover_dir_sent_msg = 0; + ls->ls_recover_locks_in = 0; + + dlm_set_recover_status(ls, DLM_RS_NODES); + + error = dlm_recover_members_wait(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_members_wait error %d", error); + goto fail; + } + + start = jiffies; + + /* + * Rebuild our own share of the directory by collecting from all other + * nodes their master rsb names that hash to us. + */ + + error = dlm_recover_directory(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_directory error %d", error); + goto fail; + } + + dlm_set_recover_status(ls, DLM_RS_DIR); + + error = dlm_recover_directory_wait(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_directory_wait error %d", error); + goto fail; + } + + log_rinfo(ls, "dlm_recover_directory %u out %u messages", + ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); + + /* + * We may have outstanding operations that are waiting for a reply from + * a failed node. Mark these to be resent after recovery. Unlock and + * cancel ops can just be completed. + */ + + dlm_recover_waiters_pre(ls); + + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto fail; + } + + if (neg || dlm_no_directory(ls)) { + /* + * Clear lkb's for departed nodes. + */ + + dlm_recover_purge(ls); + + /* + * Get new master nodeid's for rsb's that were mastered on + * departed nodes. + */ + + error = dlm_recover_masters(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_masters error %d", error); + goto fail; + } + + /* + * Send our locks on remastered rsb's to the new masters. + */ + + error = dlm_recover_locks(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_locks error %d", error); + goto fail; + } + + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); + goto fail; + } + + log_rinfo(ls, "dlm_recover_locks %u in", + ls->ls_recover_locks_in); + + /* + * Finalize state in master rsb's now that all locks can be + * checked. This includes conversion resolution and lvb + * settings. + */ + + dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be doing the recover_locks (RS_LOCKS) barrier. + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); + goto fail; + } + } + + dlm_release_root_list(ls); + + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. 
+ */ + + dlm_purge_requestqueue(ls); + + dlm_set_recover_status(ls, DLM_RS_DONE); + + error = dlm_recover_done_wait(ls, rv->seq); + if (error) { + log_rinfo(ls, "dlm_recover_done_wait error %d", error); + goto fail; + } + + dlm_clear_members_gone(ls); + + dlm_callback_resume(ls); + + error = enable_locking(ls, rv->seq); + if (error) { + log_rinfo(ls, "enable_locking error %d", error); + goto fail; + } + + error = dlm_process_requestqueue(ls); + if (error) { + log_rinfo(ls, "dlm_process_requestqueue error %d", error); + goto fail; + } + + error = dlm_recover_waiters_post(ls); + if (error) { + log_rinfo(ls, "dlm_recover_waiters_post error %d", error); + goto fail; + } + + dlm_recover_grant(ls); + + log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms", + (unsigned long long)rv->seq, ls->ls_generation, + jiffies_to_msecs(jiffies - start)); + mutex_unlock(&ls->ls_recoverd_active); + + return 0; + + fail: + dlm_release_root_list(ls); + mutex_unlock(&ls->ls_recoverd_active); + + return error; +} + +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + int error; + + spin_lock(&ls->ls_recover_lock); + rv = ls->ls_recover_args; + ls->ls_recover_args = NULL; + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + if (rv) { + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted -EINTR we wait for the next + * ls_recover() iteration until it hopefully succeeds. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* let new_lockspace() get aware of critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + + kfree(rv->nodes); + kfree(rv); + } +} + +static int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(arg); + if (!ls) { + log_print("dlm_recoverd: no lockspace %p", arg); + return -1; + } + + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + + while (1) { + /* + * We call kthread_should_stop() after set_current_state(). + * This is because it works correctly if kthread_stop() is + * called just before set_current_state(). 
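+		 * set_current_state() implies a full memory barrier, so either
+		 * the stop request is visible to the check below, or
+		 * kthread_stop()'s wake_up_process() sees us in
+		 * TASK_INTERRUPTIBLE; either way the thread cannot sleep
+		 * forever and miss the stop.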
+ */ + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + if (kthread_should_stop()) + break; + schedule(); + } + set_current_state(TASK_RUNNING); + + if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + } + + if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags)) + do_ls_recovery(ls); + } + + if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)) + up_write(&ls->ls_in_recovery); + + dlm_put_lockspace(ls); + return 0; +} + +int dlm_recoverd_start(struct dlm_ls *ls) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_recoverd, ls, "dlm_recoverd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + ls->ls_recoverd_task = p; + return error; +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + kthread_stop(ls->ls_recoverd_task); +} + +void dlm_recoverd_suspend(struct dlm_ls *ls) +{ + wake_up(&ls->ls_wait_general); + mutex_lock(&ls->ls_recoverd_active); +} + +void dlm_recoverd_resume(struct dlm_ls *ls) +{ + mutex_unlock(&ls->ls_recoverd_active); +} + diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h new file mode 100644 index 0000000000..d1944dc5f9 --- /dev/null +++ b/fs/dlm/recoverd.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +void dlm_recoverd_stop(struct dlm_ls *ls); +int dlm_recoverd_start(struct dlm_ls *ls); +void dlm_recoverd_suspend(struct dlm_ls *ls); +void dlm_recoverd_resume(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ + diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c new file mode 100644 index 0000000000..892d6ca21e --- /dev/null +++ b/fs/dlm/requestqueue.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "member.h" +#include "lock.h" +#include "dir.h" +#include "config.h" +#include "requestqueue.h" +#include "util.h" + +struct rq_entry { + struct list_head list; + uint32_t recover_seq; + int nodeid; + struct dlm_message request; +}; + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. This happens when + * the lockspace is suspended on some nodes before it is on others, or the + * lockspace is enabled on some while still suspended on others. 
+ */ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms) +{ + struct rq_entry *e; + int length = le16_to_cpu(ms->m_header.h_length) - + sizeof(struct dlm_message); + + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); + if (!e) { + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; + } + + e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; + e->nodeid = nodeid; + memcpy(&e->request, ms, sizeof(*ms)); + memcpy(&e->request.m_extra, ms->m_extra, length); + + atomic_inc(&ls->ls_requestqueue_cnt); + mutex_lock(&ls->ls_requestqueue_mutex); + list_add_tail(&e->list, &ls->ls_requestqueue); + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +/* + * Called by dlm_recoverd to process normal messages saved while recovery was + * happening. Normal locking has been enabled before this is called. dlm_recv + * upon receiving a message, will wait for all saved messages to be drained + * here before processing the message it got. If a new dlm_ls_stop() arrives + * while we're processing these saved messages, it may block trying to suspend + * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that + * case, we don't abort since locking_stopped is still 0. If dlm_recv is not + * waiting for us, then this processing may be aborted due to locking_stopped. + */ + +int dlm_process_requestqueue(struct dlm_ls *ls) +{ + struct rq_entry *e; + struct dlm_message *ms; + int error = 0; + + mutex_lock(&ls->ls_requestqueue_mutex); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + mutex_unlock(&ls->ls_requestqueue_mutex); + error = 0; + break; + } + e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); + mutex_unlock(&ls->ls_requestqueue_mutex); + + ms = &e->request; + + log_limit(ls, "dlm_process_requestqueue msg %d from %d " + "lkid %x remid %x result %d seq %u", + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + from_dlm_errno(le32_to_cpu(ms->m_result)), + e->recover_seq); + + dlm_receive_message_saved(ls, &e->request, e->recover_seq); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); + kfree(e); + + if (dlm_locking_stopped(ls)) { + log_debug(ls, "process_requestqueue abort running"); + mutex_unlock(&ls->ls_requestqueue_mutex); + error = -EINTR; + break; + } + schedule(); + } + + return error; +} + +/* + * After recovery is done, locking is resumed and dlm_recoverd takes all the + * saved requests and processes them as they would have been by dlm_recv. At + * the same time, dlm_recv will start receiving new requests from remote nodes. + * We want to delay dlm_recv processing new requests until dlm_recoverd has + * finished processing the old saved requests. We don't check for locking + * stopped here because dlm_ls_stop won't stop locking until it's suspended us + * (dlm_recv). 
+ */ + +void dlm_wait_requestqueue(struct dlm_ls *ls) +{ + wait_event(ls->ls_requestqueue_wait, + atomic_read(&ls->ls_requestqueue_cnt) == 0); +} + +static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) +{ + __le32 type = ms->m_type; + + /* the ls is being cleaned up and freed by release_lockspace */ + if (!atomic_read(&ls->ls_count)) + return 1; + + if (dlm_is_removed(ls, nodeid)) + return 1; + + /* directory operations are always purged because the directory is + always rebuilt during recovery and the lookups resent */ + + if (type == cpu_to_le32(DLM_MSG_REMOVE) || + type == cpu_to_le32(DLM_MSG_LOOKUP) || + type == cpu_to_le32(DLM_MSG_LOOKUP_REPLY)) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + return 1; +} + +void dlm_purge_requestqueue(struct dlm_ls *ls) +{ + struct dlm_message *ms; + struct rq_entry *e, *safe; + + mutex_lock(&ls->ls_requestqueue_mutex); + list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) { + ms = &e->request; + + if (purge_request(ls, ms, e->nodeid)) { + list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); + kfree(e); + } + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h new file mode 100644 index 0000000000..42bfe23cea --- /dev/null +++ b/fs/dlm/requestqueue.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __REQUESTQUEUE_DOT_H__ +#define __REQUESTQUEUE_DOT_H__ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms); +int dlm_process_requestqueue(struct dlm_ls *ls); +void dlm_wait_requestqueue(struct dlm_ls *ls); +void dlm_purge_requestqueue(struct dlm_ls *ls); + +#endif + diff --git a/fs/dlm/user.c b/fs/dlm/user.c new file mode 100644 index 0000000000..695e691b38 --- /dev/null +++ b/fs/dlm/user.c @@ -0,0 +1,1046 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/signal.h> +#include <linux/spinlock.h> +#include <linux/dlm.h> +#include <linux/dlm_device.h> +#include <linux/slab.h> +#include <linux/sched/signal.h> + +#include <trace/events/dlm.h> + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "lvb_table.h" +#include "user.h" +#include "ast.h" +#include "config.h" +#include "memory.h" + +static const char name_prefix[] = "dlm"; +static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; + +#ifdef CONFIG_COMPAT + +struct dlm_lock_params32 { + __u8 mode; + __u8 namelen; + __u16 unused; + __u32 flags; + __u32 lkid; + __u32 parent; + __u64 xid; + __u64 timeout; + __u32 castparam; + __u32 castaddr; + __u32 bastparam; + __u32 bastaddr; + __u32 lksb; + char lvb[DLM_USER_LVB_LEN]; + char name[]; +}; + +struct dlm_write_request32 { + __u32 version[3]; + __u8 cmd; + __u8 is64bit; + __u8 unused[2]; + + union { + struct dlm_lock_params32 lock; + struct dlm_lspace_params lspace; + struct dlm_purge_params purge; + } i; +}; + +struct dlm_lksb32 { + __u32 sb_status; + __u32 sb_lkid; + __u8 sb_flags; + __u32 sb_lvbptr; +}; + +struct dlm_lock_result32 { + __u32 version[3]; + __u32 length; + __u32 user_astaddr; + __u32 user_astparam; + __u32 user_lksb; + struct dlm_lksb32 lksb; + __u8 bast_mode; + __u8 unused[3]; + /* Offsets may be zero if no data is present */ + __u32 lvb_offset; +}; + +static void compat_input(struct dlm_write_request *kb, + struct dlm_write_request32 *kb32, + int namelen) +{ + kb->version[0] = kb32->version[0]; + kb->version[1] = kb32->version[1]; + kb->version[2] = kb32->version[2]; + + kb->cmd = kb32->cmd; + kb->is64bit = kb32->is64bit; + if (kb->cmd == DLM_USER_CREATE_LOCKSPACE || + kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { + kb->i.lspace.flags = kb32->i.lspace.flags; + kb->i.lspace.minor = kb32->i.lspace.minor; + memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen); + } else if (kb->cmd == DLM_USER_PURGE) { + kb->i.purge.nodeid = kb32->i.purge.nodeid; + kb->i.purge.pid = kb32->i.purge.pid; + } else { + kb->i.lock.mode = kb32->i.lock.mode; + kb->i.lock.namelen = kb32->i.lock.namelen; + kb->i.lock.flags = kb32->i.lock.flags; + kb->i.lock.lkid = kb32->i.lock.lkid; + kb->i.lock.parent = kb32->i.lock.parent; + kb->i.lock.xid = kb32->i.lock.xid; + kb->i.lock.timeout = kb32->i.lock.timeout; + kb->i.lock.castparam = (__user void *)(long)kb32->i.lock.castparam; + kb->i.lock.castaddr = (__user void *)(long)kb32->i.lock.castaddr; + kb->i.lock.bastparam = (__user void *)(long)kb32->i.lock.bastparam; + kb->i.lock.bastaddr = (__user void *)(long)kb32->i.lock.bastaddr; + kb->i.lock.lksb = (__user void *)(long)kb32->i.lock.lksb; + memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); + memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); + } +} + +static void compat_output(struct dlm_lock_result *res, + struct dlm_lock_result32 *res32) +{ + memset(res32, 0, sizeof(*res32)); + + res32->version[0] = res->version[0]; + res32->version[1] = res->version[1]; + res32->version[2] = res->version[2]; + + res32->user_astaddr = (__u32)(__force long)res->user_astaddr; + res32->user_astparam = (__u32)(__force long)res->user_astparam; + res32->user_lksb = (__u32)(__force long)res->user_lksb; + res32->bast_mode = res->bast_mode; + + res32->lvb_offset = res->lvb_offset; + res32->length = res->length; 
+ + res32->lksb.sb_status = res->lksb.sb_status; + res32->lksb.sb_flags = res->lksb.sb_flags; + res32->lksb.sb_lkid = res->lksb.sb_lkid; + res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; +} +#endif + +/* should held proc->asts_spin lock */ +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) +{ + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); + } + + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; +} + +/* Figure out if this lock is at the end of its life and no longer + available for the application to use. The lkb still exists until + the final ast is read. A lock becomes EOL in three situations: + 1. a noqueue request fails with EAGAIN + 2. an unlock completes with EUNLOCK + 3. a cancel of a waiting request completes with ECANCEL/EDEADLK + An EOL lock needs to be removed from the process's list of locks. + And we can't allow any new operation on an EOL lock. This is + not related to the lifetime of the lkb struct which is managed + entirely by refcount. */ + +static int lkb_is_endoflife(int mode, int status) +{ + switch (status) { + case -DLM_EUNLOCK: + return 1; + case -DLM_ECANCEL: + case -ETIMEDOUT: + case -EDEADLK: + case -EAGAIN: + if (mode == DLM_LOCK_IV) + return 1; + break; + } + return 0; +} + +/* we could possibly check if the cancel of an orphan has resulted in the lkb + being removed and then remove that lkb from the orphans list and free it */ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + struct dlm_user_proc *proc; + int rv; + + if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || + test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) + return; + + ls = lkb->lkb_resource->res_ls; + spin_lock(&ls->ls_clear_proc_locks); + + /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast + can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed + lkb->ua so we can't try to use it. This second check is necessary + for cases where a completion ast is received for an operation that + began before clear_proc_locks did its cancel/unlock. */ + + if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || + test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) + goto out; + + DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); + ua = lkb->lkb_ua; + proc = ua->proc; + + if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) + goto out; + + if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) + set_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags); + + spin_lock(&proc->asts_spin); + + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_FAILURE: + spin_unlock(&proc->asts_spin); + WARN_ON_ONCE(1); + goto out; + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_cb_list, &proc->asts); + wake_up_interruptible(&proc->wait); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; + } + spin_unlock(&proc->asts_spin); + + if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) { + /* N.B. 
spin_lock locks_spin, not asts_spin */ + spin_lock(&proc->locks_spin); + if (!list_empty(&lkb->lkb_ownqueue)) { + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + } + out: + spin_unlock(&ls->ls_clear_proc_locks); +} + +static int device_user_lock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + uint32_t lkid; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + if (!params->castaddr || !params->lksb) { + error = -EINVAL; + goto out; + } + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + ua->bastparam = params->bastparam; + ua->bastaddr = params->bastaddr; + ua->xid = params->xid; + + if (params->flags & DLM_LKF_CONVERT) { + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); + } else if (params->flags & DLM_LKF_ORPHAN) { + error = dlm_user_adopt_orphan(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + &lkid); + if (!error) + error = lkid; + } else { + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); + if (!error) + error = ua->lksb.sb_lkid; + } + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_unlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + + if (params->flags & DLM_LKF_CANCEL) + error = dlm_user_cancel(ls, ua, params->flags, params->lkid); + else + error = dlm_user_unlock(ls, ua, params->flags, params->lkid, + params->lvb); + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_deadlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_deadlock(ls, params->flags, params->lkid); + + dlm_put_lockspace(ls); + return error; +} + +static int dlm_device_register(struct dlm_ls *ls, char *name) +{ + int error, len; + + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + + error = -ENOMEM; + len = strlen(name) + strlen(name_prefix) + 2; + ls->ls_device.name = kzalloc(len, GFP_NOFS); + if (!ls->ls_device.name) + goto fail; + + snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, + name); + ls->ls_device.fops = &device_fops; + ls->ls_device.minor = MISC_DYNAMIC_MINOR; + + error = misc_register(&ls->ls_device); + if (error) { + kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; + } +fail: + return error; +} + +int dlm_device_deregister(struct dlm_ls *ls) +{ + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. 
*/ + if (!ls->ls_device.name) + return 0; + + misc_deregister(&ls->ls_device); + kfree(ls->ls_device.name); + return 0; +} + +static int device_user_purge(struct dlm_user_proc *proc, + struct dlm_purge_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_purge(ls, proc, params->nodeid, params->pid); + + dlm_put_lockspace(ls); + return error; +} + +static int device_create_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, + params->flags, DLM_USER_LVB_LEN, NULL, + NULL, NULL, &lockspace); + if (error) + return error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -ENOENT; + + error = dlm_device_register(ls, params->name); + dlm_put_lockspace(ls); + + if (error) + dlm_release_lockspace(lockspace, 0); + else + error = ls->ls_device.minor; + + return error; +} + +static int device_remove_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error, force = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ls = dlm_find_lockspace_device(params->minor); + if (!ls) + return -ENOENT; + + if (params->flags & DLM_USER_LSFLG_FORCEFREE) + force = 2; + + lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); + + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. 
*/ + + error = dlm_release_lockspace(lockspace, force); + if (error > 0) + error = 0; + return error; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_write_request *req) +{ + if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || + (req->version[0] == DLM_DEVICE_VERSION_MAJOR && + req->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " + "user (%d.%d.%d) kernel (%d.%d.%d)\n", + current->comm, + task_pid_nr(current), + req->version[0], + req->version[1], + req->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* + * device_write + * + * device_user_lock + * dlm_user_request -> request_lock + * dlm_user_convert -> convert_lock + * + * device_user_unlock + * dlm_user_unlock -> unlock_lock + * dlm_user_cancel -> cancel_lock + * + * device_create_lockspace + * dlm_new_lockspace + * + * device_remove_lockspace + * dlm_release_lockspace + */ + +/* a write to a lockspace device is a lock or unlock request, a write + to the control device is to create/remove a lockspace */ + +static ssize_t device_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_write_request *kbuf; + int error; + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_write_request32)) +#else + if (count < sizeof(struct dlm_write_request)) +#endif + return -EINVAL; + + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ + if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) + return -EINVAL; + + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (check_version(kbuf)) { + error = -EBADE; + goto out_free; + } + +#ifdef CONFIG_COMPAT + if (!kbuf->is64bit) { + struct dlm_write_request32 *k32buf; + int namelen = 0; + + if (count > sizeof(struct dlm_write_request32)) + namelen = count - sizeof(struct dlm_write_request32); + + k32buf = (struct dlm_write_request32 *)kbuf; + + /* add 1 after namelen so that the name string is terminated */ + kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, + GFP_NOFS); + if (!kbuf) { + kfree(k32buf); + return -ENOMEM; + } + + if (proc) + set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); + + compat_input(kbuf, k32buf, namelen); + kfree(k32buf); + } +#endif + + /* do we really need this? can a write happen after a close? 
*/ + if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { + error = -EINVAL; + goto out_free; + } + + error = -EINVAL; + + switch (kbuf->cmd) + { + case DLM_USER_LOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_lock(proc, &kbuf->i.lock); + break; + + case DLM_USER_UNLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_unlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_DEADLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_deadlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_CREATE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_free; + } + error = device_create_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_REMOVE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_free; + } + error = device_remove_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_PURGE: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_purge(proc, &kbuf->i.purge); + break; + + default: + log_print("Unknown command passed to DLM device : %d\n", + kbuf->cmd); + } + + out_free: + kfree(kbuf); + return error; +} + +/* Every process that opens the lockspace device has its own "proc" structure + hanging off the open file that's used to keep track of locks owned by the + process and asts that need to be delivered to the process. */ + +static int device_open(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_device(iminor(inode)); + if (!ls) + return -ENOENT; + + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); + if (!proc) { + dlm_put_lockspace(ls); + return -ENOMEM; + } + + proc->lockspace = ls->ls_local_handle; + INIT_LIST_HEAD(&proc->asts); + INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); + spin_lock_init(&proc->asts_spin); + spin_lock_init(&proc->locks_spin); + init_waitqueue_head(&proc->wait); + file->private_data = proc; + + return 0; +} + +static int device_close(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); + + dlm_clear_proc_locks(ls, proc); + + /* at this point no more lkb's should exist for this lockspace, + so there's no chance of dlm_user_add_ast() being called and + looking for lkb->ua->proc */ + + kfree(proc); + file->private_data = NULL; + + dlm_put_lockspace(ls); + dlm_put_lockspace(ls); /* for the find in device_open() */ + + /* FIXME: AUTOFREE: if this ls is no longer used do + device_remove_lockspace() */ + + return 0; +} + +static int copy_result_to_user(struct dlm_user_args *ua, int compat, + uint32_t flags, int mode, int copy_lvb, + char __user *buf, size_t count) +{ +#ifdef CONFIG_COMPAT + struct dlm_lock_result32 result32; +#endif + struct dlm_lock_result result; + void *resultptr; + int error=0; + int len; + int struct_len; + + memset(&result, 0, sizeof(struct dlm_lock_result)); + result.version[0] = DLM_DEVICE_VERSION_MAJOR; + result.version[1] = DLM_DEVICE_VERSION_MINOR; + result.version[2] = DLM_DEVICE_VERSION_PATCH; + memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr)); + result.user_lksb = 
ua->user_lksb; + + /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated + in a conversion unless the conversion is successful. See code + in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though, + notes that a new blocking AST address and parameter are set even if + the conversion fails, so maybe we should just do that. */ + + if (flags & DLM_CB_BAST) { + result.user_astaddr = ua->bastaddr; + result.user_astparam = ua->bastparam; + result.bast_mode = mode; + } else { + result.user_astaddr = ua->castaddr; + result.user_astparam = ua->castparam; + } + +#ifdef CONFIG_COMPAT + if (compat) + len = sizeof(struct dlm_lock_result32); + else +#endif + len = sizeof(struct dlm_lock_result); + struct_len = len; + + /* copy lvb to userspace if there is one, it's been updated, and + the user buffer has space for it */ + + if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { + if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, + DLM_USER_LVB_LEN)) { + error = -EFAULT; + goto out; + } + + result.lvb_offset = len; + len += DLM_USER_LVB_LEN; + } + + result.length = len; + resultptr = &result; +#ifdef CONFIG_COMPAT + if (compat) { + compat_output(&result, &result32); + resultptr = &result32; + } +#endif + + if (copy_to_user(buf, resultptr, struct_len)) + error = -EFAULT; + else + error = len; + out: + return error; +} + +static int copy_version_to_user(char __user *buf, size_t count) +{ + struct dlm_device_version ver; + + memset(&ver, 0, sizeof(struct dlm_device_version)); + ver.version[0] = DLM_DEVICE_VERSION_MAJOR; + ver.version[1] = DLM_DEVICE_VERSION_MINOR; + ver.version[2] = DLM_DEVICE_VERSION_PATCH; + + if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) + return -EFAULT; + return sizeof(struct dlm_device_version); +} + +/* a read returns a single ast described in a struct dlm_lock_result */ + +static ssize_t device_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_lkb *lkb; + DECLARE_WAITQUEUE(wait, current); + struct dlm_callback *cb; + int rv, copy_lvb = 0; + int old_mode, new_mode; + + if (count == sizeof(struct dlm_device_version)) { + rv = copy_version_to_user(buf, count); + return rv; + } + + if (!proc) { + log_print("non-version read from control device %zu", count); + return -EINVAL; + } + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_lock_result32)) +#else + if (count < sizeof(struct dlm_lock_result)) +#endif + return -EINVAL; + + try_another: + + /* do we really need this? can a read happen after a close? 
*/ + if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) + return -EINVAL; + + spin_lock(&proc->asts_spin); + if (list_empty(&proc->asts)) { + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&proc->asts_spin); + return -EAGAIN; + } + + add_wait_queue(&proc->wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&proc->asts) && !signal_pending(current)) { + spin_unlock(&proc->asts_spin); + schedule(); + spin_lock(&proc->asts_spin); + goto repeat; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&proc->wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&proc->asts_spin); + return -ERESTARTSYS; + } + } + + /* if we empty lkb_callbacks, we don't want to unlock the spinlock + without removing lkb_cb_list; so empty lkb_cb_list is always + consistent with empty lkb_callbacks */ + + lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); + + /* rem_lkb_callback sets a new lkb_last_cast */ + old_mode = lkb->lkb_last_cast->mode; + + rv = dlm_dequeue_lkb_callback(lkb, &cb); + switch (rv) { + case DLM_DEQUEUE_CALLBACK_EMPTY: + /* this shouldn't happen; lkb should have been removed from + * list when last item was dequeued + */ + log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); + list_del_init(&lkb->lkb_cb_list); + spin_unlock(&proc->asts_spin); + /* removes ref for proc->asts, may cause lkb to be freed */ + dlm_put_lkb(lkb); + WARN_ON_ONCE(1); + goto try_another; + case DLM_DEQUEUE_CALLBACK_LAST: + list_del_init(&lkb->lkb_cb_list); + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); + break; + case DLM_DEQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; + } + spin_unlock(&proc->asts_spin); + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + new_mode = cb->mode; + + if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr && + dlm_lvb_operations[old_mode + 1][new_mode + 1]) + copy_lvb = 1; + + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; + trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); + } + + rv = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + cb->flags, cb->mode, copy_lvb, buf, count); + + kref_put(&cb->ref, dlm_release_callback); + + /* removes ref for proc->asts, may cause lkb to be freed */ + if (rv == DLM_DEQUEUE_CALLBACK_LAST) + dlm_put_lkb(lkb); + + return rv; +} + +static __poll_t device_poll(struct file *file, poll_table *wait) +{ + struct dlm_user_proc *proc = file->private_data; + + poll_wait(file, &proc->wait, wait); + + spin_lock(&proc->asts_spin); + if (!list_empty(&proc->asts)) { + spin_unlock(&proc->asts_spin); + return EPOLLIN | EPOLLRDNORM; + } + spin_unlock(&proc->asts_spin); + return 0; +} + +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 
1 : 0; +} + +static int ctl_device_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static int ctl_device_close(struct inode *inode, struct file *file) +{ + return 0; +} + +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + +static const struct file_operations device_fops = { + .open = device_open, + .release = device_close, + .read = device_read, + .write = device_write, + .poll = device_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static const struct file_operations ctl_device_fops = { + .open = ctl_device_open, + .release = ctl_device_close, + .read = device_read, + .write = device_write, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ctl_device = { + .name = "dlm-control", + .fops = &ctl_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +int __init dlm_user_init(void) +{ + int error; + + atomic_set(&dlm_monitor_opened, 0); + + error = misc_register(&ctl_device); + if (error) { + log_print("misc_register failed for control device"); + goto out; + } + + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: + return error; +} + +void dlm_user_exit(void) +{ + misc_deregister(&ctl_device); + misc_deregister(&monitor_device); +} + diff --git a/fs/dlm/user.h b/fs/dlm/user.h new file mode 100644 index 0000000000..2caf8e6e24 --- /dev/null +++ b/fs/dlm/user.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + */ + +#ifndef __USER_DOT_H__ +#define __USER_DOT_H__ + +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +int dlm_user_init(void); +void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); + +#endif diff --git a/fs/dlm/util.c b/fs/dlm/util.c new file mode 100644 index 0000000000..f2bc401f31 --- /dev/null +++ b/fs/dlm/util.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 
+** +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "rcom.h" +#include "util.h" + +#define DLM_ERRNO_EDEADLK 35 +#define DLM_ERRNO_EBADR 53 +#define DLM_ERRNO_EBADSLT 57 +#define DLM_ERRNO_EPROTO 71 +#define DLM_ERRNO_EOPNOTSUPP 95 +#define DLM_ERRNO_ETIMEDOUT 110 +#define DLM_ERRNO_EINPROGRESS 115 + +/* higher errno values are inconsistent across architectures, so select + one set of values for on the wire */ + +int to_dlm_errno(int err) +{ + switch (err) { + case -EDEADLK: + return -DLM_ERRNO_EDEADLK; + case -EBADR: + return -DLM_ERRNO_EBADR; + case -EBADSLT: + return -DLM_ERRNO_EBADSLT; + case -EPROTO: + return -DLM_ERRNO_EPROTO; + case -EOPNOTSUPP: + return -DLM_ERRNO_EOPNOTSUPP; + case -ETIMEDOUT: + return -DLM_ERRNO_ETIMEDOUT; + case -EINPROGRESS: + return -DLM_ERRNO_EINPROGRESS; + } + return err; +} + +int from_dlm_errno(int err) +{ + switch (err) { + case -DLM_ERRNO_EDEADLK: + return -EDEADLK; + case -DLM_ERRNO_EBADR: + return -EBADR; + case -DLM_ERRNO_EBADSLT: + return -EBADSLT; + case -DLM_ERRNO_EPROTO: + return -EPROTO; + case -DLM_ERRNO_EOPNOTSUPP: + return -EOPNOTSUPP; + case -DLM_ERRNO_ETIMEDOUT: + return -ETIMEDOUT; + case -DLM_ERRNO_EINPROGRESS: + return -EINPROGRESS; + } + return err; +} diff --git a/fs/dlm/util.h b/fs/dlm/util.h new file mode 100644 index 0000000000..b6a4b8adca --- /dev/null +++ b/fs/dlm/util.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +int to_dlm_errno(int err); +int from_dlm_errno(int err); + +#endif + |
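
The control node's write/read protocol above (check_version(), device_write(), copy_version_to_user(), device_create_lockspace()) can be exercised directly from userspace. The sketch below is illustrative only and is not part of the fs/dlm sources added by this commit: it assumes the misc device is exposed as /dev/dlm-control (udev naming can differ), that the uapi header linux/dlm_device.h is installed, that dlm_controld is running with a configured cluster, and that the caller holds CAP_SYS_ADMIN. It reads the kernel's device ABI version and then creates a hypothetical lockspace named "example"; on success the write returns the minor number of the new per-lockspace device.

/* Illustrative sketch only; not part of fs/dlm. Builds against the
 * installed uapi header <linux/dlm_device.h>. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	struct dlm_device_version ver;
	struct dlm_write_request *req;
	const char *name = "example";		/* hypothetical lockspace name */
	int fd, minor;

	/* assumed device path; udev may place the node elsewhere */
	fd = open("/dev/dlm-control", O_RDWR);
	if (fd < 0)
		return 1;

	/* a read of exactly sizeof(struct dlm_device_version) returns the
	 * kernel's ABI version (copy_version_to_user() above) */
	if (read(fd, &ver, sizeof(ver)) != sizeof(ver))
		return 1;
	printf("dlm device ABI %u.%u.%u\n",
	       ver.version[0], ver.version[1], ver.version[2]);

	/* every write carries the caller's version for check_version() */
	req = calloc(1, sizeof(*req) + strlen(name));
	if (!req)
		return 1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->is64bit = (sizeof(void *) == 8);
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	memcpy(req->i.lspace.name, name, strlen(name));

	/* device_create_lockspace() returns the minor of the newly
	 * registered "dlm_<name>" misc device on success */
	minor = write(fd, req, sizeof(*req) + strlen(name));
	printf("lockspace \"%s\" -> minor %d\n", name, minor);

	free(req);
	close(fd);
	return 0;
}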
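
A per-lockspace device created this way accepts DLM_USER_LOCK writes (device_user_lock() above) and delivers each completion or blocking callback as one struct dlm_lock_result per read (device_read() above); libdlm normally hides this. The raw-ABI sketch below is again illustrative, not authoritative: the device path /dev/dlm_example, the resource name, and the choice of an EX lock are assumptions, while the dlm_lock_params field names follow the layout mirrored by the compat structures earlier in user.c.

/* Illustrative sketch only; raw use of the per-lockspace device ABI
 * that libdlm normally wraps. Path and resource name are assumptions. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>

int main(void)
{
	struct dlm_write_request *req;
	struct dlm_lock_result res;
	struct dlm_lksb lksb;
	const char *name = "demo-resource";	/* hypothetical resource name */
	int fd, lkid;

	/* the lockspace "example" registered above appears as a misc
	 * device named dlm_example; the /dev path depends on udev */
	fd = open("/dev/dlm_example", O_RDWR);
	if (fd < 0)
		return 1;

	req = calloc(1, sizeof(*req) + strlen(name));
	if (!req)
		return 1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->is64bit = (sizeof(void *) == 8);
	req->cmd = DLM_USER_LOCK;

	memset(&lksb, 0, sizeof(lksb));
	req->i.lock.mode = DLM_LOCK_EX;
	req->i.lock.namelen = strlen(name);
	req->i.lock.lksb = &lksb;
	/* device_user_lock() rejects a NULL castaddr/lksb; the kernel never
	 * dereferences castaddr, it is only echoed back in the result so a
	 * reader can dispatch the callback */
	req->i.lock.castaddr = (void *)&lksb;
	memcpy(req->i.lock.name, name, strlen(name));

	/* for a new request the write returns the lock id */
	lkid = write(fd, req, sizeof(*req) + strlen(name));

	/* each read returns one callback as a struct dlm_lock_result;
	 * this blocks until the completion AST for the grant arrives */
	if (read(fd, &res, sizeof(res)) < 0)
		return 1;
	printf("lkid %x: sb_status %d sb_lkid %x\n",
	       lkid, res.lksb.sb_status, res.lksb.sb_lkid);

	free(req);
	close(fd);	/* device_close() clears any locks still held */
	return 0;
}

Note that the lockspace fd doubles as the callback channel: device_poll() makes it usable with poll()/epoll to wait for pending ASTs, and closing the fd runs device_close(), which clears whatever locks the process still holds.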
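
The util.c helpers at the end exist because errno numbering above the low common values differs between architectures, so one fixed set is put on the wire by to_dlm_errno() and translated back by from_dlm_errno() on receipt. The short host-side check below only restates that table to show why the translation matters; it is not kernel code.

/* Illustrative only: the wire values chosen by to_dlm_errno() above,
 * compared with this architecture's native errno numbers. On x86-64
 * the two sets happen to coincide; on architectures with different
 * errno numbering the natives differ, which is exactly why a fixed
 * on-wire set is needed. */
#include <stdio.h>
#include <errno.h>

int main(void)
{
	static const struct { int native; int wire; const char *name; } tab[] = {
		{ EDEADLK,	35,  "EDEADLK" },
		{ EBADR,	53,  "EBADR" },
		{ EBADSLT,	57,  "EBADSLT" },
		{ EPROTO,	71,  "EPROTO" },
		{ EOPNOTSUPP,	95,  "EOPNOTSUPP" },
		{ ETIMEDOUT,	110, "ETIMEDOUT" },
		{ EINPROGRESS,	115, "EINPROGRESS" },
	};

	for (unsigned i = 0; i < sizeof(tab) / sizeof(tab[0]); i++)
		printf("%-12s native %3d  on-wire %3d%s\n", tab[i].name,
		       tab[i].native, tab[i].wire,
		       tab[i].native == tab[i].wire ? "" : "  (differs)");
	return 0;
}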