Diffstat (limited to 'fs/dlm/lock.c')
-rw-r--r-- | fs/dlm/lock.c | 1068
1 file changed, 611 insertions, 457 deletions
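The patch below replaces the per-bucket rb-trees and their ls_rsbtbl[b].lock spinlocks with a single resource rhashtable (ls->ls_rsbtbl) protected by one BH-safe rwlock (ls->ls_rsbtbl_lock); keep/toss membership moves into the RSB_TOSS flag plus the ls_keep/ls_toss lists, and tossed rsbs are freed by a timer-driven queue instead of periodic bucket scans. The dlm_rhash_rsb_params table used by every lookup/insert/remove call in the diff is defined elsewhere in the series; a minimal sketch of what it plausibly contains, inferred from the zero-padded key and the res_node linkage visible below (the nelem_hint and automatic_shrinking values are assumptions):

	#include <linux/rhashtable.h>

	/* sketch only: the key is the fixed-width, zero-padded resource
	 * name (see the char key[DLM_RESNAME_MAXLEN] = {} lookup below),
	 * and the hash head is rsb->res_node (see rsb_insert() and the
	 * rhashtable_remove_fast() calls below)
	 */
	static const struct rhashtable_params dlm_rhash_rsb_params = {
		.nelem_hint		= 3,	/* assumption */
		.key_len		= DLM_RESNAME_MAXLEN,
		.key_offset		= offsetof(struct dlm_rsb, res_name),
		.head_offset		= offsetof(struct dlm_rsb, res_node),
		.automatic_shrinking	= true,	/* assumption */
	};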
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index fd752dd038..f103b8c305 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -201,7 +201,7 @@ void dlm_dump_rsb(struct dlm_rsb *r) /* Threads cannot use the lockspace while it's being recovered */ -static inline void dlm_lock_recovery(struct dlm_ls *ls) +void dlm_lock_recovery(struct dlm_ls *ls) { down_read(&ls->ls_in_recovery); } @@ -320,11 +320,18 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) * Basic operations on rsb's and lkb's */ +static inline unsigned long rsb_toss_jiffies(void) +{ + return jiffies + (READ_ONCE(dlm_config.ci_toss_secs) * HZ); +} + /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. */ static inline void hold_rsb(struct dlm_rsb *r) { + /* rsbs in toss state never get referenced */ + WARN_ON(rsb_flag(r, RSB_TOSS)); kref_get(&r->res_ref); } @@ -333,19 +340,48 @@ void dlm_hold_rsb(struct dlm_rsb *r) hold_rsb(r); } +/* TODO move this to lib/refcount.c */ +static __must_check bool +dlm_refcount_dec_and_write_lock_bh(refcount_t *r, rwlock_t *lock) +__cond_acquires(lock) +{ + if (refcount_dec_not_one(r)) + return false; + + write_lock_bh(lock); + if (!refcount_dec_and_test(r)) { + write_unlock_bh(lock); + return false; + } + + return true; +} + +/* TODO move this to include/linux/kref.h */ +static inline int dlm_kref_put_write_lock_bh(struct kref *kref, + void (*release)(struct kref *kref), + rwlock_t *lock) +{ + if (dlm_refcount_dec_and_write_lock_bh(&kref->refcount, lock)) { + release(kref); + return 1; + } + + return 0; +} + /* When all references to the rsb are gone it's transferred to the tossed list for later disposal. */ static void put_rsb(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; - uint32_t bucket = r->res_bucket; int rv; - rv = kref_put_lock(&r->res_ref, toss_rsb, - &ls->ls_rsbtbl[bucket].lock); + rv = dlm_kref_put_write_lock_bh(&r->res_ref, toss_rsb, + &ls->ls_rsbtbl_lock); if (rv) - spin_unlock(&ls->ls_rsbtbl[bucket].lock); + write_unlock_bh(&ls->ls_rsbtbl_lock); } void dlm_put_rsb(struct dlm_rsb *r) @@ -358,17 +394,17 @@ static int pre_rsb_struct(struct dlm_ls *ls) struct dlm_rsb *r1, *r2; int count = 0; - spin_lock(&ls->ls_new_rsb_spin); + spin_lock_bh(&ls->ls_new_rsb_spin); if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) { - spin_unlock(&ls->ls_new_rsb_spin); + spin_unlock_bh(&ls->ls_new_rsb_spin); return 0; } - spin_unlock(&ls->ls_new_rsb_spin); + spin_unlock_bh(&ls->ls_new_rsb_spin); r1 = dlm_allocate_rsb(ls); r2 = dlm_allocate_rsb(ls); - spin_lock(&ls->ls_new_rsb_spin); + spin_lock_bh(&ls->ls_new_rsb_spin); if (r1) { list_add(&r1->res_hashchain, &ls->ls_new_rsb); ls->ls_new_rsb_count++; @@ -378,13 +414,236 @@ static int pre_rsb_struct(struct dlm_ls *ls) ls->ls_new_rsb_count++; } count = ls->ls_new_rsb_count; - spin_unlock(&ls->ls_new_rsb_spin); + spin_unlock_bh(&ls->ls_new_rsb_spin); if (!count) return -ENOMEM; return 0; } +/* connected with timer_delete_sync() in dlm_ls_stop() to stop + * new timers when recovery is triggered and don't run them + * again until a dlm_timer_resume() tries it again. + */ +static void __rsb_mod_timer(struct dlm_ls *ls, unsigned long jiffies) +{ + if (!dlm_locking_stopped(ls)) + mod_timer(&ls->ls_timer, jiffies); +} + +/* This function tries to resume the timer callback if a rsb + * is on the toss list and no timer is pending. 
It might be that
+ * the first entry is currently being executed as the timer
+ * callback, but we don't care if a timer is queued up again and
+ * then does nothing. Should be a rare case.
+ */
+void dlm_timer_resume(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+
+	spin_lock_bh(&ls->ls_toss_q_lock);
+	r = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
+				     res_toss_q_list);
+	if (r && !timer_pending(&ls->ls_timer))
+		__rsb_mod_timer(ls, r->res_toss_time);
+	spin_unlock_bh(&ls->ls_toss_q_lock);
+}
+
+/* ls_rsbtbl_lock must be held, and the caller must be sure the rsb
+ * is in toss state.
+ */
+static void rsb_delete_toss_timer(struct dlm_ls *ls, struct dlm_rsb *r)
+{
+	struct dlm_rsb *first;
+
+	spin_lock_bh(&ls->ls_toss_q_lock);
+	r->res_toss_time = 0;
+
+	/* if the rsb is not queued do nothing */
+	if (list_empty(&r->res_toss_q_list))
+		goto out;
+
+	/* get the first element before delete */
+	first = list_first_entry(&ls->ls_toss_q, struct dlm_rsb,
+				 res_toss_q_list);
+	list_del_init(&r->res_toss_q_list);
+	/* check if the first element was the rsb we deleted */
+	if (first == r) {
+		/* try to get the new first element; if the list
+		 * is empty now, try to delete the timer. If we are
+		 * too late we don't care.
+		 *
+		 * if the list isn't empty and a new first element got
+		 * in place, set the new timer expire time.
+		 */
+		first = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
+						 res_toss_q_list);
+		if (!first)
+			timer_delete(&ls->ls_timer);
+		else
+			__rsb_mod_timer(ls, first->res_toss_time);
+	}
+
+out:
+	spin_unlock_bh(&ls->ls_toss_q_lock);
+}
+
+/* The caller must hold ls_rsbtbl_lock. This needs to be called every
+ * time the rsb enters toss state or a tossed rsb changes its
+ * dir/master nodeid.
+ */
+static void rsb_mod_timer(struct dlm_ls *ls, struct dlm_rsb *r)
+{
+	int our_nodeid = dlm_our_nodeid();
+	struct dlm_rsb *first;
+
+	/* If we're the directory record for this rsb, and
+	 * we're not the master of it, then we need to wait
+	 * for the master node to send us a dir remove
+	 * before removing the dir record.
+	 */
+	if (!dlm_no_directory(ls) &&
+	    (r->res_master_nodeid != our_nodeid) &&
+	    (dlm_dir_nodeid(r) == our_nodeid)) {
+		rsb_delete_toss_timer(ls, r);
+		return;
+	}
+
+	spin_lock_bh(&ls->ls_toss_q_lock);
+	/* set the new absolute expire time in the rsb */
+	r->res_toss_time = rsb_toss_jiffies();
+	if (list_empty(&ls->ls_toss_q)) {
+		/* if the queue is empty, add the element and its
+		 * expire time becomes our new timer expiry
+		 */
+		list_add_tail(&r->res_toss_q_list, &ls->ls_toss_q);
+		__rsb_mod_timer(ls, r->res_toss_time);
+	} else {
+		/* if the rsb was already queued, delete it from
+		 * the toss queue first
+		 */
+		if (!list_empty(&r->res_toss_q_list))
+			list_del(&r->res_toss_q_list);
+
+		/* look up the (possibly new) first element, then append
+		 * this rsb, which now has the newest expire time, to the
+		 * end of the queue. If the list was empty before, this
+		 * rsb's expire time is our next expiration; if it wasn't,
+		 * the new first element's expire time is.
+		 */
+		first = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
+						 res_toss_q_list);
+		list_add_tail(&r->res_toss_q_list, &ls->ls_toss_q);
+		if (!first)
+			__rsb_mod_timer(ls, r->res_toss_time);
+		else
+			__rsb_mod_timer(ls, first->res_toss_time);
+	}
+	spin_unlock_bh(&ls->ls_toss_q_lock);
+}
+
+/* if we hit contention, we retry the trylock 250 ms later. If any
+ * other mod_timer() in between sets an earlier expiry we don't care;
+ * this is only for the unlikely case that nothing happens in that
+ * time.
+ */
+#define DLM_TOSS_TIMER_RETRY (jiffies + msecs_to_jiffies(250))
+
+void dlm_rsb_toss_timer(struct timer_list *timer)
+{
+	struct dlm_ls *ls = from_timer(ls, timer, ls_timer);
+	int our_nodeid = dlm_our_nodeid();
+	struct dlm_rsb *r;
+	int rv;
+
+	while (1) {
+		/* interruption point to leave the iteration when recovery
+		 * waits in timer_delete_sync(); recovery will take care
+		 * of deleting everything on the toss queue.
+		 */
+		if (dlm_locking_stopped(ls))
+			break;
+
+		rv = spin_trylock(&ls->ls_toss_q_lock);
+		if (!rv) {
+			/* rearm the retry timer */
+			__rsb_mod_timer(ls, DLM_TOSS_TIMER_RETRY);
+			break;
+		}
+
+		r = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
+					     res_toss_q_list);
+		if (!r) {
+			/* nothing to do anymore; the next rsb to be
+			 * queued will set the next mod_timer() expiry.
+			 */
+			spin_unlock(&ls->ls_toss_q_lock);
+			break;
+		}
+
+		/* if the first rsb hasn't expired yet, stop freeing
+		 * rsbs from the toss queue, as the queue is ordered by
+		 * ascending absolute res_toss_time jiffies
+		 */
+		if (time_before(jiffies, r->res_toss_time)) {
+			/* rearm with the next rsb to expire in the future */
+			__rsb_mod_timer(ls, r->res_toss_time);
+			spin_unlock(&ls->ls_toss_q_lock);
+			break;
+		}
+
+		/* find_rsb_dir/nodir take these two locks in the reverse
+		 * order; since this is only a trylock, on possible
+		 * contention we simply try again.
+		 *
+		 * Taking this lock while holding ls_toss_q_lock ensures
+		 * that rsb_delete_toss_timer() and rsb_mod_timer() cannot
+		 * run after this timer callback deletes the rsb from the
+		 * ls_toss_q. The other holders always get priority to run,
+		 * as this is only cache handling and the other holders
+		 * might be about to take this rsb out of the toss state.
+		 */
+		rv = write_trylock(&ls->ls_rsbtbl_lock);
+		if (!rv) {
+			spin_unlock(&ls->ls_toss_q_lock);
+			/* rearm the retry timer */
+			__rsb_mod_timer(ls, DLM_TOSS_TIMER_RETRY);
+			break;
+		}
+
+		list_del(&r->res_rsbs_list);
+		rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node,
+				       dlm_rhash_rsb_params);
+
+		/* it's not necessary to hold the ls_rsbtbl_lock when
+		 * calling send_remove()
+		 */
+		write_unlock(&ls->ls_rsbtbl_lock);
+
+		/* remove the rsb from the toss queue; it's gone
+		 * from DLM now
+		 */
+		list_del_init(&r->res_toss_q_list);
+		spin_unlock(&ls->ls_toss_q_lock);
+
+		/* no rsb in this state should ever run a timer */
+		WARN_ON(!dlm_no_directory(ls) &&
+			(r->res_master_nodeid != our_nodeid) &&
+			(dlm_dir_nodeid(r) == our_nodeid));
+
+		/* We're the master of this rsb but we're not
+		 * the directory record, so we need to tell the
+		 * dir node to remove the dir record
+		 */
+		if (!dlm_no_directory(ls) &&
+		    (r->res_master_nodeid == our_nodeid) &&
+		    (dlm_dir_nodeid(r) != our_nodeid))
+			send_remove(r);
+
+		free_toss_rsb(r);
+	}
+}
+
 /* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can
    unlock any spinlocks, go back and call pre_rsb_struct again.
    Otherwise, take an rsb off the list and return it.
*/ @@ -395,10 +654,10 @@ static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, struct dlm_rsb *r; int count; - spin_lock(&ls->ls_new_rsb_spin); + spin_lock_bh(&ls->ls_new_rsb_spin); if (list_empty(&ls->ls_new_rsb)) { count = ls->ls_new_rsb_count; - spin_unlock(&ls->ls_new_rsb_spin); + spin_unlock_bh(&ls->ls_new_rsb_spin); log_debug(ls, "find_rsb retry %d %d %s", count, dlm_config.ci_new_rsb_count, (const char *)name); @@ -407,88 +666,44 @@ static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); list_del(&r->res_hashchain); - /* Convert the empty list_head to a NULL rb_node for tree usage: */ - memset(&r->res_hashnode, 0, sizeof(struct rb_node)); ls->ls_new_rsb_count--; - spin_unlock(&ls->ls_new_rsb_spin); + spin_unlock_bh(&ls->ls_new_rsb_spin); r->res_ls = ls; r->res_length = len; memcpy(r->res_name, name, len); - mutex_init(&r->res_mutex); + spin_lock_init(&r->res_lock); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); INIT_LIST_HEAD(&r->res_waitqueue); INIT_LIST_HEAD(&r->res_root_list); + INIT_LIST_HEAD(&r->res_toss_q_list); INIT_LIST_HEAD(&r->res_recover_list); + INIT_LIST_HEAD(&r->res_masters_list); *r_ret = r; return 0; } -static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +int dlm_search_rsb_tree(struct rhashtable *rhash, const void *name, int len, + struct dlm_rsb **r_ret) { - char maxname[DLM_RESNAME_MAXLEN]; + char key[DLM_RESNAME_MAXLEN] = {}; - memset(maxname, 0, DLM_RESNAME_MAXLEN); - memcpy(maxname, name, nlen); - return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); -} + memcpy(key, name, len); + *r_ret = rhashtable_lookup_fast(rhash, &key, dlm_rhash_rsb_params); + if (*r_ret) + return 0; -int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, - struct dlm_rsb **r_ret) -{ - struct rb_node *node = tree->rb_node; - struct dlm_rsb *r; - int rc; - - while (node) { - r = rb_entry(node, struct dlm_rsb, res_hashnode); - rc = rsb_cmp(r, name, len); - if (rc < 0) - node = node->rb_left; - else if (rc > 0) - node = node->rb_right; - else - goto found; - } - *r_ret = NULL; return -EBADR; - - found: - *r_ret = r; - return 0; } -static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +static int rsb_insert(struct dlm_rsb *rsb, struct rhashtable *rhash) { - struct rb_node **newn = &tree->rb_node; - struct rb_node *parent = NULL; - int rc; - - while (*newn) { - struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, - res_hashnode); - - parent = *newn; - rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); - if (rc < 0) - newn = &parent->rb_left; - else if (rc > 0) - newn = &parent->rb_right; - else { - log_print("rsb_insert match"); - dlm_dump_rsb(rsb); - dlm_dump_rsb(cur); - return -EEXIST; - } - } - - rb_link_node(&rsb->res_hashnode, parent, newn); - rb_insert_color(&rsb->res_hashnode, tree); - return 0; + return rhashtable_insert_fast(rhash, &rsb->res_node, + dlm_rhash_rsb_params); } /* @@ -536,8 +751,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) */ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, - uint32_t hash, uint32_t b, - int dir_nodeid, int from_nodeid, + uint32_t hash, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) { struct dlm_rsb *r = NULL; @@ -584,24 +798,46 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, goto out; } - spin_lock(&ls->ls_rsbtbl[b].lock); + retry_lookup: - error = 
dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (error) - goto do_toss; + /* check if the rsb is in keep state under read lock - likely path */ + read_lock_bh(&ls->ls_rsbtbl_lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); + if (error) { + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto do_new; + } /* * rsb is active, so we can't check master_nodeid without lock_rsb. */ + if (rsb_flag(r, RSB_TOSS)) { + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto do_toss; + } + kref_get(&r->res_ref); - goto out_unlock; + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto out; do_toss: - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (error) + write_lock_bh(&ls->ls_rsbtbl_lock); + + /* retry lookup under write lock to see if its still in toss state + * if not it's in keep state and we relookup - unlikely path. + */ + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); + if (!error) { + if (!rsb_flag(r, RSB_TOSS)) { + write_unlock_bh(&ls->ls_rsbtbl_lock); + goto retry_lookup; + } + } else { + write_unlock_bh(&ls->ls_rsbtbl_lock); goto do_new; + } /* * rsb found inactive (master_nodeid may be out of date unless @@ -616,8 +852,9 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s", from_nodeid, r->res_master_nodeid, dir_nodeid, r->res_name); + write_unlock_bh(&ls->ls_rsbtbl_lock); error = -ENOTBLK; - goto out_unlock; + goto out; } if ((r->res_master_nodeid != our_nodeid) && from_dir) { @@ -639,9 +876,17 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, r->res_first_lkid = 0; } - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); - goto out_unlock; + list_move(&r->res_rsbs_list, &ls->ls_keep); + rsb_clear_flag(r, RSB_TOSS); + /* rsb got out of toss state, it becomes alive again + * and we reinit the reference counter that is only + * valid for keep state rsbs + */ + kref_init(&r->res_ref); + rsb_delete_toss_timer(ls, r); + write_unlock_bh(&ls->ls_rsbtbl_lock); + + goto out; do_new: @@ -650,18 +895,15 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, */ if (error == -EBADR && !create) - goto out_unlock; + goto out; error = get_rsb_struct(ls, name, len, &r); - if (error == -EAGAIN) { - spin_unlock(&ls->ls_rsbtbl[b].lock); + if (error == -EAGAIN) goto retry; - } if (error) - goto out_unlock; + goto out; r->res_hash = hash; - r->res_bucket = b; r->res_dir_nodeid = dir_nodeid; kref_init(&r->res_ref); @@ -681,7 +923,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, dlm_free_rsb(r); r = NULL; error = -ENOTBLK; - goto out_unlock; + goto out; } if (from_other) { @@ -701,9 +943,20 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, } out_add: - error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); - out_unlock: - spin_unlock(&ls->ls_rsbtbl[b].lock); + + write_lock_bh(&ls->ls_rsbtbl_lock); + error = rsb_insert(r, &ls->ls_rsbtbl); + if (error == -EEXIST) { + /* somebody else was faster and it seems the + * rsb exists now, we do a whole relookup + */ + write_unlock_bh(&ls->ls_rsbtbl_lock); + dlm_free_rsb(r); + goto retry_lookup; + } else if (!error) { + list_add(&r->res_rsbs_list, &ls->ls_keep); + } + write_unlock_bh(&ls->ls_rsbtbl_lock); out: *r_ret = r; return error; @@ -714,8 +967,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, dlm_recover_masters). 
*/ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, - uint32_t hash, uint32_t b, - int dir_nodeid, int from_nodeid, + uint32_t hash, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) { struct dlm_rsb *r = NULL; @@ -728,24 +980,48 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, if (error < 0) goto out; - spin_lock(&ls->ls_rsbtbl[b].lock); + retry_lookup: - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (error) + /* check if the rsb is in keep state under read lock - likely path */ + read_lock_bh(&ls->ls_rsbtbl_lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); + if (error) { + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto do_new; + } + + if (rsb_flag(r, RSB_TOSS)) { + read_unlock_bh(&ls->ls_rsbtbl_lock); goto do_toss; + } /* * rsb is active, so we can't check master_nodeid without lock_rsb. */ kref_get(&r->res_ref); - goto out_unlock; + read_unlock_bh(&ls->ls_rsbtbl_lock); + + goto out; do_toss: - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (error) + write_lock_bh(&ls->ls_rsbtbl_lock); + + /* retry lookup under write lock to see if its still in toss state + * if not it's in keep state and we relookup - unlikely path. + */ + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); + if (!error) { + if (!rsb_flag(r, RSB_TOSS)) { + write_unlock_bh(&ls->ls_rsbtbl_lock); + goto retry_lookup; + } + } else { + write_unlock_bh(&ls->ls_rsbtbl_lock); goto do_new; + } + /* * rsb found inactive. No other thread is using this rsb because @@ -759,8 +1035,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d", from_nodeid, r->res_master_nodeid, dir_nodeid); dlm_print_rsb(r); + write_unlock_bh(&ls->ls_rsbtbl_lock); error = -ENOTBLK; - goto out_unlock; + goto out; } if (!recover && (r->res_master_nodeid != our_nodeid) && @@ -774,9 +1051,17 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, r->res_nodeid = 0; } - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); - goto out_unlock; + list_move(&r->res_rsbs_list, &ls->ls_keep); + rsb_clear_flag(r, RSB_TOSS); + /* rsb got out of toss state, it becomes alive again + * and we reinit the reference counter that is only + * valid for keep state rsbs + */ + kref_init(&r->res_ref); + rsb_delete_toss_timer(ls, r); + write_unlock_bh(&ls->ls_rsbtbl_lock); + + goto out; do_new: @@ -786,22 +1071,31 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, error = get_rsb_struct(ls, name, len, &r); if (error == -EAGAIN) { - spin_unlock(&ls->ls_rsbtbl[b].lock); goto retry; } if (error) - goto out_unlock; + goto out; r->res_hash = hash; - r->res_bucket = b; r->res_dir_nodeid = dir_nodeid; r->res_master_nodeid = dir_nodeid; r->res_nodeid = (dir_nodeid == our_nodeid) ? 
0 : dir_nodeid; kref_init(&r->res_ref); - error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); - out_unlock: - spin_unlock(&ls->ls_rsbtbl[b].lock); + write_lock_bh(&ls->ls_rsbtbl_lock); + error = rsb_insert(r, &ls->ls_rsbtbl); + if (error == -EEXIST) { + /* somebody else was faster and it seems the + * rsb exists now, we do a whole relookup + */ + write_unlock_bh(&ls->ls_rsbtbl_lock); + dlm_free_rsb(r); + goto retry_lookup; + } else if (!error) { + list_add(&r->res_rsbs_list, &ls->ls_keep); + } + write_unlock_bh(&ls->ls_rsbtbl_lock); + out: *r_ret = r; return error; @@ -811,23 +1105,21 @@ static int find_rsb(struct dlm_ls *ls, const void *name, int len, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) { - uint32_t hash, b; int dir_nodeid; + uint32_t hash; if (len > DLM_RESNAME_MAXLEN) return -EINVAL; hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - dir_nodeid = dlm_hash2nodeid(ls, hash); if (dlm_no_directory(ls)) - return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid, + return find_rsb_nodir(ls, name, len, hash, dir_nodeid, from_nodeid, flags, r_ret); else - return find_rsb_dir(ls, name, len, hash, b, dir_nodeid, - from_nodeid, flags, r_ret); + return find_rsb_dir(ls, name, len, hash, dir_nodeid, + from_nodeid, flags, r_ret); } /* we have received a request and found that res_master_nodeid != our_nodeid, @@ -988,7 +1280,7 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, int len, unsigned int flags, int *r_nodeid, int *result) { struct dlm_rsb *r = NULL; - uint32_t hash, b; + uint32_t hash; int our_nodeid = dlm_our_nodeid(); int dir_nodeid, error; @@ -1002,8 +1294,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, } hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - dir_nodeid = dlm_hash2nodeid(ls, hash); if (dir_nodeid != our_nodeid) { log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d", @@ -1018,15 +1308,23 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, if (error < 0) return error; - spin_lock(&ls->ls_rsbtbl[b].lock); - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + retry_lookup: + + /* check if the rsb is in keep state under read lock - likely path */ + read_lock_bh(&ls->ls_rsbtbl_lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); if (!error) { + if (rsb_flag(r, RSB_TOSS)) { + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto do_toss; + } + /* because the rsb is active, we need to lock_rsb before * checking/changing re_master_nodeid */ hold_rsb(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); + read_unlock_bh(&ls->ls_rsbtbl_lock); lock_rsb(r); __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false, @@ -1037,11 +1335,31 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, put_rsb(r); return 0; + } else { + read_unlock_bh(&ls->ls_rsbtbl_lock); + goto not_found; } - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (error) + do_toss: + /* unlikely path - relookup under write */ + write_lock_bh(&ls->ls_rsbtbl_lock); + + /* rsb_mod_timer() requires to held ls_rsbtbl_lock in write lock + * check if the rsb is still in toss state, if not relookup + */ + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); + if (!error) { + if (!rsb_flag(r, RSB_TOSS)) { + write_unlock_bh(&ls->ls_rsbtbl_lock); + /* something as changed, very unlikely but + * try again + */ + goto retry_lookup; + } + } else { + write_unlock_bh(&ls->ls_rsbtbl_lock); goto not_found; + } /* because 
the rsb is inactive (on toss list), it's not refcounted * and lock_rsb is not used, but is protected by the rsbtbl lock @@ -1050,83 +1368,78 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags, r_nodeid, result); - r->res_toss_time = jiffies; + rsb_mod_timer(ls, r); /* the rsb was inactive (on toss list) */ - spin_unlock(&ls->ls_rsbtbl[b].lock); + write_unlock_bh(&ls->ls_rsbtbl_lock); return 0; not_found: error = get_rsb_struct(ls, name, len, &r); - if (error == -EAGAIN) { - spin_unlock(&ls->ls_rsbtbl[b].lock); + if (error == -EAGAIN) goto retry; - } if (error) - goto out_unlock; + goto out; r->res_hash = hash; - r->res_bucket = b; r->res_dir_nodeid = our_nodeid; r->res_master_nodeid = from_nodeid; r->res_nodeid = from_nodeid; kref_init(&r->res_ref); - r->res_toss_time = jiffies; + rsb_set_flag(r, RSB_TOSS); - error = rsb_insert(r, &ls->ls_rsbtbl[b].toss); - if (error) { + write_lock_bh(&ls->ls_rsbtbl_lock); + error = rsb_insert(r, &ls->ls_rsbtbl); + if (error == -EEXIST) { + /* somebody else was faster and it seems the + * rsb exists now, we do a whole relookup + */ + write_unlock_bh(&ls->ls_rsbtbl_lock); + dlm_free_rsb(r); + goto retry_lookup; + } else if (error) { + write_unlock_bh(&ls->ls_rsbtbl_lock); /* should never happen */ dlm_free_rsb(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); goto retry; } + list_add(&r->res_rsbs_list, &ls->ls_toss); + rsb_mod_timer(ls, r); + write_unlock_bh(&ls->ls_rsbtbl_lock); + if (result) *result = DLM_LU_ADD; *r_nodeid = from_nodeid; - out_unlock: - spin_unlock(&ls->ls_rsbtbl[b].lock); + out: return error; } static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) { - struct rb_node *n; struct dlm_rsb *r; - int i; - for (i = 0; i < ls->ls_rsbtbl_size; i++) { - spin_lock(&ls->ls_rsbtbl[i].lock); - for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { - r = rb_entry(n, struct dlm_rsb, res_hashnode); - if (r->res_hash == hash) - dlm_dump_rsb(r); - } - spin_unlock(&ls->ls_rsbtbl[i].lock); + read_lock_bh(&ls->ls_rsbtbl_lock); + list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) { + if (r->res_hash == hash) + dlm_dump_rsb(r); } + read_unlock_bh(&ls->ls_rsbtbl_lock); } void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len) { struct dlm_rsb *r = NULL; - uint32_t hash, b; int error; - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - spin_lock(&ls->ls_rsbtbl[b].lock); - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + read_lock_bh(&ls->ls_rsbtbl_lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); if (!error) - goto out_dump; - - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (error) goto out; - out_dump: + dlm_dump_rsb(r); out: - spin_unlock(&ls->ls_rsbtbl[b].lock); + read_unlock_bh(&ls->ls_rsbtbl_lock); } static void toss_rsb(struct kref *kref) @@ -1135,11 +1448,10 @@ static void toss_rsb(struct kref *kref) struct dlm_ls *ls = r->res_ls; DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); - kref_init(&r->res_ref); - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); - rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); - r->res_toss_time = jiffies; - set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[r->res_bucket].flags); + rsb_set_flag(r, RSB_TOSS); + list_move(&r->res_rsbs_list, &ls->ls_toss); + rsb_mod_timer(ls, r); + if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; @@ -1151,23 +1463,27 @@ static void toss_rsb(struct 
kref *kref) static void unhold_rsb(struct dlm_rsb *r) { int rv; + + /* rsbs in toss state never get referenced */ + WARN_ON(rsb_flag(r, RSB_TOSS)); rv = kref_put(&r->res_ref, toss_rsb); DLM_ASSERT(!rv, dlm_dump_rsb(r);); } -static void kill_rsb(struct kref *kref) +void free_toss_rsb(struct dlm_rsb *r) { - struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); - - /* All work is done after the return from kref_put() so we - can release the write_lock before the remove and free. */ + WARN_ON_ONCE(!rsb_flag(r, RSB_TOSS)); DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r);); DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r);); DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r);); DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r);); DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_toss_q_list), dlm_dump_rsb(r);); DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_masters_list), dlm_dump_rsb(r);); + + dlm_free_rsb(r); } /* Attaching/detaching lkb's from rsb's is for rsb reference counting. @@ -1197,24 +1513,20 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, if (!lkb) return -ENOMEM; - lkb->lkb_last_bast_mode = -1; + lkb->lkb_last_bast_cb_mode = DLM_LOCK_IV; + lkb->lkb_last_cast_cb_mode = DLM_LOCK_IV; + lkb->lkb_last_cb_mode = DLM_LOCK_IV; lkb->lkb_nodeid = -1; lkb->lkb_grmode = DLM_LOCK_IV; kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); - INIT_LIST_HEAD(&lkb->lkb_cb_list); - INIT_LIST_HEAD(&lkb->lkb_callbacks); - spin_lock_init(&lkb->lkb_cb_lock); - INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - idr_preload(GFP_NOFS); - spin_lock(&ls->ls_lkbidr_spin); + write_lock_bh(&ls->ls_lkbidr_lock); rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT); if (rv >= 0) lkb->lkb_id = rv; - spin_unlock(&ls->ls_lkbidr_spin); - idr_preload_end(); + write_unlock_bh(&ls->ls_lkbidr_lock); if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); @@ -1235,11 +1547,11 @@ static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - spin_lock(&ls->ls_lkbidr_spin); + read_lock_bh(&ls->ls_lkbidr_lock); lkb = idr_find(&ls->ls_lkbidr, lkid); if (lkb) kref_get(&lkb->lkb_ref); - spin_unlock(&ls->ls_lkbidr_spin); + read_unlock_bh(&ls->ls_lkbidr_lock); *lkb_ret = lkb; return lkb ? 
0 : -ENOENT; @@ -1263,11 +1575,11 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) uint32_t lkid = lkb->lkb_id; int rv; - rv = kref_put_lock(&lkb->lkb_ref, kill_lkb, - &ls->ls_lkbidr_spin); + rv = dlm_kref_put_write_lock_bh(&lkb->lkb_ref, kill_lkb, + &ls->ls_lkbidr_lock); if (rv) { idr_remove(&ls->ls_lkbidr, lkid); - spin_unlock(&ls->ls_lkbidr_spin); + write_unlock_bh(&ls->ls_lkbidr_lock); detach_lkb(lkb); @@ -1408,7 +1720,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error = 0; - mutex_lock(&ls->ls_waiters_mutex); + spin_lock_bh(&ls->ls_waiters_lock); if (is_overlap_unlock(lkb) || (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) { @@ -1451,7 +1763,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) log_error(ls, "addwait error %x %d flags %x %d %d %s", lkb->lkb_id, error, dlm_iflags_val(lkb), mstype, lkb->lkb_wait_type, lkb->lkb_resource->res_name); - mutex_unlock(&ls->ls_waiters_mutex); + spin_unlock_bh(&ls->ls_waiters_lock); return error; } @@ -1551,14 +1863,18 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; - mutex_lock(&ls->ls_waiters_mutex); + spin_lock_bh(&ls->ls_waiters_lock); error = _remove_from_waiters(lkb, mstype, NULL); - mutex_unlock(&ls->ls_waiters_mutex); + spin_unlock_bh(&ls->ls_waiters_lock); return error; } /* Handles situations where we might be processing a "fake" or "local" reply in - which we can't try to take waiters_mutex again. */ + * the recovery context which stops any locking activity. Only debugfs might + * change the lockspace waiters but they will held the recovery lock to ensure + * remove_from_waiters_ms() in local case will be the only user manipulating the + * lockspace waiters in recovery context. + */ static int remove_from_waiters_ms(struct dlm_lkb *lkb, const struct dlm_message *ms, bool local) @@ -1567,159 +1883,16 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, int error; if (!local) - mutex_lock(&ls->ls_waiters_mutex); + spin_lock_bh(&ls->ls_waiters_lock); + else + WARN_ON_ONCE(!rwsem_is_locked(&ls->ls_in_recovery) || + !dlm_locking_stopped(ls)); error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms); if (!local) - mutex_unlock(&ls->ls_waiters_mutex); + spin_unlock_bh(&ls->ls_waiters_lock); return error; } -static void shrink_bucket(struct dlm_ls *ls, int b) -{ - struct rb_node *n, *next; - struct dlm_rsb *r; - char *name; - int our_nodeid = dlm_our_nodeid(); - int remote_count = 0; - int need_shrink = 0; - int i, len, rv; - - memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); - - spin_lock(&ls->ls_rsbtbl[b].lock); - - if (!test_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags)) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - return; - } - - for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { - next = rb_next(n); - r = rb_entry(n, struct dlm_rsb, res_hashnode); - - /* If we're the directory record for this rsb, and - we're not the master of it, then we need to wait - for the master node to send us a dir remove for - before removing the dir record. 
*/ - - if (!dlm_no_directory(ls) && - (r->res_master_nodeid != our_nodeid) && - (dlm_dir_nodeid(r) == our_nodeid)) { - continue; - } - - need_shrink = 1; - - if (!time_after_eq(jiffies, r->res_toss_time + - dlm_config.ci_toss_secs * HZ)) { - continue; - } - - if (!dlm_no_directory(ls) && - (r->res_master_nodeid == our_nodeid) && - (dlm_dir_nodeid(r) != our_nodeid)) { - - /* We're the master of this rsb but we're not - the directory record, so we need to tell the - dir node to remove the dir record. */ - - ls->ls_remove_lens[remote_count] = r->res_length; - memcpy(ls->ls_remove_names[remote_count], r->res_name, - DLM_RESNAME_MAXLEN); - remote_count++; - - if (remote_count >= DLM_REMOVE_NAMES_MAX) - break; - continue; - } - - if (!kref_put(&r->res_ref, kill_rsb)) { - log_error(ls, "tossed rsb in use %s", r->res_name); - continue; - } - - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - dlm_free_rsb(r); - } - - if (need_shrink) - set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); - else - clear_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - /* - * While searching for rsb's to free, we found some that require - * remote removal. We leave them in place and find them again here - * so there is a very small gap between removing them from the toss - * list and sending the removal. Keeping this gap small is - * important to keep us (the master node) from being out of sync - * with the remote dir node for very long. - */ - - for (i = 0; i < remote_count; i++) { - name = ls->ls_remove_names[i]; - len = ls->ls_remove_lens[i]; - - spin_lock(&ls->ls_rsbtbl[b].lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_debug(ls, "remove_name not toss %s", name); - continue; - } - - if (r->res_master_nodeid != our_nodeid) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_debug(ls, "remove_name master %d dir %d our %d %s", - r->res_master_nodeid, r->res_dir_nodeid, - our_nodeid, name); - continue; - } - - if (r->res_dir_nodeid == our_nodeid) { - /* should never happen */ - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "remove_name dir %d master %d our %d %s", - r->res_dir_nodeid, r->res_master_nodeid, - our_nodeid, name); - continue; - } - - if (!time_after_eq(jiffies, r->res_toss_time + - dlm_config.ci_toss_secs * HZ)) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_debug(ls, "remove_name toss_time %lu now %lu %s", - r->res_toss_time, jiffies, name); - continue; - } - - if (!kref_put(&r->res_ref, kill_rsb)) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "remove_name in use %s", name); - continue; - } - - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - send_remove(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - dlm_free_rsb(r); - } -} - -void dlm_scan_rsbs(struct dlm_ls *ls) -{ - int i; - - for (i = 0; i < ls->ls_rsbtbl_size; i++) { - shrink_bucket(ls, i); - if (dlm_locking_stopped(ls)) - break; - cond_resched(); - } -} - /* lkb is master or local copy */ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -2538,7 +2711,6 @@ static void process_lookup_list(struct dlm_rsb *r) list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { list_del_init(&lkb->lkb_rsb_lookup); _request_lock(r, lkb); - schedule(); } } @@ -3332,8 +3504,7 @@ int dlm_unlock(dlm_lockspace_t *lockspace, static int _create_message(struct dlm_ls *ls, int mb_len, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret, - gfp_t allocation) + 
struct dlm_mhandle **mh_ret) { struct dlm_message *ms; struct dlm_mhandle *mh; @@ -3343,7 +3514,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, &mb); if (!mh) return -ENOBUFS; @@ -3365,8 +3536,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret, - gfp_t allocation) + struct dlm_mhandle **mh_ret) { int mb_len = sizeof(struct dlm_message); @@ -3387,7 +3557,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, } return _create_message(r->res_ls, mb_len, to_nodeid, mstype, - ms_ret, mh_ret, allocation); + ms_ret, mh_ret); } /* further lowcomms enhancements or alternate implementations may make @@ -3456,7 +3626,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) if (error) return error; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); if (error) goto fail; @@ -3516,8 +3686,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, - GFP_NOFS); + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); if (error) goto out; @@ -3538,8 +3707,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, - GFP_NOFS); + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); if (error) goto out; @@ -3564,8 +3732,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) if (error) return error; - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, - GFP_NOFS); + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); if (error) goto fail; @@ -3589,8 +3756,7 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, - GFP_ATOMIC); + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); if (error) goto out; @@ -3611,7 +3777,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); if (error) goto out; @@ -3653,8 +3819,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); - error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, - GFP_NOFS); + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); if (error) goto out; @@ -4139,7 +4304,6 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) { char name[DLM_RESNAME_MAXLEN+1]; struct dlm_rsb *r; - uint32_t hash, b; int rv, len, dir_nodeid, from_nodeid; from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); @@ -4159,47 +4323,44 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) return; } - /* Look for name on rsbtbl.toss, if it's there, kill it. - If it's on rsbtbl.keep, it's being used, and we should ignore this - message. 
This is an expected race between the dir node sending a - request to the master node at the same time as the master node sends - a remove to the dir node. The resolution to that race is for the - dir node to ignore the remove message, and the master node to - recreate the master rsb when it gets a request from the dir node for - an rsb it doesn't have. */ + /* Look for name in rsb toss state, if it's there, kill it. + * If it's in non toss state, it's being used, and we should ignore this + * message. This is an expected race between the dir node sending a + * request to the master node at the same time as the master node sends + * a remove to the dir node. The resolution to that race is for the + * dir node to ignore the remove message, and the master node to + * recreate the master rsb when it gets a request from the dir node for + * an rsb it doesn't have. + */ memset(name, 0, sizeof(name)); memcpy(name, ms->m_extra, len); - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - spin_lock(&ls->ls_rsbtbl[b].lock); + write_lock_bh(&ls->ls_rsbtbl_lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r); if (rv) { - /* verify the rsb is on keep list per comment above */ - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (rv) { - /* should not happen */ - log_error(ls, "receive_remove from %d not found %s", - from_nodeid, name); - spin_unlock(&ls->ls_rsbtbl[b].lock); - return; - } + /* should not happen */ + log_error(ls, "%s from %d not found %s", __func__, + from_nodeid, name); + write_unlock_bh(&ls->ls_rsbtbl_lock); + return; + } + + if (!rsb_flag(r, RSB_TOSS)) { if (r->res_master_nodeid != from_nodeid) { /* should not happen */ log_error(ls, "receive_remove keep from %d master %d", from_nodeid, r->res_master_nodeid); dlm_print_rsb(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); + write_unlock_bh(&ls->ls_rsbtbl_lock); return; } log_debug(ls, "receive_remove from %d master %d first %x %s", from_nodeid, r->res_master_nodeid, r->res_first_lkid, name); - spin_unlock(&ls->ls_rsbtbl[b].lock); + write_unlock_bh(&ls->ls_rsbtbl_lock); return; } @@ -4207,20 +4368,16 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) log_error(ls, "receive_remove toss from %d master %d", from_nodeid, r->res_master_nodeid); dlm_print_rsb(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); + write_unlock_bh(&ls->ls_rsbtbl_lock); return; } - if (kref_put(&r->res_ref, kill_rsb)) { - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - spin_unlock(&ls->ls_rsbtbl[b].lock); - dlm_free_rsb(r); - } else { - log_error(ls, "receive_remove from %d rsb ref error", - from_nodeid); - dlm_print_rsb(r); - spin_unlock(&ls->ls_rsbtbl[b].lock); - } + list_del(&r->res_rsbs_list); + rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node, + dlm_rhash_rsb_params); + write_unlock_bh(&ls->ls_rsbtbl_lock); + + free_toss_rsb(r); } static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms) @@ -4407,7 +4564,6 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, if (error) goto out; - /* local reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; @@ -4446,7 +4602,6 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, if (error) goto out; - /* local reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; @@ -4498,7 +4653,6 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, if 
(error) goto out; - /* local reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; @@ -4757,20 +4911,32 @@ static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms, static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms, int nodeid) { - if (dlm_locking_stopped(ls)) { +try_again: + read_lock_bh(&ls->ls_requestqueue_lock); + if (test_bit(LSFL_RECV_MSG_BLOCKED, &ls->ls_flags)) { /* If we were a member of this lockspace, left, and rejoined, other nodes may still be sending us messages from the lockspace generation before we left. */ if (WARN_ON_ONCE(!ls->ls_generation)) { + read_unlock_bh(&ls->ls_requestqueue_lock); log_limit(ls, "receive %d from %d ignore old gen", le32_to_cpu(ms->m_type), nodeid); return; } + read_unlock_bh(&ls->ls_requestqueue_lock); + write_lock_bh(&ls->ls_requestqueue_lock); + /* recheck because we hold writelock now */ + if (!test_bit(LSFL_RECV_MSG_BLOCKED, &ls->ls_flags)) { + write_unlock_bh(&ls->ls_requestqueue_lock); + goto try_again; + } + dlm_add_requestqueue(ls, nodeid, ms); + write_unlock_bh(&ls->ls_requestqueue_lock); } else { - dlm_wait_requestqueue(ls); _receive_message(ls, ms, 0); + read_unlock_bh(&ls->ls_requestqueue_lock); } } @@ -4830,7 +4996,7 @@ void dlm_receive_buffer(const union dlm_packet *p, int nodeid) /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to be inactive (in this ls) before transitioning to recovery mode */ - down_read(&ls->ls_recv_active); + read_lock_bh(&ls->ls_recv_active); if (hd->h_cmd == DLM_MSG) dlm_receive_message(ls, &p->message, nodeid); else if (hd->h_cmd == DLM_RCOM) @@ -4838,7 +5004,7 @@ void dlm_receive_buffer(const union dlm_packet *p, int nodeid) else log_error(ls, "invalid h_cmd %d from %d lockspace %x", hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace)); - up_read(&ls->ls_recv_active); + read_unlock_bh(&ls->ls_recv_active); dlm_put_lockspace(ls); } @@ -4899,8 +5065,6 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) if (!ms_local) return; - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); @@ -4993,7 +5157,6 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) } schedule(); } - mutex_unlock(&ls->ls_waiters_mutex); kfree(ms_local); } @@ -5001,7 +5164,7 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) { struct dlm_lkb *lkb = NULL, *iter; - mutex_lock(&ls->ls_waiters_mutex); + spin_lock_bh(&ls->ls_waiters_lock); list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) { if (test_bit(DLM_IFL_RESEND_BIT, &iter->lkb_iflags)) { hold_lkb(iter); @@ -5009,7 +5172,7 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) break; } } - mutex_unlock(&ls->ls_waiters_mutex); + spin_unlock_bh(&ls->ls_waiters_lock); return lkb; } @@ -5109,9 +5272,9 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) } /* Forcibly remove from waiters list */ - mutex_lock(&ls->ls_waiters_mutex); + spin_lock_bh(&ls->ls_waiters_lock); list_del_init(&lkb->lkb_wait_reply); - mutex_unlock(&ls->ls_waiters_mutex); + spin_unlock_bh(&ls->ls_waiters_lock); /* * The lkb is now clear of all prior waiters state and can be @@ -5236,7 +5399,7 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, /* Get rid of locks held by nodes that are gone. 
*/ -void dlm_recover_purge(struct dlm_ls *ls) +void dlm_recover_purge(struct dlm_ls *ls, const struct list_head *root_list) { struct dlm_rsb *r; struct dlm_member *memb; @@ -5255,8 +5418,7 @@ void dlm_recover_purge(struct dlm_ls *ls) if (!nodes_count) return; - down_write(&ls->ls_root_sem); - list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + list_for_each_entry(r, root_list, res_root_list) { hold_rsb(r); lock_rsb(r); if (is_master(r)) { @@ -5271,22 +5433,18 @@ void dlm_recover_purge(struct dlm_ls *ls) unhold_rsb(r); cond_resched(); } - up_write(&ls->ls_root_sem); if (lkb_count) log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", lkb_count, nodes_count); } -static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) +static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls) { - struct rb_node *n; struct dlm_rsb *r; - spin_lock(&ls->ls_rsbtbl[bucket].lock); - for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { - r = rb_entry(n, struct dlm_rsb, res_hashnode); - + read_lock_bh(&ls->ls_rsbtbl_lock); + list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) { if (!rsb_flag(r, RSB_RECOVER_GRANT)) continue; if (!is_master(r)) { @@ -5294,10 +5452,10 @@ static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) continue; } hold_rsb(r); - spin_unlock(&ls->ls_rsbtbl[bucket].lock); + read_unlock_bh(&ls->ls_rsbtbl_lock); return r; } - spin_unlock(&ls->ls_rsbtbl[bucket].lock); + read_unlock_bh(&ls->ls_rsbtbl_lock); return NULL; } @@ -5321,19 +5479,15 @@ static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) void dlm_recover_grant(struct dlm_ls *ls) { struct dlm_rsb *r; - int bucket = 0; unsigned int count = 0; unsigned int rsb_count = 0; unsigned int lkb_count = 0; while (1) { - r = find_grant_rsb(ls, bucket); - if (!r) { - if (bucket == ls->ls_rsbtbl_size - 1) - break; - bucket++; - continue; - } + r = find_grant_rsb(ls); + if (!r) + break; + rsb_count++; count = 0; lock_rsb(r); @@ -5641,10 +5795,10 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } /* add this new lkb to the per-process list of locks */ - spin_lock(&ua->proc->locks_spin); + spin_lock_bh(&ua->proc->locks_spin); hold_lkb(lkb); list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); - spin_unlock(&ua->proc->locks_spin); + spin_unlock_bh(&ua->proc->locks_spin); do_put = false; out_put: trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false); @@ -5726,7 +5880,7 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int found_other_mode = 0; int rv = 0; - mutex_lock(&ls->ls_orphans_mutex); + spin_lock_bh(&ls->ls_orphans_lock); list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) { if (iter->lkb_resource->res_length != namelen) continue; @@ -5743,7 +5897,7 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, *lkid = iter->lkb_id; break; } - mutex_unlock(&ls->ls_orphans_mutex); + spin_unlock_bh(&ls->ls_orphans_lock); if (!lkb && found_other_mode) { rv = -EAGAIN; @@ -5774,9 +5928,9 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, * for the proc locks list. 
*/ - spin_lock(&ua->proc->locks_spin); + spin_lock_bh(&ua->proc->locks_spin); list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); - spin_unlock(&ua->proc->locks_spin); + spin_unlock_bh(&ua->proc->locks_spin); out: kfree(ua_tmp); return rv; @@ -5820,11 +5974,11 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out_put; - spin_lock(&ua->proc->locks_spin); + spin_lock_bh(&ua->proc->locks_spin); /* dlm_user_add_cb() may have already taken lkb off the proc list */ if (!list_empty(&lkb->lkb_ownqueue)) list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); - spin_unlock(&ua->proc->locks_spin); + spin_unlock_bh(&ua->proc->locks_spin); out_put: trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); @@ -5935,9 +6089,9 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) int error; hold_lkb(lkb); /* reference for the ls_orphans list */ - mutex_lock(&ls->ls_orphans_mutex); + spin_lock_bh(&ls->ls_orphans_lock); list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); - mutex_unlock(&ls->ls_orphans_mutex); + spin_unlock_bh(&ls->ls_orphans_lock); set_unlock_args(0, lkb->lkb_ua, &args); @@ -5975,7 +6129,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, { struct dlm_lkb *lkb = NULL; - spin_lock(&ls->ls_clear_proc_locks); + spin_lock_bh(&ls->ls_clear_proc_locks); if (list_empty(&proc->locks)) goto out; @@ -5987,7 +6141,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, else set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); out: - spin_unlock(&ls->ls_clear_proc_locks); + spin_unlock_bh(&ls->ls_clear_proc_locks); return lkb; } @@ -6003,6 +6157,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) { + struct dlm_callback *cb, *cb_safe; struct dlm_lkb *lkb, *safe; dlm_lock_recovery(ls); @@ -6023,7 +6178,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - spin_lock(&ls->ls_clear_proc_locks); + spin_lock_bh(&ls->ls_clear_proc_locks); /* in-progress unlocks */ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { @@ -6032,29 +6187,29 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - dlm_purge_lkb_callbacks(lkb); - list_del_init(&lkb->lkb_cb_list); - dlm_put_lkb(lkb); + list_for_each_entry_safe(cb, cb_safe, &proc->asts, list) { + list_del(&cb->list); + dlm_free_cb(cb); } - spin_unlock(&ls->ls_clear_proc_locks); + spin_unlock_bh(&ls->ls_clear_proc_locks); dlm_unlock_recovery(ls); } static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) { + struct dlm_callback *cb, *cb_safe; struct dlm_lkb *lkb, *safe; while (1) { lkb = NULL; - spin_lock(&proc->locks_spin); + spin_lock_bh(&proc->locks_spin); if (!list_empty(&proc->locks)) { lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue); list_del_init(&lkb->lkb_ownqueue); } - spin_unlock(&proc->locks_spin); + spin_unlock_bh(&proc->locks_spin); if (!lkb) break; @@ -6064,21 +6219,20 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); /* ref from proc->locks list */ } - spin_lock(&proc->locks_spin); + spin_lock_bh(&proc->locks_spin); list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { list_del_init(&lkb->lkb_ownqueue); set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); dlm_put_lkb(lkb); } - spin_unlock(&proc->locks_spin); + spin_unlock_bh(&proc->locks_spin); - 
spin_lock(&proc->asts_spin);
-	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
-		dlm_purge_lkb_callbacks(lkb);
-		list_del_init(&lkb->lkb_cb_list);
-		dlm_put_lkb(lkb);
+	spin_lock_bh(&proc->asts_spin);
+	list_for_each_entry_safe(cb, cb_safe, &proc->asts, list) {
+		list_del(&cb->list);
+		dlm_free_cb(cb);
 	}
-	spin_unlock(&proc->asts_spin);
+	spin_unlock_bh(&proc->asts_spin);
 }
 
 /* pid of 0 means purge all orphans */
@@ -6087,7 +6241,7 @@ static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
 {
 	struct dlm_lkb *lkb, *safe;
 
-	mutex_lock(&ls->ls_orphans_mutex);
+	spin_lock_bh(&ls->ls_orphans_lock);
 	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
 		if (pid && lkb->lkb_ownpid != pid)
 			continue;
@@ -6095,7 +6249,7 @@ static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
 		list_del_init(&lkb->lkb_ownqueue);
 		dlm_put_lkb(lkb);
 	}
-	mutex_unlock(&ls->ls_orphans_mutex);
+	spin_unlock_bh(&ls->ls_orphans_lock);
 }
 
 static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
@@ -6105,7 +6259,7 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
 	int error;
 
 	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
-				DLM_MSG_PURGE, &ms, &mh, GFP_NOFS);
+				DLM_MSG_PURGE, &ms, &mh);
 	if (error)
 		return error;
 	ms->m_nodeid = cpu_to_le32(nodeid);
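The dlm_refcount_dec_and_write_lock_bh() helper introduced above carries a TODO to move it into lib/refcount.c because it mirrors the existing refcount_dec_and_lock() there, differing only in the lock flavor (a BH-disabling write lock instead of a plain spinlock). For comparison, the upstream spinlock variant follows the same shape: an unlocked fast path that cheaply handles counts above one, then the final 1 -> 0 decrement done under the lock so the object cannot be revived by a concurrent lookup while it is being torn down:

	bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
	{
		/* fast path: count > 1, decrement without the lock */
		if (refcount_dec_not_one(r))
			return false;

		/* slow path: possibly the final reference, drop it locked */
		spin_lock(lock);
		if (!refcount_dec_and_test(r)) {
			spin_unlock(lock);
			return false;
		}

		return true;
	}

put_rsb() then relies on the caller unlocking ls_rsbtbl_lock after the release callback has run, exactly like the kref_put_lock() pattern it replaces.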
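find_rsb_dir(), find_rsb_nodir(), and dlm_master_lookup() above all share one locking pattern: resolve the name under the read lock for the likely case (rsb hashed and active), and only when a toss-to-keep transition is needed drop the read lock, retake the table lock as a writer, and re-verify the state, restarting the whole lookup if it changed in the window. A distilled skeleton of that pattern, as a hypothetical function frame with the error paths trimmed:

	static int lookup_and_revive(struct dlm_ls *ls, const void *name,
				     int len, struct dlm_rsb **r_ret)
	{
		struct dlm_rsb *r;
		int error;

	retry_lookup:
		/* likely path: rsb exists and is active, read lock suffices */
		read_lock_bh(&ls->ls_rsbtbl_lock);
		error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
		if (error) {
			read_unlock_bh(&ls->ls_rsbtbl_lock);
			return error;	/* caller creates a new rsb */
		}
		if (!rsb_flag(r, RSB_TOSS)) {
			kref_get(&r->res_ref);
			read_unlock_bh(&ls->ls_rsbtbl_lock);
			goto out;
		}
		read_unlock_bh(&ls->ls_rsbtbl_lock);

		/* unlikely path: reviving the rsb changes table state, so
		 * the lookup must be repeated and rechecked under the
		 * write lock
		 */
		write_lock_bh(&ls->ls_rsbtbl_lock);
		error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
		if (error) {
			write_unlock_bh(&ls->ls_rsbtbl_lock);
			return error;
		}
		if (!rsb_flag(r, RSB_TOSS)) {
			/* raced: someone revived it between the locks */
			write_unlock_bh(&ls->ls_rsbtbl_lock);
			goto retry_lookup;
		}

		list_move(&r->res_rsbs_list, &ls->ls_keep);
		rsb_clear_flag(r, RSB_TOSS);
		kref_init(&r->res_ref);	/* refcount only counts in keep state */
		rsb_delete_toss_timer(ls, r);
		write_unlock_bh(&ls->ls_rsbtbl_lock);
	out:
		*r_ret = r;
		return 0;
	}

The same retry_lookup label also absorbs the -EEXIST insert race: if two callers create the same rsb concurrently, the loser frees its copy and restarts from the read-locked lookup.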
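The comments on __rsb_mod_timer() and dlm_rsb_toss_timer() reference lifecycle wiring that lives outside this file: dlm_ls_stop() stops the timer with timer_delete_sync() once locking is stopped, and dlm_timer_resume() re-arms it after recovery. A sketch of that wiring under those assumptions (the placement in the lockspace setup/stop code is an assumption; the calls themselves are named by the comments above):

	/* lockspace creation (assumed placement): bind the callback */
	timer_setup(&ls->ls_timer, dlm_rsb_toss_timer, 0);

	/* dlm_ls_stop(), per the __rsb_mod_timer() comment: locking is
	 * stopped first, so the callback's dlm_locking_stopped() check
	 * and __rsb_mod_timer()'s refusal to rearm make this final
	 */
	timer_delete_sync(&ls->ls_timer);

	/* after recovery completes: rearm from the head of ls_toss_q */
	dlm_timer_resume(ls);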
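The reason dlm_rsb_toss_timer() only ever needs to look at the head of ls_toss_q is that the queue is sorted by construction: rsb_toss_jiffies() is computed from the monotonically increasing jiffies, and rsb_mod_timer() always appends with list_add_tail(), so with a fixed toss interval every new entry expires no earlier than the existing tail. (The real code re-reads ci_toss_secs, so a shrinking interval could briefly break strict ordering; that would only delay freeing, which is harmless for a cache.) A minimal, self-contained illustration of the invariant in plain userspace C with stand-in types, assuming a fixed timeout:

	#include <assert.h>
	#include <stddef.h>

	static unsigned long jiffies;	/* stand-in for the kernel clock */

	struct entry {
		unsigned long expire;	/* like res_toss_time */
		struct entry *next;
	};

	static struct entry *head;
	static struct entry **tailp = &head;

	/* append-only queueing, like rsb_mod_timer() + list_add_tail() */
	static void queue_entry(struct entry *e, unsigned long timeout)
	{
		e->expire = jiffies + timeout;	/* like rsb_toss_jiffies() */
		e->next = NULL;
		*tailp = e;
		tailp = &e->next;
	}

	int main(void)
	{
		struct entry a, b, c;

		queue_entry(&a, 10);
		jiffies += 5;		/* time only moves forward */
		queue_entry(&b, 10);
		jiffies += 7;
		queue_entry(&c, 10);

		/* invariant: expire times ascend, so a timer armed for
		 * the head is always the earliest expiration queued
		 */
		for (struct entry *e = head; e->next; e = e->next)
			assert(e->expire <= e->next->expire);
		return 0;
	}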