Diffstat (limited to 'ctdb/utils/ceph')
-rw-r--r--  ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c  457
-rwxr-xr-x  ctdb/utils/ceph/test_ceph_rados_reclock.sh      212
2 files changed, 669 insertions, 0 deletions
diff --git a/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
new file mode 100644
index 0000000..7d868a3
--- /dev/null
+++ b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
@@ -0,0 +1,457 @@
+/*
+ CTDB mutex helper using Ceph librados locks
+
+ Copyright (C) David Disseldorp 2016-2020
+
+ Based on ctdb_mutex_fcntl_helper.c, which is:
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "tevent.h"
+#include "talloc.h"
+#include "rados/librados.h"
+
+#define CTDB_MUTEX_CEPH_LOCK_NAME "ctdb_reclock_mutex"
+#define CTDB_MUTEX_CEPH_LOCK_COOKIE CTDB_MUTEX_CEPH_LOCK_NAME
+#define CTDB_MUTEX_CEPH_LOCK_DESC "CTDB cluster lock"
+/*
+ * During failover it may take up to <lock duration> seconds before the
+ * newly elected recovery master can obtain the lock.
+ */
+#define CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT 10
+
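+/*
+ * The helper reports its state to the parent as a single status byte on
+ * stdout: "0" while holding the lock, "1" on contention, "2" on timeout
+ * and "3" on error.
+ */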
+#define CTDB_MUTEX_STATUS_HOLDING "0"
+#define CTDB_MUTEX_STATUS_CONTENDED "1"
+#define CTDB_MUTEX_STATUS_TIMEOUT "2"
+#define CTDB_MUTEX_STATUS_ERROR "3"
+
+static char *progname = NULL;
+
+static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name,
+ const char *ceph_auth_name,
+ const char *pool_name,
+ rados_t *_ceph_cluster,
+ rados_ioctx_t *_ioctx)
+{
+ rados_t ceph_cluster = NULL;
+ rados_ioctx_t ioctx = NULL;
+ int ret;
+
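+ /*
+ * rados_create2() takes the fully qualified auth name (e.g.
+ * "client.admin"); the flags argument is reserved, so pass zero.
+ */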
+ ret = rados_create2(&ceph_cluster, ceph_cluster_name, ceph_auth_name, 0);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to initialise Ceph cluster %s as %s"
+ " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+ strerror(-ret));
+ return ret;
+ }
+
+ /* path=NULL tells librados to use default locations */
+ ret = rados_conf_read_file(ceph_cluster, NULL);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to parse Ceph cluster config"
+ " - (%s)\n", progname, strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+ ret = rados_connect(ceph_cluster);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to connect to Ceph cluster %s as %s"
+ " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+ strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+ ret = rados_ioctx_create(ceph_cluster, pool_name, &ioctx);
+ if (ret < 0) {
+ fprintf(stderr, "%s: failed to create Ceph ioctx for pool %s"
+ " - (%s)\n", progname, pool_name, strerror(-ret));
+ rados_shutdown(ceph_cluster);
+ return ret;
+ }
+
+ *_ceph_cluster = ceph_cluster;
+ *_ioctx = ioctx;
+
+ return 0;
+}
+
+static int ctdb_mutex_rados_lock(rados_ioctx_t ioctx,
+ const char *oid,
+ uint64_t lock_duration_s,
+ uint8_t flags)
+{
+ int ret;
+ struct timeval tv = { lock_duration_s, 0 };
+
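+ /* a zero duration requests a lock with no expiry (held until unlocked) */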
+ ret = rados_lock_exclusive(ioctx, oid,
+ CTDB_MUTEX_CEPH_LOCK_NAME,
+ CTDB_MUTEX_CEPH_LOCK_COOKIE,
+ CTDB_MUTEX_CEPH_LOCK_DESC,
+ lock_duration_s == 0 ? NULL : &tv,
+ flags);
+ if ((ret == -EEXIST) || (ret == -EBUSY)) {
+ /*
+ * lock contention: -EBUSY means another client+cookie pair holds
+ * the lock, -EEXIST that this client+cookie pair already holds it
+ */
+ return ret;
+ } else if (ret < 0) {
+ /* unexpected failure */
+ fprintf(stderr,
+ "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
+ progname, oid, strerror(-ret));
+ return ret;
+ }
+
+ /* lock obtained */
+ return 0;
+}
+
+static int ctdb_mutex_rados_unlock(rados_ioctx_t ioctx,
+ const char *oid)
+{
+ int ret;
+
+ ret = rados_unlock(ioctx, oid,
+ CTDB_MUTEX_CEPH_LOCK_NAME,
+ CTDB_MUTEX_CEPH_LOCK_COOKIE);
+ if (ret < 0) {
+ fprintf(stderr,
+ "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
+ progname, oid, strerror(-ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+struct ctdb_mutex_rados_state {
+ bool holding_mutex;
+ const char *ceph_cluster_name;
+ const char *ceph_auth_name;
+ const char *pool_name;
+ const char *object;
+ uint64_t lock_duration_s;
+ int ppid;
+ struct tevent_context *ev;
+ struct tevent_signal *sigterm_ev;
+ struct tevent_signal *sigint_ev;
+ struct tevent_timer *ppid_timer_ev;
+ struct tevent_timer *renew_timer_ev;
+ rados_t ceph_cluster;
+ rados_ioctx_t ioctx;
+};
+
+static void ctdb_mutex_rados_sigterm_cb(struct tevent_context *ev,
+ struct tevent_signal *se,
+ int signum,
+ int count,
+ void *siginfo,
+ void *private_data)
+{
+ struct ctdb_mutex_rados_state *cmr_state = private_data;
+ int ret = 0;
+
+ if (!cmr_state->holding_mutex) {
+ fprintf(stderr, "Sigterm callback invoked without mutex!\n");
+ ret = -EINVAL;
+ }
+
+ talloc_free(cmr_state);
+ exit(ret ? 1 : 0);
+}
+
+static void ctdb_mutex_rados_ppid_timer_cb(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_mutex_rados_state *cmr_state = private_data;
+ int ret = 0;
+
+ if (!cmr_state->holding_mutex) {
+ fprintf(stderr, "Timer callback invoked without mutex!\n");
+ ret = -EINVAL;
+ goto err_ctx_cleanup;
+ }
+
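+ /* signal 0 delivers nothing; it only probes for the parent's existence */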
+ if ((kill(cmr_state->ppid, 0) == 0) || (errno != ESRCH)) {
+ /* parent still around, keep waiting */
+ cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev,
+ cmr_state,
+ tevent_timeval_current_ofs(5, 0),
+ ctdb_mutex_rados_ppid_timer_cb,
+ cmr_state);
+ if (cmr_state->ppid_timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ /* rely on signal cb */
+ }
+ return;
+ }
+
+ /* parent ended, drop lock (via destructor) and exit */
+err_ctx_cleanup:
+ talloc_free(cmr_state);
+ exit(ret ? 1 : 0);
+}
+
+#define USECS_IN_SEC 1000000
+
+static void ctdb_mutex_rados_lock_renew_timer_cb(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_mutex_rados_state *cmr_state = private_data;
+ struct timeval tv;
+ int ret;
+
+ ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
+ cmr_state->lock_duration_s,
+ LIBRADOS_LOCK_FLAG_RENEW);
+ if (ret == -EBUSY) {
+ /* should never get -EEXIST on renewal */
+ fprintf(stderr, "Lock contention during renew: %d\n", ret);
+ goto err_ctx_cleanup;
+ } else if (ret < 0) {
+ fprintf(stderr, "Lock renew failed\n");
+ goto err_ctx_cleanup;
+ }
+
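+ /* re-arm renewal at half the lock duration, matching the initial timer */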
+ tv = tevent_timeval_current_ofs(0,
+ cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
+ cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
+ cmr_state,
+ tv,
+ ctdb_mutex_rados_lock_renew_timer_cb,
+ cmr_state);
+ if (cmr_state->renew_timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ goto err_ctx_cleanup;
+ }
+
+ return;
+
+err_ctx_cleanup:
+ /* drop lock (via destructor) and exit */
+ talloc_free(cmr_state);
+ exit(1);
+}
+
+static int ctdb_mutex_rados_state_destroy(struct ctdb_mutex_rados_state *cmr_state)
+{
+ if (cmr_state->holding_mutex) {
+ ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
+ }
+ if (cmr_state->ioctx != NULL) {
+ rados_ioctx_destroy(cmr_state->ioctx);
+ }
+ if (cmr_state->ceph_cluster != NULL) {
+ rados_shutdown(cmr_state->ceph_cluster);
+ }
+ return 0;
+}
+
+/* register this host+service with ceph-mgr for visibility */
+static int ctdb_mutex_rados_mgr_reg(rados_t ceph_cluster)
+{
+ int ret;
+ uint64_t instance_guid;
+ char id_buf[128];
+
+ instance_guid = rados_get_instance_id(ceph_cluster);
+ ret = snprintf(id_buf, sizeof(id_buf), "%s:0x%016llx",
+ "ctdb_mutex_ceph_rados_helper",
+ (unsigned long long)instance_guid);
+ if (ret < 0 || ret >= sizeof(id_buf)) {
+ fprintf(stderr, "Ceph instance name too long\n");
+ return -ENAMETOOLONG;
+ }
+
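+ /* final argument: no extra daemon metadata to advertise to ceph-mgr */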
+ ret = rados_service_register(ceph_cluster, "ctdb", id_buf, "");
+ if (ret < 0) {
+ fprintf(stderr, "failed to register service with ceph-mgr\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+ struct ctdb_mutex_rados_state *cmr_state;
+
+ progname = argv[0];
+
+ if ((argc != 5) && (argc != 6)) {
+ fprintf(stderr, "Usage: %s <Ceph Cluster> <Ceph user> "
+ "<RADOS pool> <RADOS object> "
+ "[lock duration secs]\n",
+ progname);
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ ret = setvbuf(stdout, NULL, _IONBF, 0);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to configure unbuffered stdout I/O\n");
+ }
+
+ cmr_state = talloc_zero(NULL, struct ctdb_mutex_rados_state);
+ if (cmr_state == NULL) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_out;
+ }
+
+ talloc_set_destructor(cmr_state, ctdb_mutex_rados_state_destroy);
+ cmr_state->ceph_cluster_name = argv[1];
+ cmr_state->ceph_auth_name = argv[2];
+ cmr_state->pool_name = argv[3];
+ cmr_state->object = argv[4];
+ if (argc == 6) {
+ /* optional lock duration provided */
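+ /* base 0: accept decimal, hex ("0x...") or octal ("0...") input */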
+ char *endptr = NULL;
+ cmr_state->lock_duration_s = strtoull(argv[5], &endptr, 0);
+ if ((endptr == argv[5]) || (*endptr != '\0')) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -EINVAL;
+ goto err_ctx_cleanup;
+ }
+ } else {
+ cmr_state->lock_duration_s
+ = CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT;
+ }
+
+ cmr_state->ppid = getppid();
+ if (cmr_state->ppid == 1) {
+ /*
+ * The original parent is gone and the process has
+ * been reparented to init. This can happen if the
+ * helper is started just as the parent is killed
+ * during shutdown. The error message doesn't need to
+ * be stellar, since there won't be anything around to
+ * capture and log it...
+ */
+ fprintf(stderr, "%s: PPID == 1\n", progname);
+ ret = -EPIPE;
+ goto err_ctx_cleanup;
+ }
+
+ cmr_state->ev = tevent_context_init(cmr_state);
+ if (cmr_state->ev == NULL) {
+ fprintf(stderr, "tevent_context_init failed\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_ctx_cleanup;
+ }
+
+ /* wait for sigterm */
+ cmr_state->sigterm_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
+ ctdb_mutex_rados_sigterm_cb,
+ cmr_state);
+ if (cmr_state->sigterm_ev == NULL) {
+ fprintf(stderr, "Failed to create term signal event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_ctx_cleanup;
+ }
+
+ cmr_state->sigint_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGINT, 0,
+ ctdb_mutex_rados_sigterm_cb,
+ cmr_state);
+ if (cmr_state->sigint_ev == NULL) {
+ fprintf(stderr, "Failed to create int signal event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_ctx_cleanup;
+ }
+
+ /* periodically check parent */
+ cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+ tevent_timeval_current_ofs(5, 0),
+ ctdb_mutex_rados_ppid_timer_cb,
+ cmr_state);
+ if (cmr_state->ppid_timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_ctx_cleanup;
+ }
+
+ ret = ctdb_mutex_rados_ctx_create(cmr_state->ceph_cluster_name,
+ cmr_state->ceph_auth_name,
+ cmr_state->pool_name,
+ &cmr_state->ceph_cluster,
+ &cmr_state->ioctx);
+ if (ret < 0) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ goto err_ctx_cleanup;
+ }
+
+ ret = ctdb_mutex_rados_mgr_reg(cmr_state->ceph_cluster);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to register with ceph-mgr\n");
+ /* ignore: ceph-mgr service registration is informational */
+ }
+
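+ /* initial (non-renewal) acquisition, hence zero flags */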
+ ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
+ cmr_state->lock_duration_s,
+ 0);
+ if ((ret == -EEXIST) || (ret == -EBUSY)) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_CONTENDED);
+ goto err_ctx_cleanup;
+ } else if (ret < 0) {
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ goto err_ctx_cleanup;
+ }
+ cmr_state->holding_mutex = true;
+
+ if (cmr_state->lock_duration_s != 0) {
+ /*
+ * renew (reobtain) the lock, using a period of half the lock
+ * duration. Convert to usecs to avoid rounding.
+ */
+ struct timeval tv = tevent_timeval_current_ofs(0,
+ cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
+ cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
+ cmr_state,
+ tv,
+ ctdb_mutex_rados_lock_renew_timer_cb,
+ cmr_state);
+ if (cmr_state->renew_timer_ev == NULL) {
+ fprintf(stderr, "Failed to create timer event\n");
+ fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+ ret = -ENOMEM;
+ goto err_ctx_cleanup;
+ }
+ }
+
+ fprintf(stdout, CTDB_MUTEX_STATUS_HOLDING);
+
+ /* wait for the signal / timer events to do their work */
+ ret = tevent_loop_wait(cmr_state->ev);
+ if (ret < 0) {
+ goto err_ctx_cleanup;
+ }
+err_ctx_cleanup:
+ talloc_free(cmr_state);
+err_out:
+ return ret ? 1 : 0;
+}
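For context, the helper is consumed through CTDB's cluster lock setting; a
minimal ctdb.conf sketch follows (the install path and the Ceph user, pool
and object names are illustrative and vary by distribution):

    [cluster]
        recovery lock = !/usr/libexec/ctdb/ctdb_mutex_ceph_rados_helper ceph client.ctdb rbd ctdb_reclock

The leading "!" tells ctdbd to run the value as a mutex helper command
instead of treating it as a lock file path.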
diff --git a/ctdb/utils/ceph/test_ceph_rados_reclock.sh b/ctdb/utils/ceph/test_ceph_rados_reclock.sh
new file mode 100755
index 0000000..bfb9c32
--- /dev/null
+++ b/ctdb/utils/ceph/test_ceph_rados_reclock.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+# standalone test for ctdb_mutex_ceph_rados_helper
+#
+# Copyright (C) David Disseldorp 2016-2020
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# XXX The following parameters may require configuration:
+CLUSTER="ceph" # Name of the Ceph cluster under test
+USER="client.admin" # Ceph user - a keyring must exist
+POOL="rbd" # RADOS pool - must exist
+OBJECT="ctdb_reclock" # RADOS object: target for lock requests
+
+# test procedure:
+# - using ctdb_mutex_ceph_rados_helper, take a lock on the Ceph RADOS object at
+# $CLUSTER/$POOL/$OBJECT using the Ceph keyring for $USER
+# + confirm that lock is obtained, via ctdb_mutex_ceph_rados_helper "0" output
+# - check for ceph-mgr service registration
+# - check RADOS object lock state, using the "rados lock info" command
+# - attempt to obtain the lock again, using ctdb_mutex_ceph_rados_helper
+# + confirm that the lock is not successfully taken ("1" output=contention)
+# - tell the first locker to drop the lock and exit, via SIGTERM
+# - once the first locker has exited, attempt to get the lock again
+# + confirm that this attempt succeeds
+
+function _fail() {
+ echo "FAILED: $*"
+ exit 1
+}
+
+# this test requires the Ceph "rados" and "ceph" binaries, the "jq" JSON
+# parser, and the ctdb_mutex_ceph_rados_helper binary under test
+which jq > /dev/null || exit 1
+which rados > /dev/null || exit 1
+which ceph > /dev/null || exit 1
+which ctdb_mutex_ceph_rados_helper > /dev/null || exit 1
+
+TMP_DIR="$(mktemp --directory)" || exit 1
+rados -p "$POOL" rm "$OBJECT"
+
+# explicitly disable lock expiry (duration=0), to ensure that we don't get
+# intermittent failures (due to renewal) from the lock state diff further down
+(ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" 0 \
+ > ${TMP_DIR}/first) &
+locker_pid=$!
+
+# TODO wait for ctdb_mutex_ceph_rados_helper to write one byte to stdout,
+# indicating lock acquisition success/failure
+sleep 1
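+# (a tighter alternative would block on the helper's single status byte,
+# e.g. with `read -r -n 1` from a FIFO wired to its stdout, rather than
+# sleeping for a fixed interval)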
+
+first_out=$(cat ${TMP_DIR}/first)
+[ "$first_out" == "0" ] \
+ || _fail "expected lock acquisition (0), but got $first_out"
+
+ceph service dump > ${TMP_DIR}/service_dump
+SERVICE_COUNT=$(jq -r '.services.ctdb.daemons | length' ${TMP_DIR}/service_dump)
+[ $SERVICE_COUNT -gt 0 ] || _fail "lock holder missing from ceph service dump"
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_first
+
+# echo "with lock: `cat ${TMP_DIR}/lock_state_first`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_first)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_first)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+ || _fail "unexpected lock type: $LOCK_TYPE"
+
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_first)"
+[ $LOCK_COUNT -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"
+LOCKER_COOKIE="$(jq -r '.lockers[0].cookie' ${TMP_DIR}/lock_state_first)"
+[ "$LOCKER_COOKIE" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected locker cookie: $LOCKER_COOKIE"
+LOCKER_DESC="$(jq -r '.lockers[0].description' ${TMP_DIR}/lock_state_first)"
+[ "$LOCKER_DESC" == "CTDB cluster lock" ] \
+ || _fail "unexpected locker description: $LOCKER_DESC"
+LOCKER_EXP="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_first)"
+[ "$LOCKER_EXP" == "0.000000" ] \
+ || _fail "unexpected locker expiration: $LOCKER_EXP"
+
+# second attempt while first is still holding the lock - expect failure
+ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
+ > ${TMP_DIR}/second
+second_out=$(cat ${TMP_DIR}/second)
+[ "$second_out" == "1" ] \
+ || _fail "expected lock contention (1), but got $second_out"
+
+# confirm lock state didn't change
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_second
+
+diff ${TMP_DIR}/lock_state_first ${TMP_DIR}/lock_state_second \
+ || _fail "unexpected lock state change"
+
+# tell first locker to drop the lock and terminate
+kill $locker_pid || exit 1
+
+wait $locker_pid &> /dev/null
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_third
+# echo "without lock: `cat ${TMP_DIR}/lock_state_third`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_third)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_third)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+ || _fail "unexpected lock type: $LOCK_TYPE"
+
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_third)"
+[ $LOCK_COUNT -eq 0 ] \
+	|| _fail "didn't expect any locks in rados state, got $LOCK_COUNT"
+
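+# exec the helper from the backgrounded job so that $! is the helper's own
+# pid, letting the later kill signal the helper directly rather than a
+# wrapping subshell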
+exec >${TMP_DIR}/third -- ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" \
+ "$POOL" "$OBJECT" &
+locker_pid=$!
+
+sleep 1
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_fourth
+# echo "with lock again: `cat ${TMP_DIR}/lock_state_fourth`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_fourth)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_fourth)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+ || _fail "unexpected lock type: $LOCK_TYPE"
+
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_fourth)"
+[ $LOCK_COUNT -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"
+LOCKER_COOKIE="$(jq -r '.lockers[0].cookie' ${TMP_DIR}/lock_state_fourth)"
+[ "$LOCKER_COOKIE" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected locker cookie: $LOCKER_COOKIE"
+LOCKER_DESC="$(jq -r '.lockers[0].description' ${TMP_DIR}/lock_state_fourth)"
+[ "$LOCKER_DESC" == "CTDB cluster lock" ] \
+ || _fail "unexpected locker description: $LOCKER_DESC"
+
+kill $locker_pid || exit 1
+wait $locker_pid &> /dev/null
+
+third_out=$(cat ${TMP_DIR}/third)
+[ "$third_out" == "0" ] \
+ || _fail "expected lock acquisition (0), but got $third_out"
+
+# test renew / expire behaviour using a 1s expiry (update period = 500ms)
+exec >${TMP_DIR}/fourth -- ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" \
+ "$POOL" "$OBJECT" 1 &
+locker_pid=$!
+
+sleep 1
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_fifth_a
+#echo "with lock fifth: `cat ${TMP_DIR}/lock_state_fifth_a`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+ || _fail "unexpected lock type: $LOCK_TYPE"
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_fifth_a)"
+[ $LOCK_COUNT -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"
+LOCKER_EXP_A="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCKER_EXP_A" != "0.000000" ] \
+ || _fail "unexpected locker expiration: $LOCKER_EXP_A"
+sleep 1 # sleep until renewal
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_fifth_b
+LOCKER_EXP_B="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_fifth_b)"
+[ "$LOCKER_EXP_B" != "0.000000" ] \
+ || _fail "unexpected locker expiration: $LOCKER_EXP_B"
+#echo "lock expiration before renewal $LOCKER_EXP_A, after renewal $LOCKER_EXP_B"
+[ "$LOCKER_EXP_B" != "$LOCKER_EXP_A" ] \
+ || _fail "locker expiration matches: $LOCKER_EXP_B"
+
+# no chance to drop the lock, rely on expiry
+kill -KILL $locker_pid || exit 1
+wait $locker_pid &> /dev/null
+sleep 1 # sleep until lock expiry
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+ > ${TMP_DIR}/lock_state_sixth
+#echo "lock expiry sixth: `cat ${TMP_DIR}/lock_state_sixth`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_sixth)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+ || _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_sixth)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+ || _fail "unexpected lock type: $LOCK_TYPE"
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_sixth)"
+[ $LOCK_COUNT -eq 0 ] || _fail "expected 0 locks in rados state, got $LOCK_COUNT"
+
+rm ${TMP_DIR}/*
+rmdir $TMP_DIR
+
+echo "$0: all tests passed"