summaryrefslogtreecommitdiffstats
path: root/ctdb/server
diff options
context:
space:
mode:
Diffstat (limited to 'ctdb/server')
-rw-r--r--ctdb/server/ctdb_banning.c146
-rw-r--r--ctdb/server/ctdb_call.c2086
-rw-r--r--ctdb/server/ctdb_client.c1709
-rw-r--r--ctdb/server/ctdb_cluster_mutex.c382
-rw-r--r--ctdb/server/ctdb_cluster_mutex.h51
-rw-r--r--ctdb/server/ctdb_config.c183
-rw-r--r--ctdb/server/ctdb_config.h59
-rw-r--r--ctdb/server/ctdb_control.c1097
-rw-r--r--ctdb/server/ctdb_daemon.c2248
-rw-r--r--ctdb/server/ctdb_fork.c216
-rw-r--r--ctdb/server/ctdb_freeze.c923
-rw-r--r--ctdb/server/ctdb_keepalive.c234
-rw-r--r--ctdb/server/ctdb_lock.c996
-rw-r--r--ctdb/server/ctdb_lock_helper.c350
-rw-r--r--ctdb/server/ctdb_logging.c174
-rw-r--r--ctdb/server/ctdb_ltdb_server.c1663
-rw-r--r--ctdb/server/ctdb_monitor.c509
-rw-r--r--ctdb/server/ctdb_mutex_fcntl_helper.c795
-rw-r--r--ctdb/server/ctdb_persistent.c397
-rw-r--r--ctdb/server/ctdb_recover.c1243
-rw-r--r--ctdb/server/ctdb_recoverd.c3286
-rw-r--r--ctdb/server/ctdb_recovery_helper.c3200
-rw-r--r--ctdb/server/ctdb_server.c608
-rw-r--r--ctdb/server/ctdb_statistics.c93
-rw-r--r--ctdb/server/ctdb_takeover.c2751
-rw-r--r--ctdb/server/ctdb_takeover_helper.c1276
-rw-r--r--ctdb/server/ctdb_traverse.c781
-rw-r--r--ctdb/server/ctdb_tunables.c170
-rw-r--r--ctdb/server/ctdb_tunnel.c141
-rw-r--r--ctdb/server/ctdb_update_record.c372
-rw-r--r--ctdb/server/ctdb_uptime.c55
-rw-r--r--ctdb/server/ctdb_vacuum.c1990
-rw-r--r--ctdb/server/ctdbd.c407
-rw-r--r--ctdb/server/eventscript.c845
-rw-r--r--ctdb/server/ipalloc.c284
-rw-r--r--ctdb/server/ipalloc.h67
-rw-r--r--ctdb/server/ipalloc_common.c192
-rw-r--r--ctdb/server/ipalloc_deterministic.c191
-rw-r--r--ctdb/server/ipalloc_lcp2.c525
-rw-r--r--ctdb/server/ipalloc_nondeterministic.c150
-rw-r--r--ctdb/server/ipalloc_private.h57
-rw-r--r--ctdb/server/legacy_conf.c80
-rw-r--r--ctdb/server/legacy_conf.h35
43 files changed, 33017 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_banning.c b/ctdb/server/ctdb_banning.c
new file mode 100644
index 0000000..3c71157
--- /dev/null
+++ b/ctdb/server/ctdb_banning.c
@@ -0,0 +1,146 @@
+/*
+ ctdb banning code
+
+ Copyright (C) Ronnie Sahlberg 2009
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Timer event: this node's ban period has expired.
+ * If the databases never all froze while we were banned, ban ourselves
+ * again; otherwise clear NODE_FLAGS_BANNED and discard the ban state.
+ */
+static void ctdb_ban_node_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+ /* Make sure we were able to freeze databases during banning */
+ if (!ctdb_db_all_frozen(ctdb)) {
+ DEBUG(DEBUG_ERR, ("Banning timed out, but not all databases "
+ "frozen yet - banning this node again.\n"));
+ ctdb_ban_self(ctdb);
+ return;
+ }
+
+ DEBUG(DEBUG_ERR,("Banning timed out\n"));
+ ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_BANNED;
+
+ /* Freeing banning_ctx also cancels the timer allocated on it */
+ if (ctdb->banning_ctx != NULL) {
+ talloc_free(ctdb->banning_ctx);
+ ctdb->banning_ctx = NULL;
+ }
+}
+
+/*
+ * Control handler: set or clear this node's ban state.
+ *
+ * indata carries a struct ctdb_ban_state; bantime->time == 0 means unban.
+ * Only accepted for our own PNN. Returns 0 on success, -1 on error.
+ */
+int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_ban_state *bantime = (struct ctdb_ban_state *)indata.dptr;
+ bool already_banned;
+
+ DEBUG(DEBUG_INFO,("SET BAN STATE\n"));
+
+ if (bantime->pnn != ctdb->pnn) {
+ DEBUG(DEBUG_WARNING,
+ ("SET_BAN_STATE control for PNN %d ignored\n",
+ bantime->pnn));
+ return -1;
+ }
+
+ /* Drop any existing ban timer/state; remember that we were already
+  banned so we do not repeat the become-inactive transition below */
+ already_banned = false;
+ if (ctdb->banning_ctx != NULL) {
+ talloc_free(ctdb->banning_ctx);
+ ctdb->banning_ctx = NULL;
+ already_banned = true;
+ }
+
+ if (bantime->time == 0) {
+ DEBUG(DEBUG_ERR,("Unbanning this node\n"));
+ ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED;
+ return 0;
+ }
+
+ if (ctdb->tunable.enable_bans == 0) {
+ DEBUG(DEBUG_ERR,("Bans are disabled - ignoring ban of node %u\n", bantime->pnn));
+ return 0;
+ }
+
+ /* banning_ctx holds a copy of the ban state and anchors the timer */
+ ctdb->banning_ctx = talloc(ctdb, struct ctdb_ban_state);
+ if (ctdb->banning_ctx == NULL) {
+ DEBUG(DEBUG_CRIT,(__location__ " ERROR Failed to allocate new banning state\n"));
+ return -1;
+ }
+ *((struct ctdb_ban_state *)(ctdb->banning_ctx)) = *bantime;
+
+
+ DEBUG(DEBUG_ERR,("Banning this node for %d seconds\n", bantime->time));
+ ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED;
+
+ tevent_add_timer(ctdb->ev, ctdb->banning_ctx,
+ timeval_current_ofs(bantime->time,0),
+ ctdb_ban_node_event, ctdb);
+
+ if (!already_banned) {
+ ctdb_node_become_inactive(ctdb);
+ }
+ return 0;
+}
+
+/*
+ * Control handler: report this node's current ban state.
+ * The result is allocated as a child of outdata; time == 0 when the
+ * node is not banned. Always returns 0 (CTDB_NO_MEMORY returns on OOM).
+ */
+int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+ struct ctdb_ban_state *bantime;
+
+ bantime = talloc(outdata, struct ctdb_ban_state);
+ CTDB_NO_MEMORY(ctdb, bantime);
+
+ if (ctdb->banning_ctx != NULL) {
+ *bantime = *(struct ctdb_ban_state *)(ctdb->banning_ctx);
+ } else {
+ bantime->pnn = ctdb->pnn;
+ bantime->time = 0;
+ }
+
+ outdata->dptr = (uint8_t *)bantime;
+ outdata->dsize = sizeof(struct ctdb_ban_state);
+
+ return 0;
+}
+
+/*
+ * Routine to ban ourselves for a while when trouble strikes.
+ * Bans this node for tunable.recovery_ban_period seconds by invoking
+ * the SET_BAN_STATE control handler directly.
+ */
+void ctdb_ban_self(struct ctdb_context *ctdb)
+{
+	TDB_DATA data;
+	struct ctdb_ban_state bantime;
+	int32_t ret;
+
+	bantime.pnn  = ctdb->pnn;
+	bantime.time = ctdb->tunable.recovery_ban_period;
+
+	data.dsize = sizeof(bantime);
+	data.dptr  = (uint8_t *)&bantime;
+
+	/* The control can fail (e.g. allocation failure); previously the
+	 * return value was silently discarded - at least log it */
+	ret = ctdb_control_set_ban_state(ctdb, data);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " ctdb_control_set_ban_state() failed\n"));
+	}
+}
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
new file mode 100644
index 0000000..a51a92d
--- /dev/null
+++ b/ctdb/server/ctdb_call.c
@@ -0,0 +1,2086 @@
+/*
+ ctdb_call protocol code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ see http://wiki.samba.org/index.php/Samba_%26_Clustering for
+ protocol design and packet details
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/hash_count.h"
+
+/*
+ * Per-record state for a "sticky" record. While pindown is non-NULL the
+ * record is pinned on this node and migration requests for it are
+ * deferred. pindown is used purely as a talloc context anchoring the
+ * pin-down timer and deferred requests (see ctdb_set_sticky_pindown).
+ */
+struct ctdb_sticky_record {
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db;
+ TDB_CONTEXT *pindown;
+};
+
+/*
+ find the ctdb_db from a db index
+
+ Returns NULL when no database with the given id exists (the loop
+ terminates with ctdb_db == NULL).
+ */
+ struct ctdb_db_context *find_ctdb_db(struct ctdb_context *ctdb, uint32_t id)
+{
+ struct ctdb_db_context *ctdb_db;
+
+ for (ctdb_db=ctdb->db_list; ctdb_db; ctdb_db=ctdb_db->next) {
+ if (ctdb_db->db_id == id) {
+ break;
+ }
+ }
+ return ctdb_db;
+}
+
+/*
+ a variant of input packet that can be used in lock requeue
+
+ Adapter with the (void *, hdr) signature expected by the requeue
+ callbacks; simply forwards to ctdb_input_pkt.
+*/
+static void ctdb_call_input_pkt(void *p, struct ctdb_req_header *hdr)
+{
+ struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+ ctdb_input_pkt(ctdb, hdr);
+}
+
+
+/*
+ send an error reply
+
+ Formats a printf-style message and sends a CTDB_REPLY_ERROR packet
+ back to the originator of hdr. Silently does nothing if the transport
+ is down.
+*/
+static void ctdb_send_error(struct ctdb_context *ctdb,
+ struct ctdb_req_header *hdr, uint32_t status,
+ const char *fmt, ...) PRINTF_ATTRIBUTE(4,5);
+static void ctdb_send_error(struct ctdb_context *ctdb,
+ struct ctdb_req_header *hdr, uint32_t status,
+ const char *fmt, ...)
+{
+ va_list ap;
+ struct ctdb_reply_error_old *r;
+ char *msg;
+ int msglen, len;
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_INFO,(__location__ " Failed to send error. Transport is DOWN\n"));
+ return;
+ }
+
+ va_start(ap, fmt);
+ msg = talloc_vasprintf(ctdb, fmt, ap);
+ if (msg == NULL) {
+ ctdb_fatal(ctdb, "Unable to allocate error in ctdb_send_error\n");
+ }
+ va_end(ap);
+
+ /* message is carried inline after the fixed header, NUL included */
+ msglen = strlen(msg)+1;
+ len = offsetof(struct ctdb_reply_error_old, msg);
+ r = ctdb_transport_allocate(ctdb, msg, CTDB_REPLY_ERROR, len + msglen,
+ struct ctdb_reply_error_old);
+ CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+ r->hdr.destnode = hdr->srcnode;
+ r->hdr.reqid = hdr->reqid;
+ r->status = status;
+ r->msglen = msglen;
+ memcpy(&r->msg[0], msg, msglen);
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ /* r is a child of msg, so this releases the packet memory too */
+ talloc_free(msg);
+}
+
+
+/**
+ * send a redirect reply
+ *
+ * The logic behind this function is this:
+ *
+ * A client wants to grab a record and sends a CTDB_REQ_CALL packet
+ * to its local ctdb (ctdb_request_call). If the node is not itself
+ * the record's DMASTER, it first redirects the packet to the
+ * record's LMASTER. The LMASTER then redirects the call packet to
+ * the current DMASTER. Note that this works because of this: When
+ * a record is migrated off a node, then the new DMASTER is stored
+ * in the record's copy on the former DMASTER.
+ */
+static void ctdb_call_send_redirect(struct ctdb_context *ctdb,
+ struct ctdb_db_context *ctdb_db,
+ TDB_DATA key,
+ struct ctdb_req_call_old *c,
+ struct ctdb_ltdb_header *header)
+{
+ uint32_t lmaster = ctdb_lmaster(ctdb, &key);
+
+ /* default: send to the lmaster; if we ARE the lmaster, forward
+  straight to the dmaster recorded in our copy of the record */
+ c->hdr.destnode = lmaster;
+ if (ctdb->pnn == lmaster) {
+ c->hdr.destnode = header->dmaster;
+ }
+ c->hopcount++;
+
+ /* warn on 5 of every 100 hops so a redirect loop shows up in the
+  log without flooding it */
+ if (c->hopcount%100 > 95) {
+ DEBUG(DEBUG_WARNING,("High hopcount %d dbid:%s "
+ "key:0x%08x reqid=%08x pnn:%d src:%d lmaster:%d "
+ "header->dmaster:%d dst:%d\n",
+ c->hopcount, ctdb_db->db_name, ctdb_hash(&key),
+ c->hdr.reqid, ctdb->pnn, c->hdr.srcnode, lmaster,
+ header->dmaster, c->hdr.destnode));
+ }
+
+ ctdb_queue_packet(ctdb, &c->hdr);
+}
+
+
+/*
+ send a dmaster reply
+
+ caller must have the chainlock before calling this routine. Caller must be
+ the lmaster
+
+ Stores the record locally with the new dmaster, then ships the record
+ (key + data + trailing uint32_t flags) to the new dmaster in a
+ CTDB_REPLY_DMASTER packet.
+*/
+static void ctdb_send_dmaster_reply(struct ctdb_db_context *ctdb_db,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA key, TDB_DATA data,
+ uint32_t new_dmaster,
+ uint32_t reqid)
+{
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ struct ctdb_reply_dmaster_old *r;
+ int ret, len;
+ TALLOC_CTX *tmp_ctx;
+
+ if (ctdb->pnn != ctdb_lmaster(ctdb, &key)) {
+ DEBUG(DEBUG_ALERT,(__location__ " Caller is not lmaster!\n"));
+ return;
+ }
+
+ /* record the handover in our local copy before telling anyone */
+ header->dmaster = new_dmaster;
+ ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+ if (ret != 0) {
+ ctdb_fatal(ctdb, "ctdb_send_dmaster_reply unable to update dmaster");
+ return;
+ }
+
+ if (ctdb->methods == NULL) {
+ ctdb_fatal(ctdb, "ctdb_send_dmaster_reply can't update dmaster since transport is down");
+ return;
+ }
+
+ /* put the packet on a temporary context, allowing us to safely free
+ it below even if ctdb_reply_dmaster() has freed it already */
+ tmp_ctx = talloc_new(ctdb);
+
+ /* send the CTDB_REPLY_DMASTER */
+ len = offsetof(struct ctdb_reply_dmaster_old, data) + key.dsize + data.dsize + sizeof(uint32_t);
+ r = ctdb_transport_allocate(ctdb, tmp_ctx, CTDB_REPLY_DMASTER, len,
+ struct ctdb_reply_dmaster_old);
+ CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+ r->hdr.destnode = new_dmaster;
+ r->hdr.reqid = reqid;
+ r->hdr.generation = ctdb_db->generation;
+ r->rsn = header->rsn;
+ r->keylen = key.dsize;
+ r->datalen = data.dsize;
+ r->db_id = ctdb_db->db_id;
+ /* wire layout: key, then data, then the record flags as uint32_t */
+ memcpy(&r->data[0], key.dptr, key.dsize);
+ memcpy(&r->data[key.dsize], data.dptr, data.dsize);
+ memcpy(&r->data[key.dsize+data.dsize], &header->flags, sizeof(uint32_t));
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ talloc_free(tmp_ctx);
+}
+
+/*
+ send a dmaster request (give another node the dmaster for a record)
+
+ This is always sent to the lmaster, which ensures that the lmaster
+ always knows who the dmaster is. The lmaster will then send a
+ CTDB_REPLY_DMASTER to the new dmaster
+*/
+static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db,
+ struct ctdb_req_call_old *c,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA *key, TDB_DATA *data)
+{
+ struct ctdb_req_dmaster_old *r;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ int len;
+ uint32_t lmaster = ctdb_lmaster(ctdb, key);
+
+ if (ctdb->methods == NULL) {
+ ctdb_fatal(ctdb, "Failed ctdb_call_send_dmaster since transport is down");
+ return;
+ }
+
+ if (data->dsize != 0) {
+ header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+ }
+
+ /* shortcut: if we are the lmaster ourselves, answer directly
+  instead of sending a CTDB_REQ_DMASTER over the wire */
+ if (lmaster == ctdb->pnn) {
+ ctdb_send_dmaster_reply(ctdb_db, header, *key, *data,
+ c->hdr.srcnode, c->hdr.reqid);
+ return;
+ }
+
+ /* wire layout: key, then data, then record flags as uint32_t */
+ len = offsetof(struct ctdb_req_dmaster_old, data) + key->dsize + data->dsize
+ + sizeof(uint32_t);
+ r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_DMASTER, len,
+ struct ctdb_req_dmaster_old);
+ CTDB_NO_MEMORY_FATAL(ctdb, r);
+ r->hdr.destnode = lmaster;
+ r->hdr.reqid = c->hdr.reqid;
+ r->hdr.generation = ctdb_db->generation;
+ r->db_id = c->db_id;
+ r->rsn = header->rsn;
+ r->dmaster = c->hdr.srcnode;
+ r->keylen = key->dsize;
+ r->datalen = data->dsize;
+ memcpy(&r->data[0], key->dptr, key->dsize);
+ memcpy(&r->data[key->dsize], data->dptr, data->dsize);
+ memcpy(&r->data[key->dsize + data->dsize], &header->flags, sizeof(uint32_t));
+
+ /* update our local copy to point at the requesting node as the
+  new dmaster before handing the record over */
+ header->dmaster = c->hdr.srcnode;
+ if (ctdb_ltdb_store(ctdb_db, *key, header, *data) != 0) {
+ ctdb_fatal(ctdb, "Failed to store record in ctdb_call_send_dmaster");
+ }
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ talloc_free(r);
+}
+
+/*
+ * Timer event: the pin-down window for a sticky record has ended.
+ * Freeing sr->pindown releases everything allocated on it, which
+ * requeues any requests deferred while the record was pinned.
+ */
+static void ctdb_sticky_pindown_timeout(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_sticky_record *sr = talloc_get_type(private_data,
+ struct ctdb_sticky_record);
+
+ DEBUG(DEBUG_ERR,("Pindown timeout db:%s unstick record\n", sr->ctdb_db->db_name));
+ if (sr->pindown != NULL) {
+ talloc_free(sr->pindown);
+ sr->pindown = NULL;
+ }
+}
+
+/*
+ * If the given key is registered as a sticky record, arm a pin-down
+ * window of tunable.sticky_pindown milliseconds during which further
+ * migration requests for the record are deferred.
+ * Returns 0 when the key is not sticky or the pindown is armed,
+ * -1 on allocation failure.
+ */
+static int
+ctdb_set_sticky_pindown(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ uint32_t *k;
+ struct ctdb_sticky_record *sr;
+
+ k = ctdb_key_to_idkey(tmp_ctx, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+ if (sr == NULL) {
+ talloc_free(tmp_ctx);
+ return 0;
+ }
+
+ talloc_free(tmp_ctx);
+
+ /* only arm a new pindown if one is not already running */
+ if (sr->pindown == NULL) {
+ DEBUG(DEBUG_ERR,("Pinning down record in %s for %d ms\n", ctdb_db->db_name, ctdb->tunable.sticky_pindown));
+ sr->pindown = talloc_new(sr);
+ if (sr->pindown == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate pindown context for sticky record\n"));
+ return -1;
+ }
+ tevent_add_timer(ctdb->ev, sr->pindown,
+ timeval_current_ofs(ctdb->tunable.sticky_pindown / 1000,
+ (ctdb->tunable.sticky_pindown * 1000) % 1000000),
+ ctdb_sticky_pindown_timeout, sr);
+ }
+
+ return 0;
+}
+
+/*
+  called when a CTDB_REPLY_DMASTER packet comes in, or when the lmaster
+  gets a CTDB_REQUEST_DMASTER for itself. We become the dmaster.
+
+  must be called with the chainlock held. This function releases the chainlock
+
+  Stores the record locally with ourselves as dmaster, then completes the
+  matching pending call (looked up by reqid) locally.
+*/
+static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
+				struct ctdb_req_header *hdr,
+				TDB_DATA key, TDB_DATA data,
+				uint64_t rsn, uint32_t record_flags)
+{
+	struct ctdb_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_ltdb_header header;
+	int ret;
+
+	DEBUG(DEBUG_DEBUG,("pnn %u dmaster response %08x\n", ctdb->pnn, ctdb_hash(&key)));
+
+	/* build the new local header: we are the dmaster now */
+	ZERO_STRUCT(header);
+	header.rsn = rsn;
+	header.dmaster = ctdb->pnn;
+	header.flags = record_flags;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+
+	if (state) {
+		if (state->call->flags & CTDB_CALL_FLAG_VACUUM_MIGRATION) {
+			/*
+			 * We temporarily add the VACUUM_MIGRATED flag to
+			 * the record flags, so that ctdb_ltdb_store can
+			 * decide whether the record should be stored or
+			 * deleted.
+			 */
+			header.flags |= CTDB_REC_FLAG_VACUUM_MIGRATED;
+		}
+	}
+
+	if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+		ctdb_fatal(ctdb, "ctdb_reply_dmaster store failed\n");
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	/* we just became DMASTER and this database is "sticky",
+	   see if the record is flagged as "hot" and set up a pin-down
+	   context to stop migrations for a little while if so
+	*/
+	if (ctdb_db_sticky(ctdb_db)) {
+		ctdb_set_sticky_pindown(ctdb, ctdb_db, key);
+	}
+
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_become_dmaster from node %u\n",
+			 ctdb->pnn, hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	if (key.dsize != state->call->key.dsize || memcmp(key.dptr, state->call->key.dptr, key.dsize)) {
+		DEBUG(DEBUG_ERR, ("Got bogus DMASTER packet reqid:%u from node %u. Key does not match key held in matching idr.\n", hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		/* fixed: the "\n" used to sit mid-message, splitting this
+		 * log entry across two lines */
+		DEBUG(DEBUG_ERR, ("Dropped orphan in ctdb_become_dmaster with reqid:%u from node %u\n", hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	/* track how often this key migrates, for hot-key statistics */
+	(void) hash_count_increment(ctdb_db->migratedb, key);
+
+	ctdb_call_local(ctdb_db, state->call, &header, state, &data, true);
+
+	ret = ctdb_ltdb_unlock(ctdb_db, state->call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	state->state = CTDB_CALL_DONE;
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+/* One deferred packet waiting for a dmaster update to finish. */
+struct dmaster_defer_call {
+ struct dmaster_defer_call *next, *prev;
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr;
+};
+
+/* Per-key queue of deferred calls; generation records the database
+ * generation at the time the queue was created so stale queues can be
+ * detected after a recovery. */
+struct dmaster_defer_queue {
+ struct ctdb_db_context *ctdb_db;
+ uint32_t generation;
+ struct dmaster_defer_call *deferred_calls;
+};
+
+/*
+ * Zero-timeout timer callback: re-inject a previously deferred packet
+ * into the normal input path, then free the deferral record.
+ */
+static void dmaster_defer_reprocess(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t,
+ void *private_data)
+{
+ struct dmaster_defer_call *call = talloc_get_type(
+ private_data, struct dmaster_defer_call);
+
+ ctdb_input_pkt(call->ctdb, call->hdr);
+ talloc_free(call);
+}
+
+/*
+ * Destructor for a dmaster defer queue: schedule every queued call for
+ * reprocessing via zero-timeout timer events. If a recovery changed the
+ * database generation in the meantime, the queued requests are simply
+ * dropped (they will be resent anyway - see dmaster_defer_setup).
+ */
+static int dmaster_defer_queue_destructor(struct dmaster_defer_queue *ddq)
+{
+ /* Ignore requests, if database recovery happens in-between. */
+ if (ddq->generation != ddq->ctdb_db->generation) {
+ return 0;
+ }
+
+ while (ddq->deferred_calls != NULL) {
+ struct dmaster_defer_call *call = ddq->deferred_calls;
+
+ DLIST_REMOVE(ddq->deferred_calls, call);
+
+ /* detach the call from the dying queue before scheduling it */
+ talloc_steal(call->ctdb, call);
+ tevent_add_timer(call->ctdb->ev, call, timeval_zero(),
+ dmaster_defer_reprocess, call);
+ }
+ return 0;
+}
+
+/*
+ * trbt insert callback: replace any queue already registered under the
+ * same key with the new one (parm), freeing the old entry (data).
+ */
+static void *insert_ddq_callback(void *parm, void *data)
+{
+ if (data) {
+ talloc_free(data);
+ }
+ return parm;
+}
+
+/**
+ * This function is used to register a key in database that needs to be updated.
+ * Any requests for that key should get deferred till this is completed.
+ *
+ * The queue is allocated as a child of hdr, so it lives exactly as long
+ * as the triggering packet; its destructor requeues deferred calls.
+ * Returns 0 on success (including "queue already exists"), -1 on OOM.
+ */
+static int dmaster_defer_setup(struct ctdb_db_context *ctdb_db,
+ struct ctdb_req_header *hdr,
+ TDB_DATA key)
+{
+ uint32_t *k;
+ struct dmaster_defer_queue *ddq;
+
+ k = ctdb_key_to_idkey(hdr, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer setup\n"));
+ return -1;
+ }
+
+ /* Already exists */
+ ddq = trbt_lookuparray32(ctdb_db->defer_dmaster, k[0], k);
+ if (ddq != NULL) {
+ if (ddq->generation == ctdb_db->generation) {
+ talloc_free(k);
+ return 0;
+ }
+
+ /* Recovery occurred - get rid of old queue. All the deferred
+ * requests will be resent anyway from ctdb_call_resend_db.
+ */
+ talloc_free(ddq);
+ }
+
+ ddq = talloc(hdr, struct dmaster_defer_queue);
+ if (ddq == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer queue\n"));
+ talloc_free(k);
+ return -1;
+ }
+ ddq->ctdb_db = ctdb_db;
+ ddq->generation = hdr->generation;
+ ddq->deferred_calls = NULL;
+
+ trbt_insertarray32_callback(ctdb_db->defer_dmaster, k[0], k,
+ insert_ddq_callback, ddq);
+ talloc_set_destructor(ddq, dmaster_defer_queue_destructor);
+
+ talloc_free(k);
+ return 0;
+}
+
+/*
+ * Append an incoming request to the defer queue for its key, taking
+ * ownership of the packet. Returns 0 when deferred; -1 when there is
+ * no active queue (or it is stale), meaning the caller should process
+ * the request normally.
+ */
+static int dmaster_defer_add(struct ctdb_db_context *ctdb_db,
+ struct ctdb_req_header *hdr,
+ TDB_DATA key)
+{
+ struct dmaster_defer_queue *ddq;
+ struct dmaster_defer_call *call;
+ uint32_t *k;
+
+ k = ctdb_key_to_idkey(hdr, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer add\n"));
+ return -1;
+ }
+
+ ddq = trbt_lookuparray32(ctdb_db->defer_dmaster, k[0], k);
+ if (ddq == NULL) {
+ talloc_free(k);
+ return -1;
+ }
+
+ talloc_free(k);
+
+ /* stale queue from before a recovery: drop it without requeueing
+  (clear the destructor first) and process the request normally */
+ if (ddq->generation != hdr->generation) {
+ talloc_set_destructor(ddq, NULL);
+ talloc_free(ddq);
+ return -1;
+ }
+
+ call = talloc(ddq, struct dmaster_defer_call);
+ if (call == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer call\n"));
+ return -1;
+ }
+
+ call->ctdb = ctdb_db->ctdb;
+ call->hdr = talloc_steal(call, hdr);
+
+ DLIST_ADD_END(ddq->deferred_calls, call);
+
+ return 0;
+}
+
+/*
+ called when a CTDB_REQ_DMASTER packet comes in
+
+ this comes into the lmaster for a record when the current dmaster
+ wants to give up the dmaster role and give it to someone else
+*/
+void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct ctdb_req_dmaster_old *c = (struct ctdb_req_dmaster_old *)hdr;
+ TDB_DATA key, data, data2;
+ struct ctdb_ltdb_header header;
+ struct ctdb_db_context *ctdb_db;
+ uint32_t record_flags = 0;
+ size_t len;
+ int ret;
+
+ /* unpack key, data and (if the packet is long enough to carry
+  them) the trailing record flags from the wire format */
+ key.dptr = c->data;
+ key.dsize = c->keylen;
+ data.dptr = c->data + c->keylen;
+ data.dsize = c->datalen;
+ len = offsetof(struct ctdb_req_dmaster_old, data) + key.dsize + data.dsize
+ + sizeof(uint32_t);
+ if (len <= c->hdr.length) {
+ memcpy(&record_flags, &c->data[c->keylen + c->datalen],
+ sizeof(record_flags));
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, c->db_id);
+ if (!ctdb_db) {
+ ctdb_send_error(ctdb, hdr, -1,
+ "Unknown database in request. db_id==0x%08x",
+ c->db_id);
+ return;
+ }
+
+ /* NOTE(review): return value ignored; on OOM later requests for
+  this key will simply not be deferred - confirm intentional */
+ dmaster_defer_setup(ctdb_db, hdr, key);
+
+ /* fetch the current record */
+ ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2,
+ ctdb_call_input_pkt, ctdb, false);
+ if (ret == -1) {
+ ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record");
+ return;
+ }
+ if (ret == -2) {
+ DEBUG(DEBUG_INFO,(__location__ " deferring ctdb_request_dmaster\n"));
+ return;
+ }
+
+ if (ctdb_lmaster(ctdb, &key) != ctdb->pnn) {
+ DEBUG(DEBUG_ERR, ("dmaster request to non-lmaster "
+ "db=%s lmaster=%u gen=%u curgen=%u\n",
+ ctdb_db->db_name, ctdb_lmaster(ctdb, &key),
+ hdr->generation, ctdb_db->generation));
+ ctdb_fatal(ctdb, "ctdb_req_dmaster to non-lmaster");
+ }
+
+ DEBUG(DEBUG_DEBUG,("pnn %u dmaster request on %08x for %u from %u\n",
+ ctdb->pnn, ctdb_hash(&key), c->dmaster, c->hdr.srcnode));
+
+ /* its a protocol error if the sending node is not the current dmaster */
+ if (header.dmaster != hdr->srcnode) {
+ DEBUG(DEBUG_ALERT,("pnn %u dmaster request for new-dmaster %u from non-master %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u keyval=0x%08x\n",
+ ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+ ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+ (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid,
+ (key.dsize >= 4)?(*(uint32_t *)key.dptr):0));
+ if (header.rsn != 0 || header.dmaster != ctdb->pnn) {
+ DEBUG(DEBUG_ERR,("ctdb_req_dmaster from non-master. Force a recovery.\n"));
+
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ ctdb_ltdb_unlock(ctdb_db, key);
+ return;
+ }
+ }
+
+ if (header.rsn > c->rsn) {
+ DEBUG(DEBUG_ALERT,("pnn %u dmaster request with older RSN new-dmaster %u from %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u\n",
+ ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+ ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+ (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid));
+ }
+
+ /* use the rsn from the sending node */
+ header.rsn = c->rsn;
+
+ /* store the record flags from the sending node */
+ header.flags = record_flags;
+
+ /* check if the new dmaster is the lmaster, in which case we
+ skip the dmaster reply */
+ if (c->dmaster == ctdb->pnn) {
+ ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags);
+ } else {
+ ctdb_send_dmaster_reply(ctdb_db, &header, key, data, c->dmaster, hdr->reqid);
+
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+ }
+}
+
+/*
+ * Timer event: a sticky record's lifetime (tunable.sticky_duration)
+ * has expired - free its state (children, including any pindown
+ * context, are released with it).
+ */
+static void ctdb_sticky_record_timeout(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_sticky_record *sr = talloc_get_type(private_data,
+ struct ctdb_sticky_record);
+ talloc_free(sr);
+}
+
+/*
+ * trbt insert callback: replace an already-registered sticky record
+ * (data) with the new one (parm), logging the collision.
+ */
+static void *ctdb_make_sticky_record_callback(void *parm, void *data)
+{
+ if (data) {
+ DEBUG(DEBUG_ERR,("Already have sticky record registered. Free old %p and create new %p\n", data, parm));
+ talloc_free(data);
+ }
+ return parm;
+}
+
+/*
+ * Register the given key as a sticky record in this database. The entry
+ * expires after tunable.sticky_duration seconds. Returns 0 when the key
+ * is already registered or was registered successfully, -1 on OOM.
+ */
+static int
+ctdb_make_record_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ uint32_t *k;
+ struct ctdb_sticky_record *sr;
+
+ k = ctdb_key_to_idkey(tmp_ctx, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ /* nothing to do if this key is already sticky */
+ sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+ if (sr != NULL) {
+ talloc_free(tmp_ctx);
+ return 0;
+ }
+
+ sr = talloc(ctdb_db->sticky_records, struct ctdb_sticky_record);
+ if (sr == NULL) {
+ talloc_free(tmp_ctx);
+ DEBUG(DEBUG_ERR,("Failed to allocate sticky record structure\n"));
+ return -1;
+ }
+
+ sr->ctdb = ctdb;
+ sr->ctdb_db = ctdb_db;
+ sr->pindown = NULL;
+
+ DEBUG(DEBUG_ERR,("Make record sticky for %d seconds in db %s key:0x%08x.\n",
+ ctdb->tunable.sticky_duration,
+ ctdb_db->db_name, ctdb_hash(&key)));
+
+ trbt_insertarray32_callback(ctdb_db->sticky_records, k[0], &k[0], ctdb_make_sticky_record_callback, sr);
+
+ tevent_add_timer(ctdb->ev, sr,
+ timeval_current_ofs(ctdb->tunable.sticky_duration, 0),
+ ctdb_sticky_record_timeout, sr);
+
+ talloc_free(tmp_ctx);
+ return 0;
+}
+
+/* Carrier for a packet scheduled for re-injection via a zero timer. */
+struct pinned_down_requeue_handle {
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr;
+};
+
+/* A request deferred because its record is pinned down; its destructor
+ * requeues the packet when the pindown context is freed. */
+struct pinned_down_deferred_call {
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr;
+};
+
+/*
+ * Zero-timeout timer callback: hand the deferred packet back to the
+ * input path (ownership moves to ctdb first) and free the handle.
+ */
+static void pinned_down_requeue(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct pinned_down_requeue_handle *handle = talloc_get_type(private_data, struct pinned_down_requeue_handle);
+ struct ctdb_context *ctdb = handle->ctdb;
+
+ talloc_steal(ctdb, handle->hdr);
+ ctdb_call_input_pkt(ctdb, handle->hdr);
+
+ talloc_free(handle);
+}
+
+/*
+ * Destructor for a deferred pinned-down call: when the pindown context
+ * is released, move the packet onto a fresh handle and schedule it for
+ * reprocessing via a zero-timeout timer event.
+ */
+static int pinned_down_destructor(struct pinned_down_deferred_call *pinned_down)
+{
+	struct ctdb_context *ctdb = pinned_down->ctdb;
+	struct pinned_down_requeue_handle *handle =
+		talloc(ctdb, struct pinned_down_requeue_handle);
+
+	/* fixed: the allocation result was previously dereferenced
+	 * without a NULL check */
+	if (handle == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to allocate requeue handle; "
+		       "dropping deferred request\n"));
+		return 0;
+	}
+
+	handle->ctdb = pinned_down->ctdb;
+	handle->hdr = pinned_down->hdr;
+	talloc_steal(handle, handle->hdr);
+
+	tevent_add_timer(ctdb->ev, handle, timeval_zero(),
+			 pinned_down_requeue, handle);
+
+	return 0;
+}
+
+/*
+ * If the key belongs to a sticky record that is currently pinned down,
+ * take ownership of the packet and defer it until the pindown context
+ * is freed (the deferred call's destructor then requeues it).
+ * Returns 0 when deferred, -1 when the caller should process the
+ * request immediately.
+ */
+static int
+ctdb_defer_pinned_down_request(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ uint32_t *k;
+ struct ctdb_sticky_record *sr;
+ struct pinned_down_deferred_call *pinned_down;
+
+ k = ctdb_key_to_idkey(tmp_ctx, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+ if (sr == NULL) {
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ talloc_free(tmp_ctx);
+
+ if (sr->pindown == NULL) {
+ return -1;
+ }
+
+ /* child of sr->pindown, so freeing the pindown context fires the
+  destructor below and requeues the packet */
+ pinned_down = talloc(sr->pindown, struct pinned_down_deferred_call);
+ if (pinned_down == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate structure for deferred pinned down request\n"));
+ return -1;
+ }
+
+ pinned_down->ctdb = ctdb;
+ pinned_down->hdr = hdr;
+
+ talloc_set_destructor(pinned_down, pinned_down_destructor);
+ talloc_steal(pinned_down, hdr);
+
+ return 0;
+}
+
+/*
+ * qsort comparator: orders hot-key entries by ascending migration
+ * count, so the smallest count ends up in slot 0.
+ */
+static int hot_key_cmp(const void *a, const void *b)
+{
+	const struct ctdb_db_hot_key *key_a = a;
+	const struct ctdb_db_hot_key *key_b = b;
+
+	if (key_a->count < key_b->count) {
+		return -1;
+	}
+	return (key_a->count > key_b->count) ? 1 : 0;
+}
+
+/*
+ * Maintain the per-database table of the MAX_HOT_KEYS most-migrated
+ * keys. The table is kept sorted by ascending count, so slot 0 always
+ * holds the smallest count and is the eviction candidate. A key is
+ * logged when first added and again whenever its count has at least
+ * doubled since it was last logged.
+ */
+static void
+ctdb_update_db_stat_hot_keys(struct ctdb_db_context *ctdb_db, TDB_DATA key,
+ unsigned int count)
+{
+ unsigned int i, id;
+ char *keystr;
+
+ /*
+ * If all slots are being used then only need to compare
+ * against the count in the 0th slot, since it contains the
+ * smallest count.
+ */
+ if (ctdb_db->statistics.num_hot_keys == MAX_HOT_KEYS &&
+ count <= ctdb_db->hot_keys[0].count) {
+ return;
+ }
+
+ /* see if we already know this key */
+ for (i = 0; i < MAX_HOT_KEYS; i++) {
+ if (key.dsize != ctdb_db->hot_keys[i].key.dsize) {
+ continue;
+ }
+ if (memcmp(key.dptr, ctdb_db->hot_keys[i].key.dptr, key.dsize)) {
+ continue;
+ }
+ /* found an entry for this key */
+ if (count <= ctdb_db->hot_keys[i].count) {
+ return;
+ }
+ /* only log when the count has doubled since last logged */
+ if (count >= (2 * ctdb_db->hot_keys[i].last_logged_count)) {
+ keystr = hex_encode_talloc(ctdb_db,
+ (unsigned char *)key.dptr,
+ key.dsize);
+ D_NOTICE("Updated hot key database=%s key=%s count=%d\n",
+ ctdb_db->db_name,
+ keystr ? keystr : "" ,
+ count);
+ TALLOC_FREE(keystr);
+ ctdb_db->hot_keys[i].last_logged_count = count;
+ }
+ ctdb_db->hot_keys[i].count = count;
+ goto sort_keys;
+ }
+
+ /* new key: use a free slot, or evict slot 0 (smallest count) */
+ if (ctdb_db->statistics.num_hot_keys < MAX_HOT_KEYS) {
+ id = ctdb_db->statistics.num_hot_keys;
+ ctdb_db->statistics.num_hot_keys++;
+ } else {
+ id = 0;
+ }
+
+ if (ctdb_db->hot_keys[id].key.dptr != NULL) {
+ talloc_free(ctdb_db->hot_keys[id].key.dptr);
+ }
+ ctdb_db->hot_keys[id].key.dsize = key.dsize;
+ ctdb_db->hot_keys[id].key.dptr = talloc_memdup(ctdb_db,
+ key.dptr,
+ key.dsize);
+ ctdb_db->hot_keys[id].count = count;
+
+ keystr = hex_encode_talloc(ctdb_db,
+ (unsigned char *)key.dptr, key.dsize);
+ D_NOTICE("Added hot key database=%s key=%s count=%d\n",
+ ctdb_db->db_name,
+ keystr ? keystr : "" ,
+ count);
+ talloc_free(keystr);
+ ctdb_db->hot_keys[id].last_logged_count = count;
+
+sort_keys:
+ /* restore the ascending-by-count invariant */
+ qsort(&ctdb_db->hot_keys[0],
+ ctdb_db->statistics.num_hot_keys,
+ sizeof(struct ctdb_db_hot_key),
+ hot_key_cmp);
+}
+
+/*
+  called when a CTDB_REQ_CALL packet comes in
+
+  The request may be deferred (pinned-down record, in-flight dmaster
+  migration, in-flight revoke), redirected towards the dmaster/lmaster,
+  answered with a read-only delegation, answered by migrating the
+  record to the caller, or executed locally.
+*/
+void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_call_old *c = (struct ctdb_req_call_old *)hdr;
+	TDB_DATA data;
+	struct ctdb_reply_call_old *r;
+	int ret, len;
+	struct ctdb_ltdb_header header;
+	struct ctdb_call *call;
+	struct ctdb_db_context *ctdb_db;
+	int tmp_count, bucket;
+
+	if (ctdb->methods == NULL) {
+		/* transport is not up (yet, or any more) - drop the packet */
+		DEBUG(DEBUG_INFO,(__location__ " Failed ctdb_request_call. Transport is DOWN\n"));
+		return;
+	}
+
+
+	ctdb_db = find_ctdb_db(ctdb, c->db_id);
+	if (!ctdb_db) {
+		ctdb_send_error(ctdb, hdr, -1,
+				"Unknown database in request. db_id==0x%08x",
+				c->db_id);
+		return;
+	}
+
+	/* call is parented to the packet, so it dies with hdr unless
+	 * explicitly freed earlier */
+	call = talloc(hdr, struct ctdb_call);
+	CTDB_NO_MEMORY_FATAL(ctdb, call);
+
+	/* key and call data are packed back to back in c->data */
+	call->call_id = c->callid;
+	call->key.dptr = c->data;
+	call->key.dsize = c->keylen;
+	call->call_data.dptr = c->data + c->keylen;
+	call->call_data.dsize = c->calldatalen;
+	call->reply_data.dptr = NULL;
+	call->reply_data.dsize = 0;
+
+
+	/* If this record is pinned down we should defer the
+	   request until the pindown times out
+	*/
+	if (ctdb_db_sticky(ctdb_db)) {
+		if (ctdb_defer_pinned_down_request(ctdb, ctdb_db, call->key, hdr) == 0) {
+			DEBUG(DEBUG_WARNING,
+			      ("Defer request for pinned down record in %s\n", ctdb_db->db_name));
+			talloc_free(call);
+			return;
+		}
+	}
+
+	/* a dmaster migration for this key may already be in flight -
+	 * if so, queue this request up behind it */
+	if (dmaster_defer_add(ctdb_db, hdr, call->key) == 0) {
+		talloc_free(call);
+		return;
+	}
+
+	/* determine if we are the dmaster for this key. This also
+	   fetches the record data (if any), thus avoiding a 2nd fetch of the data
+	   if the call will be answered locally */
+
+	ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, call->key, &header, hdr, &data,
+					   ctdb_call_input_pkt, ctdb, false);
+	if (ret == -1) {
+		ctdb_send_error(ctdb, hdr, ret, "ltdb fetch failed in ctdb_request_call");
+		talloc_free(call);
+		return;
+	}
+	if (ret == -2) {
+		/* chainlock not immediately available - the request has
+		 * been requeued and will be retried later */
+		DEBUG(DEBUG_INFO,(__location__ " deferred ctdb_request_call\n"));
+		talloc_free(call);
+		return;
+	}
+
+	/* Dont do READONLY if we don't have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		/* a revoke has just completed - clear all read-only state
+		 * from the record before handling the call */
+		header.flags &= ~CTDB_REC_RO_FLAGS;
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, call->key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * had completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/*
+	 * If we are not the dmaster and are not hosting any delegations,
+	 * then we redirect the request to the node than can answer it
+	 * (the lmaster or the dmaster).
+	 */
+	if ((header.dmaster != ctdb->pnn)
+	    && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) {
+		talloc_free(data.dptr);
+		ctdb_call_send_redirect(ctdb, ctdb_db, call->key, c, &header);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/* a writing call while read-only copies/delegations exist:
+	 * start revoking them and defer this call until that is done */
+	if ( (!(c->flags & CTDB_WANT_READONLY))
+	     && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+
+		return;
+	}
+
+	/* If this is the first request for delegation. bump rsn and set
+	 * the delegations flag
+	 */
+	if ((c->flags & CTDB_WANT_READONLY)
+	    && (c->callid == CTDB_FETCH_WITH_HEADER_FUNC)
+	    && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) {
+		header.rsn += 3;
+		header.flags |= CTDB_REC_RO_HAVE_DELEGATIONS;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+	}
+	if ((c->flags & CTDB_WANT_READONLY)
+	    && ((unsigned int)call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) {
+		TDB_DATA tdata;
+
+		/* record which node received a delegation in the
+		 * tracking db, so it can be revoked later */
+		tdata = tdb_fetch(ctdb_db->rottdb, call->key);
+		if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) {
+			ctdb_fatal(ctdb, "Failed to add node to trackingdb");
+		}
+		if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) {
+			ctdb_fatal(ctdb, "Failed to store trackingdb data");
+		}
+		free(tdata.dptr);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		/* the reply carries the ltdb header followed by the data */
+		len = offsetof(struct ctdb_reply_call_old, data) + data.dsize + sizeof(struct ctdb_ltdb_header);
+		r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len,
+					    struct ctdb_reply_call_old);
+		CTDB_NO_MEMORY_FATAL(ctdb, r);
+		r->hdr.destnode = c->hdr.srcnode;
+		r->hdr.reqid = c->hdr.reqid;
+		r->hdr.generation = ctdb_db->generation;
+		r->status = 0;
+		r->datalen = data.dsize + sizeof(struct ctdb_ltdb_header);
+		/* hand the delegation out with a lower rsn than the
+		 * locally stored copy (which was bumped by 3 when the
+		 * first delegation was created above) */
+		header.rsn -= 2;
+		header.flags |= CTDB_REC_RO_HAVE_READONLY;
+		header.flags &= ~CTDB_REC_RO_HAVE_DELEGATIONS;
+		memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header));
+
+		if (data.dsize) {
+			memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize);
+		}
+
+		ctdb_queue_packet(ctdb, &r->hdr);
+		CTDB_INCREMENT_STAT(ctdb, total_ro_delegations);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_delegations);
+
+		talloc_free(r);
+		talloc_free(call);
+		return;
+	}
+
+	/* hop count statistics: bucket index is roughly log2(hopcount) */
+	CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount);
+	tmp_count = c->hopcount;
+	bucket = 0;
+	while (tmp_count) {
+		tmp_count >>= 1;
+		bucket++;
+	}
+	if (bucket >= MAX_COUNT_BUCKETS) {
+		bucket = MAX_COUNT_BUCKETS - 1;
+	}
+	CTDB_INCREMENT_STAT(ctdb, hop_count_bucket[bucket]);
+	CTDB_INCREMENT_DB_STAT(ctdb_db, hop_count_bucket[bucket]);
+
+	/* If this database supports sticky records, then check if the
+	   hopcount is big. If it is it means the record is hot and we
+	   should make it sticky.
+	*/
+	if (ctdb_db_sticky(ctdb_db) &&
+	    c->hopcount >= ctdb->tunable.hopcount_make_sticky) {
+		ctdb_make_record_sticky(ctdb, ctdb_db, call->key);
+	}
+
+
+	/* Try if possible to migrate the record off to the caller node.
+	 * From the clients perspective a fetch of the data is just as
+	 * expensive as a migration.
+	 */
+	if (c->hdr.srcnode != ctdb->pnn) {
+		if (ctdb_db->persistent_state) {
+			DEBUG(DEBUG_INFO, (__location__ " refusing migration"
+			      " of key %s while transaction is active\n",
+			      (char *)call->key.dptr));
+		} else {
+			DEBUG(DEBUG_DEBUG,("pnn %u starting migration of %08x to %u\n",
+				 ctdb->pnn, ctdb_hash(&(call->key)), c->hdr.srcnode));
+			ctdb_call_send_dmaster(ctdb_db, c, &header, &(call->key), &data);
+			talloc_free(data.dptr);
+
+			ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+			}
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/* the request originated on this node - execute the call here */
+	ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n"));
+		call->status = -1;
+	}
+
+	ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	len = offsetof(struct ctdb_reply_call_old, data) + call->reply_data.dsize;
+	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len,
+				    struct ctdb_reply_call_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, r);
+	r->hdr.destnode = hdr->srcnode;
+	r->hdr.reqid = hdr->reqid;
+	r->hdr.generation = ctdb_db->generation;
+	r->status = call->status;
+	r->datalen = call->reply_data.dsize;
+	if (call->reply_data.dsize) {
+		memcpy(&r->data[0], call->reply_data.dptr, call->reply_data.dsize);
+	}
+
+	ctdb_queue_packet(ctdb, &r->hdr);
+
+	talloc_free(r);
+	talloc_free(call);
+}
+
+/**
+ * called when a CTDB_REPLY_CALL packet comes in
+ *
+ * This packet comes in response to a CTDB_REQ_CALL request packet. It
+ * contains any reply data from the call.
+ *
+ * For FETCH_WITH_HEADER replies the payload starts with a
+ * struct ctdb_ltdb_header describing a possible read-only delegation,
+ * which may need to be written to the local copy of the record.
+ */
+void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_call_old *c = (struct ctdb_reply_call_old *)hdr;
+	struct ctdb_call_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned call reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+
+	/* read only delegation processing */
+	/* If we got a FETCH_WITH_HEADER we should check if this is a ro
+	 * delegation since we may need to update the record header
+	 */
+	if (state->c->callid == CTDB_FETCH_WITH_HEADER_FUNC) {
+		struct ctdb_db_context *ctdb_db = state->ctdb_db;
+		struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)&c->data[0];
+		struct ctdb_ltdb_header oldheader;
+		TDB_DATA key, data, olddata;
+		int ret;
+
+		/* NOTE(review): header->flags/rsn are read before the
+		 * c->datalen length check further down - assumes the
+		 * sender always includes a full header; confirm */
+		if (!(header->flags & CTDB_REC_RO_HAVE_READONLY)) {
+			/* (dead "return;" after this goto removed) */
+			goto finished_ro;
+		}
+
+		key.dsize = state->c->keylen;
+		key.dptr = state->c->data;
+		ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+					     ctdb_call_input_pkt, ctdb, false);
+		if (ret == -2) {
+			/* deferred and requeued - we will be called again */
+			return;
+		}
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_call\n"));
+			return;
+		}
+
+		ret = ctdb_ltdb_fetch(ctdb_db, key, &oldheader, state, &olddata);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to fetch old record in ctdb_reply_call\n"));
+			ctdb_ltdb_unlock(ctdb_db, key);
+			goto finished_ro;
+		}
+
+		/* only store the delegation if it is newer than what we
+		 * already have */
+		if (header->rsn <= oldheader.rsn) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+			goto finished_ro;
+		}
+
+		if (c->datalen < sizeof(struct ctdb_ltdb_header)) {
+			DEBUG(DEBUG_ERR,(__location__ " Got FETCH_WITH_HEADER reply with too little data: %d bytes\n", c->datalen));
+			ctdb_ltdb_unlock(ctdb_db, key);
+			goto finished_ro;
+		}
+
+		data.dsize = c->datalen - sizeof(struct ctdb_ltdb_header);
+		data.dptr = &c->data[sizeof(struct ctdb_ltdb_header)];
+		ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to store new record in ctdb_reply_call\n"));
+			ctdb_ltdb_unlock(ctdb_db, key);
+			goto finished_ro;
+		}
+
+		ctdb_ltdb_unlock(ctdb_db, key);
+	}
+finished_ro:
+
+	/* reply_data points into the packet; steal the packet so the
+	 * data stays alive as long as the call state */
+	state->call->reply_data.dptr = c->data;
+	state->call->reply_data.dsize = c->datalen;
+	state->call->status = c->status;
+
+	talloc_steal(state, c);
+
+	state->state = CTDB_CALL_DONE;
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+
+/**
+ * called when a CTDB_REPLY_DMASTER packet comes in
+ *
+ * This packet comes in from the lmaster in response to a CTDB_REQ_CALL
+ * request packet. It means that the current dmaster wants to give us
+ * the dmaster role.
+ */
+void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_dmaster_old *reply =
+		(struct ctdb_reply_dmaster_old *)hdr;
+	struct ctdb_db_context *ctdb_db = NULL;
+	TDB_DATA key;
+	TDB_DATA data;
+	uint32_t record_flags = 0;
+	size_t flagged_len;
+	int ret;
+
+	ctdb_db = find_ctdb_db(ctdb, reply->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_reply_dmaster\n", reply->db_id));
+		return;
+	}
+
+	/* key and record data are packed back to back in reply->data */
+	key.dptr = reply->data;
+	key.dsize = reply->keylen;
+	data.dptr = &reply->data[key.dsize];
+	data.dsize = reply->datalen;
+
+	/* a trailing uint32_t of record flags is optional; only read it
+	 * when the packet is long enough to actually carry it */
+	flagged_len = offsetof(struct ctdb_reply_dmaster_old, data) +
+		      key.dsize + data.dsize + sizeof(uint32_t);
+	if (flagged_len <= reply->hdr.length) {
+		memcpy(&record_flags,
+		       &reply->data[reply->keylen + reply->datalen],
+		       sizeof(record_flags));
+	}
+
+	dmaster_defer_setup(ctdb_db, hdr, key);
+
+	ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+				     ctdb_call_input_pkt, ctdb, false);
+	if (ret == -2) {
+		/* deferred and requeued - nothing more to do now */
+		return;
+	}
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_dmaster\n"));
+		return;
+	}
+
+	ctdb_become_dmaster(ctdb_db, hdr, key, data, reply->rsn, record_flags);
+}
+
+
+/*
+  called when a CTDB_REPLY_ERROR packet comes in
+*/
+void ctdb_reply_error(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_error_old *err_pkt =
+		(struct ctdb_reply_error_old *)hdr;
+	struct ctdb_call_state *state =
+		reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_error\n",
+			 ctdb->pnn, hdr->reqid));
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned error reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	/* keep the packet alive as long as the call state lives; the
+	 * error message below points into it */
+	talloc_steal(state, err_pkt);
+
+	state->errmsg = (char *)err_pkt->msg;
+	state->state = CTDB_CALL_ERROR;
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+
+/*
+  teardown for a pending ctdb_call: drop it from the per-database
+  pending list and release its request id
+*/
+static int ctdb_call_destructor(struct ctdb_call_state *call_state)
+{
+	struct ctdb_db_context *ctdb_db = call_state->ctdb_db;
+
+	DLIST_REMOVE(ctdb_db->pending_calls, call_state);
+	reqid_remove(ctdb_db->ctdb->idr, call_state->reqid);
+	return 0;
+}
+
+
+/*
+  called when a ctdb_call needs to be resent after a reconfigure event
+*/
+static void ctdb_call_resend(struct ctdb_call_state *state)
+{
+	struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+	struct ctdb_req_call_old *pkt = state->c;
+
+	state->generation = state->ctdb_db->generation;
+
+	/* use a new reqid, in case the old reply does eventually come in */
+	reqid_remove(ctdb->idr, state->reqid);
+	state->reqid = reqid_new(ctdb->idr, state);
+	pkt->hdr.reqid = state->reqid;
+
+	/* stamp the request with the post-recovery generation so it is
+	 * valid with the new vnn_map */
+	pkt->hdr.generation = state->generation;
+
+	/* send the packet to ourselves, it will be redirected appropriately */
+	pkt->hdr.destnode = ctdb->pnn;
+
+	ctdb_queue_packet(ctdb, &pkt->hdr);
+	D_INFO("resent ctdb_call for db %s reqid %u generation %u\n",
+	       state->ctdb_db->db_name,
+	       state->reqid,
+	       state->generation);
+}
+
+/*
+  resend all pending calls on recovery
+ */
+void ctdb_call_resend_db(struct ctdb_db_context *ctdb_db)
+{
+	struct ctdb_call_state *cur = ctdb_db->pending_calls;
+	unsigned int count = 0;
+
+	while (cur != NULL) {
+		/* grab the successor before resending */
+		struct ctdb_call_state *successor = cur->next;
+
+		ctdb_call_resend(cur);
+		count++;
+		cur = successor;
+	}
+
+	/* Avoid logging a 0 count below */
+	if (count == 0) {
+		return;
+	}
+	D_NOTICE("Resent calls for database=%s, generation=%u, count=%u\n",
+		 ctdb_db->db_name,
+		 ctdb_db->generation,
+		 count);
+}
+
+/* resend pending calls for every attached database after recovery */
+void ctdb_call_resend_all(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *db = ctdb->db_list;
+
+	while (db != NULL) {
+		ctdb_call_resend_db(db);
+		db = db->next;
+	}
+}
+
+/*
+  this allows the caller to setup a async.fn
+*/
+static void call_local_trigger(struct tevent_context *ev,
+			       struct tevent_timer *te,
+			       struct timeval t, void *private_data)
+{
+	struct ctdb_call_state *state = talloc_get_type(
+		private_data, struct ctdb_call_state);
+
+	if (state->async.fn != NULL) {
+		state->async.fn(state);
+	}
+}
+
+
+/*
+  construct an event driven local ctdb_call
+
+  this is used so that locally processed ctdb_call requests are processed
+  in an event driven manner
+*/
+struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db,
+					     struct ctdb_call *call,
+					     struct ctdb_ltdb_header *header,
+					     TDB_DATA *data)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_call_state *state;
+	int ret;
+
+	state = talloc_zero(ctdb_db, struct ctdb_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+
+	/* take ownership of the record data for the call's lifetime */
+	talloc_steal(state, data->dptr);
+
+	state->state = CTDB_CALL_DONE;
+	state->ctdb_db = ctdb_db;
+	state->call = talloc(state, struct ctdb_call);
+	CTDB_NO_MEMORY_NULL(ctdb, state->call);
+	*(state->call) = *call;
+
+	ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+	}
+
+	/* fire async.fn from the event loop rather than synchronously,
+	 * so the caller gets a chance to set it up first */
+	tevent_add_timer(ctdb->ev, state, timeval_zero(),
+			 call_local_trigger, state);
+
+	return state;
+}
+
+
+/*
+  make a remote ctdb call - async send. Called in daemon context.
+
+  This constructs a ctdb_call request and queues it for processing.
+  This call never blocks.
+
+  Returns a call state tracked on ctdb_db->pending_calls (so it can be
+  resent after recovery), or NULL on failure.
+*/
+struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctdb_db,
+						     struct ctdb_call *call,
+						     struct ctdb_ltdb_header *header)
+{
+	uint32_t len;
+	struct ctdb_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_req_call_old *c;
+
+	if (ctdb->methods == NULL) {
+		/* transport is not up - cannot send anything */
+		DEBUG(DEBUG_INFO,(__location__ " Failed send packet. Transport is down\n"));
+		return NULL;
+	}
+
+	state = talloc_zero(ctdb_db, struct ctdb_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+	state->call = talloc(state, struct ctdb_call);
+	CTDB_NO_MEMORY_NULL(ctdb, state->call);
+
+	state->reqid = reqid_new(ctdb->idr, state);
+	state->ctdb_db = ctdb_db;
+	state->state = CTDB_CALL_WAIT;
+	state->generation = ctdb_db->generation;
+
+	/* packet payload is the key followed by the call data */
+	len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize +
+	      call->call_data.dsize;
+
+	c = ctdb_transport_allocate(ctdb,
+				    state,
+				    CTDB_REQ_CALL,
+				    len,
+				    struct ctdb_req_call_old);
+
+	CTDB_NO_MEMORY_NULL(ctdb, c);
+	state->c = c;
+
+	/* send towards the current dmaster for the record */
+	c->hdr.destnode = header->dmaster;
+	c->hdr.reqid = state->reqid;
+	c->hdr.generation = ctdb_db->generation;
+	c->flags = call->flags;
+	c->db_id = ctdb_db->db_id;
+	c->callid = call->call_id;
+	c->hopcount = 0;
+	c->keylen = call->key.dsize;
+	c->calldatalen = call->call_data.dsize;
+
+	memcpy(&c->data[0], call->key.dptr, call->key.dsize);
+	memcpy(&c->data[call->key.dsize],
+	       call->call_data.dptr,
+	       call->call_data.dsize);
+
+	/* point the copied call's key/data into the packet, which lives
+	 * as long as state does (c is parented to state) */
+	*(state->call) = *call;
+	state->call->call_data.dptr = &c->data[call->key.dsize];
+	state->call->key.dptr = &c->data[0];
+
+	DLIST_ADD(ctdb_db->pending_calls, state);
+
+	talloc_set_destructor(state, ctdb_call_destructor);
+	ctdb_queue_packet(ctdb, &state->c->hdr);
+
+	return state;
+}
+
+/*
+  make a remote ctdb call - async recv - called in daemon context
+
+  This is called when the program wants to wait for a ctdb_call to complete and get the
+  results. This call will block unless the call has already completed.
+*/
+int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call)
+{
+	struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+	TDB_DATA reply;
+
+	/* pump the event loop until the call completes or errors */
+	while (state->state < CTDB_CALL_DONE) {
+		tevent_loop_once(ctdb->ev);
+	}
+	if (state->state != CTDB_CALL_DONE) {
+		/* CTDB_CALL_ERROR: propagate the error message */
+		ctdb_set_error(ctdb, "%s", state->errmsg);
+		talloc_free(state);
+		return -1;
+	}
+
+	reply = state->call->reply_data;
+	if (reply.dsize == 0) {
+		call->reply_data.dptr = NULL;
+		call->reply_data.dsize = 0;
+	} else {
+		/* copy the reply out of state, which is freed below */
+		call->reply_data.dptr = talloc_memdup(call,
+						      reply.dptr,
+						      reply.dsize);
+		call->reply_data.dsize = reply.dsize;
+	}
+	call->status = state->call->status;
+	talloc_free(state);
+	return 0;
+}
+
+
+/* a call that arrived while a read-only revoke was in progress and
+ * must be re-queued once the revoke child has finished */
+struct revokechild_deferred_call {
+	struct revokechild_deferred_call *prev, *next;
+	struct ctdb_context *ctdb;
+	struct ctdb_req_header *hdr;	/* the original request packet */
+	deferred_requeue_fn fn;		/* requeue callback */
+	void *ctx;			/* argument passed to fn */
+	struct revokechild_handle *rev_hdl; /* the revoke this call waits on */
+};
+
+/* state for one active revoke child process */
+struct revokechild_handle {
+	struct revokechild_handle *next, *prev;
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	struct tevent_fd *fde;	/* watches the status pipe */
+	int status;		/* 0 = success, -1 = revoke failed */
+	int fd[2];		/* pipe: child writes status byte, parent reads */
+	pid_t child;		/* pid of the revoke child process */
+	TDB_DATA key;		/* record whose delegations are being revoked */
+	struct revokechild_deferred_call *deferred_call_list;
+};
+
+/* timer callback: replay every deferred call on the given list */
+static void deferred_call_requeue(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *private_data)
+{
+	struct revokechild_deferred_call *list = talloc_get_type_abort(
+		private_data, struct revokechild_deferred_call);
+
+	while (list != NULL) {
+		struct revokechild_deferred_call *item = list;
+
+		/* clear the destructor first - these entries were
+		 * already unlinked from their revoke handle's list */
+		talloc_set_destructor(item, NULL);
+		DLIST_REMOVE(list, item);
+		item->fn(item->ctx, item->hdr);
+		talloc_free(item);
+	}
+}
+
+/* destructor: drop a deferred call from its revoke handle's list */
+static int deferred_call_destructor(struct revokechild_deferred_call *dcall)
+{
+	DLIST_REMOVE(dcall->rev_hdl->deferred_call_list, dcall);
+	return 0;
+}
+
+/*
+ * Destructor for a revoke child handle: tear down the fd event and
+ * status pipe, kill the child, and requeue all calls that were
+ * deferred while the revoke was in flight.
+ */
+static int revokechild_destructor(struct revokechild_handle *rev_hdl)
+{
+	struct revokechild_deferred_call *now_list = NULL;
+	struct revokechild_deferred_call *delay_list = NULL;
+
+	if (rev_hdl->fde != NULL) {
+		talloc_free(rev_hdl->fde);
+	}
+
+	/* NOTE(review): the fde is set up with
+	 * tevent_fd_set_auto_close(), so freeing it above should
+	 * already close fd[0]; the close() below then closes the same
+	 * fd a second time - harmless in a single-threaded daemon, but
+	 * worth confirming */
+	if (rev_hdl->fd[0] != -1) {
+		close(rev_hdl->fd[0]);
+	}
+	if (rev_hdl->fd[1] != -1) {
+		close(rev_hdl->fd[1]);
+	}
+	ctdb_kill(rev_hdl->ctdb, rev_hdl->child, SIGKILL);
+
+	DLIST_REMOVE(rev_hdl->ctdb_db->revokechild_active, rev_hdl);
+
+	/* split the deferred calls onto two local lists; each entry is
+	 * unlinked here, so its destructor must not run later (it is
+	 * disarmed in deferred_call_requeue) */
+	while (rev_hdl->deferred_call_list != NULL) {
+		struct revokechild_deferred_call *dcall;
+
+		dcall = rev_hdl->deferred_call_list;
+		DLIST_REMOVE(rev_hdl->deferred_call_list, dcall);
+
+		/* If revoke is successful, then first process all the calls
+		 * that need write access, and delay readonly requests by 1
+		 * second grace.
+		 *
+		 * If revoke is unsuccessful, most likely because of node
+		 * failure, delay all the pending requests, so database can
+		 * be recovered.
+		 */
+
+		if (rev_hdl->status == 0) {
+			struct ctdb_req_call_old *c;
+
+			c = (struct ctdb_req_call_old *)dcall->hdr;
+			if (c->flags & CTDB_WANT_READONLY) {
+				DLIST_ADD(delay_list, dcall);
+			} else {
+				DLIST_ADD(now_list, dcall);
+			}
+		} else {
+			DLIST_ADD(delay_list, dcall);
+		}
+	}
+
+	/* requeue write requests immediately (next event loop run) */
+	if (now_list != NULL) {
+		tevent_add_timer(rev_hdl->ctdb->ev,
+				 rev_hdl->ctdb_db,
+				 tevent_timeval_current_ofs(0, 0),
+				 deferred_call_requeue,
+				 now_list);
+	}
+
+	/* requeue the delayed ones after a 1 second grace period */
+	if (delay_list != NULL) {
+		tevent_add_timer(rev_hdl->ctdb->ev,
+				 rev_hdl->ctdb_db,
+				 tevent_timeval_current_ofs(1, 0),
+				 deferred_call_requeue,
+				 delay_list);
+	}
+
+	return 0;
+}
+
+/* fd event: the revoke child reported its status (one byte) on the pipe */
+static void revokechild_handler(struct tevent_context *ev,
+				struct tevent_fd *fde,
+				uint16_t flags, void *private_data)
+{
+	struct revokechild_handle *rev_hdl =
+		talloc_get_type(private_data, struct revokechild_handle);
+	int nread;
+	char c;
+
+	nread = sys_read(rev_hdl->fd[0], &c, 1);
+	if (nread != 1) {
+		DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno));
+		rev_hdl->status = -1;
+	} else if (c != 0) {
+		DEBUG(DEBUG_ERR,("revokechild returned failure. status:%d\n", c));
+		rev_hdl->status = -1;
+	}
+
+	/* freeing the handle runs revokechild_destructor, which requeues
+	 * the deferred calls according to rev_hdl->status */
+	talloc_free(rev_hdl);
+}
+
+/* bookkeeping for revoking all read-only delegations of one record */
+struct ctdb_revoke_state {
+	struct ctdb_db_context *ctdb_db;
+	TDB_DATA key;			/* record being revoked */
+	struct ctdb_ltdb_header *header; /* header pushed to delegation holders */
+	TDB_DATA data;			/* record data pushed with it */
+	int count;			/* update-record controls still in flight */
+	int status;			/* 0 = all ok so far, -1 = any failure */
+	int finished;			/* set when all done or timed out */
+};
+
+/*
+ * Completion callback for one UPDATE_RECORD control sent during a
+ * revoke. Marks the revoke as failed if the control failed, and flags
+ * the whole operation finished once the last outstanding control is in.
+ */
+static void update_record_cb(struct ctdb_client_control_state *state)
+{
+	struct ctdb_revoke_state *revoke_state;
+	int ret;
+	/* initialized: ctdb_control_recv() may fail without setting res,
+	 * which previously left it uninitialized in the log message */
+	int32_t res = 0;
+
+	if (state == NULL) {
+		return;
+	}
+	revoke_state = state->async.private_data;
+
+	state->async.fn = NULL;
+	ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL);
+	if ((ret != 0) || (res != 0)) {
+		DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res));
+		revoke_state->status = -1;
+	}
+
+	revoke_state->count--;
+	if (revoke_state->count <= 0) {
+		revoke_state->finished = 1;
+	}
+}
+
+/* traverse callback: ask one delegation-holding node (pnn) to store
+ * the updated record, revoking its read-only copy */
+static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+	struct ctdb_revoke_state *revoke_state = private_data;
+	struct ctdb_client_control_state *state;
+	struct timeval timeout =
+		timeval_current_ofs(ctdb->tunable.control_timeout, 0);
+
+	state = ctdb_ctrl_updaterecord_send(ctdb,
+					    revoke_state,
+					    timeout,
+					    pnn,
+					    revoke_state->ctdb_db,
+					    revoke_state->key,
+					    revoke_state->header,
+					    revoke_state->data);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n"));
+		revoke_state->status = -1;
+		return;
+	}
+	state->async.fn = update_record_cb;
+	state->async.private_data = revoke_state;
+
+	revoke_state->count++;
+}
+
+/* give up waiting for the revoke to finish and flag it as failed */
+static void ctdb_revoke_timeout_handler(struct tevent_context *ev,
+					struct tevent_timer *te,
+					struct timeval yt, void *private_data)
+{
+	struct ctdb_revoke_state *state = private_data;
+
+	DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n"));
+	state->status = -1;
+	state->finished = 1;
+}
+
+/*
+ * Runs in the revoke child process: push the updated header/data to
+ * every node listed in the tracking record (tdata), then update the
+ * local copy of the record to reflect success or failure.
+ *
+ * Returns 0 on success, -1 on failure (the operation is retried later).
+ */
+static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_revoke_state *state = talloc_zero(ctdb, struct ctdb_revoke_state);
+	struct ctdb_ltdb_header new_header;
+	TDB_DATA new_data;
+
+	state->ctdb_db = ctdb_db;
+	state->key = key;
+	state->header = header;
+	state->data = data;
+
+	/* fire an update-record control at every delegation holder */
+	ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state);
+
+	tevent_add_timer(ctdb->ev, state,
+			 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+			 ctdb_revoke_timeout_handler, state);
+
+	/* wait until all controls have completed or the timeout fired */
+	while (state->finished == 0) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (ctdb_ltdb_lock(ctdb_db, key) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+	if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) {
+		ctdb_ltdb_unlock(ctdb_db, key);
+		DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+	/* sanity check: the record must not have been updated past the
+	 * rsn we were given while the revoke was in flight */
+	header->rsn++;
+	if (new_header.rsn > header->rsn) {
+		ctdb_ltdb_unlock(ctdb_db, key);
+		DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+	/* sanity check: the record must still be flagged as a revoke in
+	 * progress with delegations outstanding */
+	if ( (new_header.flags & (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS)) != (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS) ) {
+		ctdb_ltdb_unlock(ctdb_db, key);
+		DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/*
+	 * If revoke on all nodes succeed, revoke is complete. Otherwise,
+	 * remove CTDB_REC_RO_REVOKING_READONLY flag and retry.
+	 */
+	if (state->status == 0) {
+		new_header.rsn++;
+		new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE;
+	} else {
+		DEBUG(DEBUG_NOTICE, ("Revoke all delegations failed, retrying.\n"));
+		new_header.flags &= ~CTDB_REC_RO_REVOKING_READONLY;
+	}
+	if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) {
+		ctdb_ltdb_unlock(ctdb_db, key);
+		DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+	ctdb_ltdb_unlock(ctdb_db, key);
+
+	talloc_free(state);
+	return 0;
+}
+
+
+/*
+ * Start revoking all read-only delegations for a record.
+ *
+ * Prepares a cleaned-up header (all RO flags stripped) that the
+ * delegation holders should store, then forks a child process that
+ * pushes the update to every node recorded in the tracking db. The
+ * child reports one status byte through a pipe; completion is handled
+ * by revokechild_handler() / revokechild_destructor().
+ *
+ * Returns 0 if the revoke child was started, -1 on error.
+ */
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db,
+				TDB_DATA key,
+				struct ctdb_ltdb_header *header,
+				TDB_DATA data)
+{
+	TDB_DATA tdata;
+	struct revokechild_handle *rev_hdl;
+	pid_t parent = getpid();
+	int ret;
+
+	/* the header pushed to delegation holders must not carry any
+	 * read-only state */
+	header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY |
+			   CTDB_REC_RO_HAVE_DELEGATIONS |
+			   CTDB_REC_RO_HAVE_READONLY);
+
+	header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+	header->rsn -= 1;
+
+	rev_hdl = talloc_zero(ctdb_db, struct revokechild_handle);
+	if (rev_hdl == NULL) {
+		D_ERR("Failed to allocate revokechild_handle\n");
+		return -1;
+	}
+
+	/* copy the tracking record (list of delegation holders) into
+	 * talloc memory owned by the handle */
+	tdata = tdb_fetch(ctdb_db->rottdb, key);
+	if (tdata.dsize > 0) {
+		uint8_t *tmp;
+
+		tmp = tdata.dptr;
+		tdata.dptr = talloc_memdup(rev_hdl, tdata.dptr, tdata.dsize);
+		free(tmp);
+		if (tdata.dptr == NULL) {
+			D_ERR("Failed to copy tracking record for revokechild_handle\n");
+			goto err_out;
+		}
+	}
+
+	rev_hdl->status = 0;
+	rev_hdl->ctdb = ctdb;
+	rev_hdl->ctdb_db = ctdb_db;
+	rev_hdl->fd[0] = -1;
+	rev_hdl->fd[1] = -1;
+
+	rev_hdl->key.dsize = key.dsize;
+	rev_hdl->key.dptr = talloc_memdup(rev_hdl, key.dptr, key.dsize);
+	if (rev_hdl->key.dptr == NULL) {
+		D_ERR("Failed to allocate key for revokechild_handle\n");
+		goto err_out;
+	}
+
+	ret = pipe(rev_hdl->fd);
+	if (ret != 0) {
+		D_ERR("Failed to create pipe for revokechild_handle\n");
+		goto err_out;
+	}
+
+
+	rev_hdl->child = ctdb_fork(ctdb);
+	if (rev_hdl->child == (pid_t)-1) {
+		D_ERR("Failed to fork child for revokechild\n");
+		goto err_out;
+	}
+
+	if (rev_hdl->child == 0) {
+		/* child: perform the revoke and report one status byte */
+		char c = 0;
+		close(rev_hdl->fd[0]);
+
+		prctl_set_comment("ctdb_revokechild");
+		if (switch_from_server_to_client(ctdb) != 0) {
+			D_ERR("Failed to switch from server to client "
+			      "for revokechild process\n");
+			c = 1;
+			goto child_finished;
+		}
+
+		c = ctdb_revoke_all_delegations(ctdb,
+						ctdb_db,
+						tdata,
+						key,
+						header,
+						data);
+
+child_finished:
+		sys_write(rev_hdl->fd[1], &c, 1);
+		/* do not outlive the daemon */
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* parent: keep only the read end of the pipe */
+	close(rev_hdl->fd[1]);
+	rev_hdl->fd[1] = -1;
+	set_close_on_exec(rev_hdl->fd[0]);
+
+	rev_hdl->fde = tevent_add_fd(ctdb->ev,
+				     rev_hdl,
+				     rev_hdl->fd[0],
+				     TEVENT_FD_READ,
+				     revokechild_handler,
+				     (void *)rev_hdl);
+
+	if (rev_hdl->fde == NULL) {
+		D_ERR("Failed to set up fd event for revokechild process\n");
+		/* previously this fell through and dereferenced the
+		 * handle after freeing it - bail out instead */
+		goto err_out;
+	}
+	tevent_fd_set_auto_close(rev_hdl->fde);
+
+	/* This is an active revokechild child process */
+	DLIST_ADD_END(ctdb_db->revokechild_active, rev_hdl);
+	talloc_set_destructor(rev_hdl, revokechild_destructor);
+
+	return 0;
+err_out:
+	/* no destructor installed yet - close the pipe fds by hand */
+	if (rev_hdl->fd[0] != -1) {
+		close(rev_hdl->fd[0]);
+	}
+	if (rev_hdl->fd[1] != -1) {
+		close(rev_hdl->fd[1]);
+	}
+	talloc_free(rev_hdl);
+	return -1;
+}
+
+/* Attach a call to the in-flight revoke of the given key, so it is
+ * replayed once the revoke child finishes.  Returns 0 on success or
+ * -1 when no revoke is active for the key (or on allocation failure). */
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context)
+{
+	struct revokechild_handle *rev_hdl = NULL;
+	struct revokechild_handle *h;
+	struct revokechild_deferred_call *deferred_call;
+
+	/* look for an active revoke child working on the same key */
+	for (h = ctdb_db->revokechild_active; h != NULL; h = h->next) {
+		if (h->key.dsize == 0 || h->key.dsize != key.dsize) {
+			continue;
+		}
+		if (memcmp(h->key.dptr, key.dptr, key.dsize) == 0) {
+			rev_hdl = h;
+			break;
+		}
+	}
+
+	if (rev_hdl == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. revoke structure not found\n"));
+		return -1;
+	}
+
+	deferred_call = talloc(call_context, struct revokechild_deferred_call);
+	if (deferred_call == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
+		return -1;
+	}
+
+	deferred_call->ctdb = ctdb;
+	deferred_call->hdr = talloc_steal(deferred_call, hdr);
+	deferred_call->fn = fn;
+	deferred_call->ctx = call_context;
+	deferred_call->rev_hdl = rev_hdl;
+
+	talloc_set_destructor(deferred_call, deferred_call_destructor);
+
+	DLIST_ADD(rev_hdl->deferred_call_list, deferred_call);
+
+	return 0;
+}
+
+/* hash_count callback: feed one record's migration counter into the
+ * per-database hot key statistics */
+static void ctdb_migration_count_handler(TDB_DATA key, uint64_t counter,
+					 void *private_data)
+{
+	struct ctdb_db_context *ctdb_db = talloc_get_type_abort(
+		private_data, struct ctdb_db_context);
+	unsigned int value = INT_MAX;
+
+	/* clamp the 64-bit counter into the int-sized statistics */
+	if (counter < INT_MAX) {
+		value = counter;
+	}
+	ctdb_update_db_stat_hot_keys(ctdb_db, key, value);
+}
+
+/* periodic (10s) cleanup of expired entries in the migration count db */
+static void ctdb_migration_cleandb_event(struct tevent_context *ev,
+					 struct tevent_timer *te,
+					 struct timeval current_time,
+					 void *private_data)
+{
+	struct ctdb_db_context *ctdb_db = talloc_get_type_abort(
+		private_data, struct ctdb_db_context);
+
+	if (ctdb_db->migratedb == NULL) {
+		return;
+	}
+
+	hash_count_expire(ctdb_db->migratedb, NULL);
+
+	/* re-arm ourselves; on failure drop migration tracking */
+	te = tevent_add_timer(ctdb_db->ctdb->ev, ctdb_db->migratedb,
+			      tevent_timeval_current_ofs(10, 0),
+			      ctdb_migration_cleandb_event, ctdb_db);
+	if (te != NULL) {
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Memory error in migration cleandb event for %s\n",
+	       ctdb_db->db_name));
+	TALLOC_FREE(ctdb_db->migratedb);
+}
+
+/* set up migration (hot key) tracking for a volatile database */
+int ctdb_migration_init(struct ctdb_db_context *ctdb_db)
+{
+	struct timeval one_second = { 1, 0 };
+	struct tevent_timer *te = NULL;
+	int ret;
+
+	/* hot-key tracking is only done for volatile databases */
+	if (!ctdb_db_volatile(ctdb_db)) {
+		return 0;
+	}
+
+	ret = hash_count_init(ctdb_db,
+			      one_second,
+			      ctdb_migration_count_handler,
+			      ctdb_db,
+			      &ctdb_db->migratedb);
+	if (ret != 0) {
+		goto fail;
+	}
+
+	/* first cleanup run in 10 seconds; re-armed from the handler */
+	te = tevent_add_timer(ctdb_db->ctdb->ev, ctdb_db->migratedb,
+			      tevent_timeval_current_ofs(10, 0),
+			      ctdb_migration_cleandb_event, ctdb_db);
+	if (te == NULL) {
+		TALLOC_FREE(ctdb_db->migratedb);
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	DEBUG(DEBUG_ERR,
+	      ("Memory error in migration init for %s\n",
+	       ctdb_db->db_name));
+	return -1;
+}
diff --git a/ctdb/server/ctdb_client.c b/ctdb/server/ctdb_client.c
new file mode 100644
index 0000000..c9edb1d
--- /dev/null
+++ b/ctdb/server/ctdb_client.c
@@ -0,0 +1,1709 @@
+/*
+ ctdb daemon code
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/locale.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/time.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+  allocate a packet for use in client<->daemon communication
+
+  length is the requested payload size and slength a minimum size;
+  the larger of the two is used and the allocation is rounded up to
+  CTDB_DS_ALIGNMENT and zero-filled.  hdr->length records the
+  unrounded length.  Returns NULL on allocation failure.
+ */
+struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
+					    TALLOC_CTX *mem_ctx,
+					    enum ctdb_operation operation,
+					    size_t length, size_t slength,
+					    const char *type)
+{
+	int size;
+	struct ctdb_req_header *hdr;
+
+	length = MAX(length, slength);
+	/* round up to the alignment used on the wire */
+	size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
+
+	hdr = (struct ctdb_req_header *)talloc_zero_size(mem_ctx, size);
+	if (hdr == NULL) {
+		DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n",
+			 operation, (unsigned)length));
+		return NULL;
+	}
+	talloc_set_name_const(hdr, type);
+	hdr->length = length;
+	hdr->operation = operation;
+	hdr->ctdb_magic = CTDB_MAGIC;
+	hdr->ctdb_version = CTDB_PROTOCOL;
+	hdr->srcnode = ctdb->pnn;
+	/* generation is left 0 until a vnn map exists */
+	if (ctdb->vnn_map) {
+		hdr->generation = ctdb->vnn_map->generation;
+	}
+
+	return hdr;
+}
+
+/*
+  local version of ctdb_call
+
+  Runs the call function registered for call->call_id against the
+  record (header/data) which the caller has already fetched - the
+  caller is expected to hold the record lock.  On success the
+  (possibly updated) record is written back when updatetdb is true,
+  and call->reply_data / call->status are filled in.  The reply data
+  is stolen onto the call structure.  Returns 0 on success, -1 on
+  error (unknown call id, call function failure, or store failure).
+*/
+int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
+		    struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
+		    TDB_DATA *data, bool updatetdb)
+{
+	struct ctdb_call_info *c;
+	struct ctdb_registered_call *fn;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+	c = talloc_zero(mem_ctx, struct ctdb_call_info);
+	CTDB_NO_MEMORY(ctdb, c);
+
+	/* give the call function a private copy of the record data */
+	c->key = call->key;
+	c->call_data = &call->call_data;
+	c->record_data.dptr = talloc_memdup(c, data->dptr, data->dsize);
+	c->record_data.dsize = data->dsize;
+	CTDB_NO_MEMORY(ctdb, c->record_data.dptr);
+	c->header = header;
+
+	/* look up the registered call function by id */
+	for (fn=ctdb_db->calls;fn;fn=fn->next) {
+		if (fn->id == (uint32_t)call->call_id) {
+			break;
+		}
+	}
+	if (fn == NULL) {
+		ctdb_set_error(ctdb, "Unknown call id %u\n", call->call_id);
+		talloc_free(c);
+		return -1;
+	}
+
+	if (fn->fn(c) != 0) {
+		ctdb_set_error(ctdb, "ctdb_call %u failed\n", call->call_id);
+		talloc_free(c);
+		return -1;
+	}
+
+	/* we need to force the record to be written out if this was a remote access */
+	if (c->new_data == NULL) {
+		c->new_data = &c->record_data;
+	}
+
+	if (c->new_data && updatetdb) {
+		/* XXX check that we always have the lock here? */
+		if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
+			ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
+			talloc_free(c);
+			return -1;
+		}
+	}
+
+	if (c->reply_data) {
+		call->reply_data = *c->reply_data;
+
+		/* keep the reply buffer alive after c is freed below */
+		talloc_steal(call, call->reply_data.dptr);
+		talloc_set_name_const(call->reply_data.dptr, __location__);
+	} else {
+		call->reply_data.dptr = NULL;
+		call->reply_data.dsize = 0;
+	}
+	call->status = c->status;
+
+	talloc_free(c);
+
+	return 0;
+}
+
+
+/*
+  queue a packet for sending from client to daemon
+
+  Returns the result of ctdb_queue_send() (0 on success).
+*/
+static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	return ctdb_queue_send(ctdb->daemon.queue, (uint8_t *)hdr, hdr->length);
+}
+
+
+/*
+  called when a CTDB_REPLY_CALL packet comes in in the client
+
+  This packet comes in response to a CTDB_REQ_CALL request packet. It
+  contains any reply data from the call.  Matches the reply to the
+  pending call state by reqid, marks the call done and invokes the
+  async completion callback if one was registered.
+*/
+static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_call_old *c = (struct ctdb_reply_call_old *)hdr;
+	struct ctdb_client_call_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped client call reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	/* reply_data points into the packet, so steal the packet onto
+	   state below to keep it alive as long as state lives */
+	state->call->reply_data.dptr = c->data;
+	state->call->reply_data.dsize = c->datalen;
+	state->call->status = c->status;
+
+	talloc_steal(state, c);
+
+	state->state = CTDB_CALL_DONE;
+
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+/*
+ * Handle an incoming CTDB_REQ_MESSAGE in the client: copy out the
+ * payload and dispatch it to the handler registered for its srvid.
+ */
+void ctdb_request_message(struct ctdb_context *ctdb,
+			  struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_message_old *c = (struct ctdb_req_message_old *)hdr;
+	TDB_DATA data;
+
+	data.dsize = c->datalen;
+	/* copy the data - the packet itself is freed by our caller */
+	data.dptr = talloc_memdup(c, &c->data[0], c->datalen);
+	if (data.dptr == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Memory allocation failure\n"));
+		return;
+	}
+
+	srvid_dispatch(ctdb->srv, c->srvid, CTDB_SRVID_ALL, data);
+}
+
+static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+
+/*
+  this is called in the client, when data comes in from the daemon
+
+  Validates the packet (length, magic, protocol version) and
+  dispatches it by operation code.  A zero-length read means the
+  daemon has gone away, in which case the client exits.
+ */
+void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+	struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context);
+	struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+	TALLOC_CTX *tmp_ctx;
+
+	/* place the packet as a child of a tmp_ctx. We then use
+	   talloc_free() below to free it. If any of the calls want
+	   to keep it, then they will steal it somewhere else, and the
+	   talloc_free() will be a no-op */
+	tmp_ctx = talloc_new(ctdb);
+	talloc_steal(tmp_ctx, hdr);
+
+	if (cnt == 0) {
+		DEBUG(DEBUG_CRIT,("Daemon has exited - shutting down client\n"));
+		exit(1);
+	}
+
+	if (cnt < sizeof(*hdr)) {
+		DEBUG(DEBUG_CRIT,("Bad packet length %u in client\n", (unsigned)cnt));
+		goto done;
+	}
+	if (cnt != hdr->length) {
+		ctdb_set_error(ctdb, "Bad header length %u expected %u in client\n",
+			       (unsigned)hdr->length, (unsigned)cnt);
+		goto done;
+	}
+
+	if (hdr->ctdb_magic != CTDB_MAGIC) {
+		ctdb_set_error(ctdb, "Non CTDB packet rejected in client\n");
+		goto done;
+	}
+
+	if (hdr->ctdb_version != CTDB_PROTOCOL) {
+		ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in client\n", hdr->ctdb_version);
+		goto done;
+	}
+
+	switch (hdr->operation) {
+	case CTDB_REPLY_CALL:
+		ctdb_client_reply_call(ctdb, hdr);
+		break;
+
+	case CTDB_REQ_MESSAGE:
+		ctdb_request_message(ctdb, hdr);
+		break;
+
+	case CTDB_REPLY_CONTROL:
+		ctdb_client_reply_control(ctdb, hdr);
+		break;
+
+	default:
+		DEBUG(DEBUG_CRIT,("bogus operation code:%u\n",hdr->operation));
+	}
+
+done:
+	talloc_free(tmp_ctx);
+}
+
+/*
+  connect to a unix domain socket
+
+  Connects the client to the ctdbd unix domain socket named by
+  ctdb->daemon.name, makes the fd non-blocking and close-on-exec, and
+  sets up the packet queue used for all client<->daemon traffic.
+  Returns 0 on success, -1 on error; on error ctdb->daemon.sd is
+  left as -1.
+*/
+int ctdb_socket_connect(struct ctdb_context *ctdb)
+{
+	struct sockaddr_un addr;
+	int ret;
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+	ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ctdb->daemon.sd == -1) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
+		return -1;
+	}
+
+	if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       "Failed to connect client socket to daemon (%s)\n",
+		       strerror(errno)));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+
+	/* the client event loop drives this fd, so it must not block */
+	ret = set_blocking(ctdb->daemon.sd, false);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " failed to set socket non-blocking (%s)\n",
+		       strerror(errno)));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+
+	set_close_on_exec(ctdb->daemon.sd);
+
+	ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd,
+					      CTDB_DS_ALIGNMENT,
+					      ctdb_client_read_cb, ctdb, "to-ctdbd");
+	if (ctdb->daemon.queue == NULL) {
+		/* without a queue nothing can ever be sent - fail now
+		   rather than crashing on the first queued packet */
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to set up packet queue\n"));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+	return 0;
+}
+
+
+/* Handle representing a fetched record held by a client. */
+struct ctdb_record_handle {
+	struct ctdb_db_context *ctdb_db;	/* database the record lives in */
+	TDB_DATA key;				/* record key */
+	TDB_DATA *data;				/* current record data */
+	struct ctdb_ltdb_header header;		/* ltdb header as fetched */
+};
+
+
+/*
+  make a recv call to the local ctdb daemon - called from client context
+
+  This is called when the program wants to wait for a ctdb_call to complete and get the
+  results. This call will block unless the call has already completed.
+  Consumes and frees state in all paths; reply data is duplicated onto
+  the database context.  Returns the call status, or -1 on error.
+*/
+int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
+{
+	if (state == NULL) {
+		return -1;
+	}
+
+	/* pump the event loop until the reply (or an error) arrives */
+	while (state->state < CTDB_CALL_DONE) {
+		tevent_loop_once(state->ctdb_db->ctdb->ev);
+	}
+	if (state->state != CTDB_CALL_DONE) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_call_recv failed\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	if (state->call->reply_data.dsize) {
+		/* copy the reply out of state before state is freed;
+		   NOTE(review): the talloc_memdup result is not checked
+		   here - on OOM dptr is NULL with a non-zero dsize */
+		call->reply_data.dptr = talloc_memdup(state->ctdb_db,
+						      state->call->reply_data.dptr,
+						      state->call->reply_data.dsize);
+		call->reply_data.dsize = state->call->reply_data.dsize;
+	} else {
+		call->reply_data.dptr = NULL;
+		call->reply_data.dsize = 0;
+	}
+	call->status = state->call->status;
+	talloc_free(state);
+
+	return call->status;
+}
+
+
+
+
+/*
+  destroy a ctdb_call in client
+
+  Destructor: drop the reqid mapping so a late reply cannot find a
+  freed state structure.
+*/
+static int ctdb_client_call_destructor(struct ctdb_client_call_state *state)
+{
+	reqid_remove(state->ctdb_db->ctdb->idr, state->reqid);
+	return 0;
+}
+
+/*
+  construct an event driven local ctdb_call
+
+  this is used so that locally processed ctdb_call requests are processed
+  in an event driven manner.  The call runs synchronously here but the
+  returned state is already in CTDB_CALL_DONE, so ctdb_call_recv() on
+  it will not block.  Takes ownership of data->dptr.
+*/
+static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db_context *ctdb_db,
+								  struct ctdb_call *call,
+								  struct ctdb_ltdb_header *header,
+								  TDB_DATA *data)
+{
+	struct ctdb_client_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int ret;
+
+	state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+	state->call = talloc_zero(state, struct ctdb_call);
+	CTDB_NO_MEMORY_NULL(ctdb, state->call);
+
+	/* keep the record data alive for the duration of the call */
+	talloc_steal(state, data->dptr);
+
+	state->state = CTDB_CALL_DONE;
+	*(state->call) = *call;
+	state->ctdb_db = ctdb_db;
+
+	ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+	}
+
+	return state;
+}
+
+/*
+  make a ctdb call to the local daemon - async send. Called from client context.
+
+  This constructs a ctdb_call request and queues it for processing.
+  This call never blocks.  If this node is already dmaster for the
+  record the call is executed locally without involving the daemon.
+  Returns the pending call state, or NULL on failure.
+*/
+struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db,
+					      struct ctdb_call *call)
+{
+	struct ctdb_client_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_ltdb_header header;
+	TDB_DATA data;
+	int ret;
+	size_t len;
+	struct ctdb_req_call_old *c;
+
+	/* if the domain socket is not yet open, open it */
+	if (ctdb->daemon.sd==-1) {
+		ret = ctdb_socket_connect(ctdb);
+		if (ret != 0) {
+			/* previously ignored; a failed connect would
+			   surface later as a send on a dead queue */
+			DEBUG(DEBUG_ERR,
+			      (__location__ " Failed to connect to daemon\n"));
+			return NULL;
+		}
+	}
+
+	ret = ctdb_ltdb_lock(ctdb_db, call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to get chainlock\n"));
+		return NULL;
+	}
+
+	ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
+
+	/* immediate migration is not possible while read-only
+	   delegations are outstanding - force the daemon path */
+	if ((call->flags & CTDB_IMMEDIATE_MIGRATION) && (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+		ret = -1;
+	}
+
+	if (ret == 0 && header.dmaster == ctdb->pnn) {
+		/* fast path: we are dmaster, run the call locally */
+		state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
+		talloc_free(data.dptr);
+		ctdb_ltdb_unlock(ctdb_db, call->key);
+		return state;
+	}
+
+	ctdb_ltdb_unlock(ctdb_db, call->key);
+	talloc_free(data.dptr);
+
+	state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate state\n"));
+		return NULL;
+	}
+	state->call = talloc_zero(state, struct ctdb_call);
+	if (state->call == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate state->call\n"));
+		talloc_free(state);
+		return NULL;
+	}
+
+	len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize + call->call_data.dsize;
+	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CALL, len, struct ctdb_req_call_old);
+	if (c == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate packet\n"));
+		talloc_free(state);
+		return NULL;
+	}
+
+	state->reqid = reqid_new(ctdb->idr, state);
+	state->ctdb_db = ctdb_db;
+	talloc_set_destructor(state, ctdb_client_call_destructor);
+
+	c->hdr.reqid = state->reqid;
+	c->flags = call->flags;
+	c->db_id = ctdb_db->db_id;
+	c->callid = call->call_id;
+	c->hopcount = 0;
+	c->keylen = call->key.dsize;
+	c->calldatalen = call->call_data.dsize;
+	memcpy(&c->data[0], call->key.dptr, call->key.dsize);
+	memcpy(&c->data[call->key.dsize],
+	       call->call_data.dptr, call->call_data.dsize);
+	*(state->call) = *call;
+	/* point key/call_data at the copies inside the packet so they
+	   stay valid for the lifetime of state */
+	state->call->call_data.dptr = &c->data[call->key.dsize];
+	state->call->key.dptr = &c->data[0];
+
+	state->state = CTDB_CALL_WAIT;
+
+	/* check the queue result (consistent with ctdb_control_send);
+	   an unsent request would otherwise wait forever for a reply */
+	ret = ctdb_client_queue_pkt(ctdb, &c->hdr);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to queue packet\n"));
+		talloc_free(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+
+/*
+  Synchronous ctdb_call: issue the request and block until the reply
+  arrives.  Equivalent to ctdb_call_send() followed by
+  ctdb_call_recv().
+*/
+int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+	struct ctdb_client_call_state *state = ctdb_call_send(ctdb_db, call);
+
+	return ctdb_call_recv(state, call);
+}
+
+
+/*
+  tell the daemon what messaging srvid we will use, and register the message
+  handler function in the client
+
+  Returns 0 on success, -1 on failure (either the daemon-side
+  registration or the local srvid registration can fail).
+*/
+int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid,
+				    srvid_handler_fn handler,
+				    void *private_data)
+{
+	int res;
+	int32_t status;
+
+	res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid,
+			   CTDB_CONTROL_REGISTER_SRVID, 0,
+			   tdb_null, NULL, NULL, &status, NULL, NULL);
+	if (res != 0 || status != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to register srvid %llu\n",
+		       (unsigned long long)srvid));
+		return -1;
+	}
+
+	/* also need to register the handler with our own ctdb structure */
+	return srvid_register(ctdb->srv, ctdb, srvid, handler, private_data);
+}
+
+/*
+  tell the daemon we no longer want a srvid
+
+  Deregisters on the daemon side first, then drops the local handler.
+  Returns 0 on success, -1 if the daemon-side deregistration fails.
+*/
+int ctdb_client_remove_message_handler(struct ctdb_context *ctdb,
+				       uint64_t srvid, void *private_data)
+{
+	int res;
+	int32_t status;
+
+	res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid,
+			   CTDB_CONTROL_DEREGISTER_SRVID, 0,
+			   tdb_null, NULL, NULL, &status, NULL, NULL);
+	if (res != 0 || status != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to deregister srvid %llu\n",
+		       (unsigned long long)srvid));
+		return -1;
+	}
+
+	/* also need to deregister the handler from our own ctdb structure */
+	srvid_deregister(ctdb->srv, srvid, private_data);
+	return 0;
+}
+
+/*
+  send a message - from client context
+
+  Builds a CTDB_REQ_MESSAGE packet addressed to pnn/srvid and queues
+  it to the daemon.  Returns the queue result (0 on success).
+ */
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+			     uint64_t srvid, TDB_DATA data)
+{
+	struct ctdb_req_message_old *r;
+	int len, res;
+
+	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+	r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE,
+			       len, struct ctdb_req_message_old);
+	CTDB_NO_MEMORY(ctdb, r);
+
+	r->hdr.destnode = pnn;
+	r->srvid = srvid;
+	r->datalen = data.dsize;
+	memcpy(&r->data[0], data.dptr, data.dsize);
+
+	/* the queue copies the packet, so it can be freed right away */
+	res = ctdb_client_queue_pkt(ctdb, &r->hdr);
+	talloc_free(r);
+	return res;
+}
+
+
+/*
+  called when a control completes or timesout to invoke the callback
+  function the user provided
+
+  Runs from a zero-delay timer so the user callback executes from the
+  event loop, not from deep inside packet processing.
+*/
+static void invoke_control_callback(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t, void *private_data)
+{
+	struct ctdb_client_control_state *state;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	int ret;
+
+	/* reparent state so it is freed exactly once, below */
+	state = talloc_get_type(private_data, struct ctdb_client_control_state);
+	talloc_steal(tmp_ctx, state);
+
+	ret = ctdb_control_recv(state->ctdb, state, state,
+				NULL,
+				NULL,
+				NULL);
+	if (ret != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_control_recv() failed, ignoring return code %d\n", ret));
+	}
+
+	talloc_free(tmp_ctx);
+}
+
+/*
+  called when a CTDB_REPLY_CONTROL packet comes in in the client
+
+  This packet comes in response to a CTDB_REQ_CONTROL request packet. It
+  contains any reply data from the control.  Matches the reply to the
+  pending control state by reqid, records outdata/status/errormsg and
+  schedules the user callback (if any) via a zero-delay timer.
+*/
+static void ctdb_client_reply_control(struct ctdb_context *ctdb,
+				      struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_control_old *c = (struct ctdb_reply_control_old *)hdr;
+	struct ctdb_client_control_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_control_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned reply control with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	state->outdata.dptr = c->data;
+	state->outdata.dsize = c->datalen;
+	state->status = c->status;
+	/* the error string (if any) follows the data in the packet */
+	if (c->errorlen) {
+		state->errormsg = talloc_strndup(state,
+						 (char *)&c->data[c->datalen],
+						 c->errorlen);
+	}
+
+	/* state->outdata now uses resources from c so we don't want c
+	   to just disappear from under us while state is still alive
+	 */
+	talloc_steal(state, c);
+
+	state->state = CTDB_CONTROL_DONE;
+
+	/* if we had a callback registered for this control, pull the response
+	   and call the callback.
+	 */
+	if (state->async.fn) {
+		tevent_add_timer(ctdb->ev, state, timeval_zero(),
+				 invoke_control_callback, state);
+	}
+}
+
+
+/*
+  destroy a ctdb_control in client
+
+  Destructor: drop the reqid mapping so a late reply cannot find a
+  freed state structure.
+*/
+static int ctdb_client_control_destructor(struct ctdb_client_control_state *state)
+{
+	reqid_remove(state->ctdb->idr, state->reqid);
+	return 0;
+}
+
+
+/* time out handler for ctdb_control
+
+   Marks the pending control as timed out and, if a user callback was
+   registered, schedules it via a zero-delay timer. */
+static void control_timeout_func(struct tevent_context *ev,
+				 struct tevent_timer *te,
+				 struct timeval t, void *private_data)
+{
+	struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state);
+
+	DEBUG(DEBUG_ERR,(__location__ " control timed out. reqid:%u opcode:%u "
+			 "dstnode:%u\n", state->reqid, state->c->opcode,
+			 state->c->hdr.destnode));
+
+	state->state = CTDB_CONTROL_TIMEOUT;
+
+	/* if we had a callback registered for this control, pull the response
+	   and call the callback.
+	 */
+	if (state->async.fn) {
+		tevent_add_timer(state->ctdb->ev, state, timeval_zero(),
+				 invoke_control_callback, state);
+	}
+}
+
+/* async version of send control request
+
+   Builds and queues a CTDB_REQ_CONTROL packet.  An optional timeout
+   arms control_timeout_func().  Returns the pending state, or NULL on
+   failure OR when CTDB_CTRL_FLAG_NOREPLY is set (fire-and-forget) -
+   callers cannot distinguish those two NULL cases; see the FIXME in
+   ctdb_control().
+
+   NOTE(review): the ctdb_socket_connect() return value is not
+   checked here; a failed connect presumably surfaces on the
+   subsequent queue send - confirm. */
+struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
+		uint32_t destnode, uint64_t srvid,
+		uint32_t opcode, uint32_t flags, TDB_DATA data,
+		TALLOC_CTX *mem_ctx,
+		struct timeval *timeout,
+		char **errormsg)
+{
+	struct ctdb_client_control_state *state;
+	size_t len;
+	struct ctdb_req_control_old *c;
+	int ret;
+
+	if (errormsg) {
+		*errormsg = NULL;
+	}
+
+	/* if the domain socket is not yet open, open it */
+	if (ctdb->daemon.sd==-1) {
+		ctdb_socket_connect(ctdb);
+	}
+
+	state = talloc_zero(mem_ctx, struct ctdb_client_control_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+
+	state->ctdb = ctdb;
+	state->reqid = reqid_new(ctdb->idr, state);
+	state->state = CTDB_CONTROL_WAIT;
+	state->errormsg = NULL;
+
+	talloc_set_destructor(state, ctdb_client_control_destructor);
+
+	len = offsetof(struct ctdb_req_control_old, data) + data.dsize;
+	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL,
+			       len, struct ctdb_req_control_old);
+	state->c = c;
+	CTDB_NO_MEMORY_NULL(ctdb, c);
+	c->hdr.reqid = state->reqid;
+	c->hdr.destnode = destnode;
+	c->opcode = opcode;
+	c->client_id = 0;
+	c->flags = flags;
+	c->srvid = srvid;
+	c->datalen = data.dsize;
+	if (data.dsize) {
+		memcpy(&c->data[0], data.dptr, data.dsize);
+	}
+
+	/* timeout */
+	if (timeout && !timeval_is_zero(timeout)) {
+		tevent_add_timer(ctdb->ev, state, *timeout,
+				 control_timeout_func, state);
+	}
+
+	ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
+	if (ret != 0) {
+		talloc_free(state);
+		return NULL;
+	}
+
+	/* fire-and-forget: no reply will come, so no state to keep */
+	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+		talloc_free(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+
+/* async version of receive control reply
+
+   Blocks in the event loop until the control completes or times out.
+   Consumes and frees state in all paths.  outdata (if requested) is
+   duplicated onto mem_ctx.  Returns 0 on success, the daemon status
+   (or -1) when an error message was returned, -1 on timeout/error. */
+int ctdb_control_recv(struct ctdb_context *ctdb,
+		struct ctdb_client_control_state *state,
+		TALLOC_CTX *mem_ctx,
+		TDB_DATA *outdata, int32_t *status, char **errormsg)
+{
+	TALLOC_CTX *tmp_ctx;
+
+	if (status != NULL) {
+		*status = -1;
+	}
+	if (errormsg != NULL) {
+		*errormsg = NULL;
+	}
+
+	if (state == NULL) {
+		return -1;
+	}
+
+	/* prevent double free of state */
+	tmp_ctx = talloc_new(ctdb);
+	talloc_steal(tmp_ctx, state);
+
+	/* loop one event at a time until we either timeout or the control
+	   completes.
+	*/
+	while (state->state == CTDB_CONTROL_WAIT) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (state->state != CTDB_CONTROL_DONE) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control_recv failed\n"));
+		if (state->async.fn) {
+			state->async.fn(state);
+		}
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	if (state->errormsg) {
+		/* map status 0 to -1 so an error never looks like success */
+		int s = (state->status == 0 ? -1 : state->status);
+		DEBUG(DEBUG_ERR,("ctdb_control error: '%s'\n", state->errormsg));
+		if (errormsg) {
+			(*errormsg) = talloc_move(mem_ctx, &state->errormsg);
+		}
+		if (state->async.fn) {
+			state->async.fn(state);
+		}
+		talloc_free(tmp_ctx);
+		return s;
+	}
+
+	if (outdata) {
+		*outdata = state->outdata;
+		outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
+	}
+
+	if (status) {
+		*status = state->status;
+	}
+
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+
+
+/*
+  send a ctdb control message
+  timeout specifies how long we should wait for a reply.
+  if timeout is NULL we wait indefinitely
+
+  Synchronous wrapper around ctdb_control_send()/ctdb_control_recv().
+ */
+int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
+		 uint32_t opcode, uint32_t flags, TDB_DATA data,
+		 TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+		 struct timeval *timeout,
+		 char **errormsg)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_control_send(ctdb, destnode, srvid, opcode,
+				  flags, data, mem_ctx,
+				  timeout, errormsg);
+
+	/* FIXME: Error conditions in ctdb_control_send return NULL without
+	 * setting errormsg. So, there is no way to distinguish between success
+	 * and failure when CTDB_CTRL_FLAG_NOREPLY is set */
+	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+		if (status != NULL) {
+			*status = 0;
+		}
+		return 0;
+	}
+
+	return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status,
+				 errormsg);
+}
+
+/*
+  get vnn map from a remote node
+
+  Validates the wire size against the advertised map size before
+  unpacking.  On success *vnnmap is a talloc'd copy owned by mem_ctx.
+ */
+int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+	struct ctdb_vnn_map_wire *map;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GETVNNMAP, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getvnnmap failed\n"));
+		return -1;
+	}
+
+	/* reject truncated or inconsistently-sized replies */
+	map = (struct ctdb_vnn_map_wire *)outdata.dptr;
+	if (outdata.dsize < offsetof(struct ctdb_vnn_map_wire, map) ||
+	    outdata.dsize != map->size*sizeof(uint32_t) + offsetof(struct ctdb_vnn_map_wire, map)) {
+		DEBUG(DEBUG_ERR,("Bad vnn map size received in ctdb_ctrl_getvnnmap\n"));
+		return -1;
+	}
+
+	(*vnnmap) = talloc(mem_ctx, struct ctdb_vnn_map);
+	CTDB_NO_MEMORY(ctdb, *vnnmap);
+	(*vnnmap)->generation = map->generation;
+	(*vnnmap)->size = map->size;
+	(*vnnmap)->map = talloc_array(*vnnmap, uint32_t, map->size);
+
+	CTDB_NO_MEMORY(ctdb, (*vnnmap)->map);
+	memcpy((*vnnmap)->map, map->map, sizeof(uint32_t)*map->size);
+	talloc_free(outdata.dptr);
+
+	return 0;
+}
+
+
+/*
+  get the recovery mode of a remote node
+
+  Async send half; pair with ctdb_ctrl_getrecmode_recv().
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	return ctdb_control_send(ctdb, destnode, 0,
+				 CTDB_CONTROL_GET_RECMODE, 0, tdb_null,
+				 mem_ctx, &timeout, NULL);
+}
+
+/*
+ * Receive half of getrecmode: the recovery mode is carried in the
+ * control status field.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
+{
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getrecmode_recv failed\n"));
+		return -1;
+	}
+
+	if (recmode) {
+		*recmode = (uint32_t)res;
+	}
+
+	return 0;
+}
+
+/*
+ * Synchronous getrecmode: send the control and wait for the reply.
+ */
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
+{
+	struct ctdb_client_control_state *state =
+		ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, timeout, destnode);
+
+	return ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, state, recmode);
+}
+
+
+
+
+/*
+  set the recovery mode of a remote node
+
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmode)
+{
+	int ret;
+	TDB_DATA data;
+	int32_t res;
+
+	/* recmode travels as a raw uint32_t payload; the buffer only
+	   needs to live until ctdb_control() copies it */
+	data.dsize = sizeof(uint32_t);
+	data.dptr = (unsigned char *)&recmode;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_SET_RECMODE, 0, data,
+			   NULL, NULL, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmode failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+
+
+/*
+  get a list of nodes (vnn and flags ) from a remote node
+
+  On success *nodemap is a talloc'd copy owned by mem_ctx.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_getnodemap(struct ctdb_context *ctdb,
+			 struct timeval timeout, uint32_t destnode,
+			 TALLOC_CTX *mem_ctx, struct ctdb_node_map_old **nodemap)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_NODEMAP, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0 || outdata.dsize == 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes failed ret:%d res:%d\n", ret, res));
+		return -1;
+	}
+
+	*nodemap = (struct ctdb_node_map_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+	talloc_free(outdata.dptr);
+	if (*nodemap == NULL) {
+		/* previously an allocation failure here was reported
+		 * as success with *nodemap left NULL */
+		DEBUG(DEBUG_ERR,(__location__ " failed to copy nodemap\n"));
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Fetch the runstate of a node.  Returns 0 and sets *runstate on
+ * success; on failure returns the control error (ret or res) or -1
+ * for a malformed reply.
+ */
+int ctdb_ctrl_get_runstate(struct ctdb_context *ctdb,
+			   struct timeval timeout,
+			   uint32_t destnode,
+			   uint32_t *runstate)
+{
+	TDB_DATA outdata;
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_RUNSTATE, 0,
+			   tdb_null, ctdb, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,("ctdb_control for get_runstate failed\n"));
+		return ret != 0 ? ret : res;
+	}
+
+	/* the runstate is a single uint32_t in the reply data */
+	if (outdata.dsize != sizeof(uint32_t)) {
+		DEBUG(DEBUG_ERR,("Invalid return data in get_runstate\n"));
+		talloc_free(outdata.dptr);
+		return -1;
+	}
+
+	if (runstate != NULL) {
+		*runstate = *(uint32_t *)outdata.dptr;
+	}
+	talloc_free(outdata.dptr);
+
+	return 0;
+}
+
+/*
+  get debug level on a node
+
+  Returns 0 and sets *level on success, -1 on failure or a
+  malformed reply.
+ */
+int ctdb_ctrl_get_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t *level)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA data;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DEBUG, 0, tdb_null,
+			   ctdb, &data, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		return -1;
+	}
+	if (data.dsize != sizeof(int32_t)) {
+		DEBUG(DEBUG_ERR,("Bad control reply size in ctdb_get_debuglevel (got %u)\n",
+				 (unsigned)data.dsize));
+		/* don't leak the reply buffer on the error path */
+		talloc_free(data.dptr);
+		return -1;
+	}
+	*level = *(int32_t *)data.dptr;
+	talloc_free(data.dptr);
+	return 0;
+}
+
+/* Freeze all databases on the given node.
+   Returns 0 on success, -1 on failure. */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout,
+		     uint32_t destnode)
+{
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_FREEZE, 0, tdb_null,
+			   NULL, NULL, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		/* the old message named ctdb_ctrl_freeze_priority(),
+		   a copy-paste from a different function */
+		DEBUG(DEBUG_ERR, ("ctdb_ctrl_freeze failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  get pnn of a node, or -1
+
+  The PNN is carried in the control status field of the reply.
+ */
+int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_PNN, 0, tdb_null,
+			   NULL, NULL, &res, &timeout, NULL);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpnn failed\n"));
+		return -1;
+	}
+
+	return res;
+}
+
+/*
+ * Fetch the public IP list from a node, passing flags through to the
+ * GET_PUBLIC_IPS control.  On success *ips is a talloc'd copy owned
+ * by mem_ctx.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+				   struct timeval timeout, uint32_t destnode,
+				   TALLOC_CTX *mem_ctx,
+				   uint32_t flags,
+				   struct ctdb_public_ip_list_old **ips)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_control for getpublicips failed ret:%d res:%d\n",
+				 ret, res));
+		return -1;
+	}
+
+	/* NOTE(review): the talloc_memdup result is not checked; on
+	   OOM this returns 0 with *ips == NULL */
+	*ips = (struct ctdb_public_ip_list_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+	talloc_free(outdata.dptr);
+
+	return 0;
+}
+
+/*
+ * Convenience wrapper: fetch the public IP list with no flags set.
+ */
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
+			     struct timeval timeout, uint32_t destnode,
+			     TALLOC_CTX *mem_ctx,
+			     struct ctdb_public_ip_list_old **ips)
+{
+	return ctdb_ctrl_get_public_ips_flags(ctdb, timeout, destnode,
+					      mem_ctx, 0, ips);
+}
+
+/*
+ * Fetch the interface list from a node.  The reply size is validated
+ * in two steps (fixed header, then header plus the advertised number
+ * of entries) before the data is trusted.  Interface names are
+ * forcibly NUL-terminated.  On success *_ifaces is a talloc'd copy
+ * owned by mem_ctx.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+			 struct timeval timeout, uint32_t destnode,
+			 TALLOC_CTX *mem_ctx,
+			 struct ctdb_iface_list_old **_ifaces)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+	struct ctdb_iface_list_old *ifaces;
+	uint32_t len;
+	uint32_t i;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_IFACES, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "failed ret:%d res:%d\n",
+				 ret, res));
+		return -1;
+	}
+
+	/* step 1: must at least hold the fixed-size header */
+	len = offsetof(struct ctdb_iface_list_old, ifaces);
+	if (len > outdata.dsize) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "returned invalid data with size %u > %u\n",
+				 (unsigned int)outdata.dsize,
+				 (unsigned int)len));
+		dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+		return -1;
+	}
+
+	/* step 2: must hold all entries the header advertises */
+	ifaces = (struct ctdb_iface_list_old *)outdata.dptr;
+	len += ifaces->num*sizeof(struct ctdb_iface);
+
+	if (len > outdata.dsize) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "returned invalid data with size %u > %u\n",
+				 (unsigned int)outdata.dsize,
+				 (unsigned int)len));
+		dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+		return -1;
+	}
+
+	/* make sure we null terminate the returned strings */
+	for (i=0; i < ifaces->num; i++) {
+		ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+	}
+
+	*_ifaces = (struct ctdb_iface_list_old *)talloc_memdup(mem_ctx,
+							       outdata.dptr,
+							       outdata.dsize);
+	talloc_free(outdata.dptr);
+	if (*_ifaces == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "talloc_memdup size %u failed\n",
+				 (unsigned int)outdata.dsize));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  get all tunables
+
+  The reply must be exactly sizeof(struct ctdb_tunable_list); it is
+  copied into *tunables by value.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_all_tunables(struct ctdb_context *ctdb,
+			       struct timeval timeout,
+			       uint32_t destnode,
+			       struct ctdb_tunable_list *tunables)
+{
+	TDB_DATA outdata;
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_ALL_TUNABLES, 0, tdb_null, ctdb,
+			   &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get all tunables failed\n"));
+		return -1;
+	}
+
+	if (outdata.dsize != sizeof(*tunables)) {
+		DEBUG(DEBUG_ERR,(__location__ " bad data size %u in ctdb_ctrl_get_all_tunables should be %u\n",
+				 (unsigned)outdata.dsize, (unsigned)sizeof(*tunables)));
+		return -1;
+	}
+
+	*tunables = *(struct ctdb_tunable_list *)outdata.dptr;
+	talloc_free(outdata.dptr);
+	return 0;
+}
+
+/*
+  set some ctdb flags
+
+  ORs the given flags into ctdb->flags.
+*/
+void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags)
+{
+	ctdb->flags |= flags;
+}
+
+/* Return the path of the daemon's unix domain socket. */
+const char *ctdb_get_socketname(struct ctdb_context *ctdb)
+{
+	return ctdb->daemon.name;
+}
+
+/*
+  return the pnn of this node
+*/
+uint32_t ctdb_get_pnn(struct ctdb_context *ctdb)
+{
+	return ctdb->pnn;
+}
+
+/*
+  callback for the async helpers used when sending the same control
+  to multiple nodes in parallel.
+
+  Decrements the outstanding-response count, records failures (with
+  -ETIMEDOUT distinguished from generic failure) via the optional
+  fail_callback, and forwards successful replies to the optional
+  per-node callback.
+*/
+static void async_callback(struct ctdb_client_control_state *state)
+{
+	struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
+	struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context);
+	int ret;
+	TDB_DATA outdata;
+	int32_t res = -1;
+	uint32_t destnode = state->c->hdr.destnode;
+
+	outdata.dsize = 0;
+	outdata.dptr = NULL;
+
+	/* one more node has responded with recmode data */
+	data->count--;
+
+	/* if we failed to push the db, then return an error and let
+	   the main loop try again.
+	*/
+	if (state->state != CTDB_CONTROL_DONE) {
+		if ( !data->dont_log_errors) {
+			DEBUG(DEBUG_ERR,("Async operation failed with state %d, opcode:%u\n", state->state, data->opcode));
+		}
+		data->fail_count++;
+		if (state->state == CTDB_CONTROL_TIMEOUT) {
+			res = -ETIMEDOUT;
+		} else {
+			res = -1;
+		}
+		if (data->fail_callback) {
+			data->fail_callback(ctdb, destnode, res, outdata,
+					    data->callback_data);
+		}
+		return;
+	}
+
+	/* clear the async fn before recv so it is not re-invoked */
+	state->async.fn = NULL;
+
+	ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
+	if ((ret != 0) || (res != 0)) {
+		if ( !data->dont_log_errors) {
+			DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d opcode=%u\n", ret, (int)res, data->opcode));
+		}
+		data->fail_count++;
+		if (data->fail_callback) {
+			data->fail_callback(ctdb, destnode, res, outdata,
+					    data->callback_data);
+		}
+	}
+	if ((ret == 0) && (data->callback != NULL)) {
+		data->callback(ctdb, destnode, res, outdata,
+			       data->callback_data);
+	}
+}
+
+
+/* Register one in-flight control with the async tracking structure;
+ * async_callback() will decrement data->count when it completes */
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state)
+{
+	/* set up the callback functions */
+	state->async.fn = async_callback;
+	state->async.private_data = data;
+
+	/* one more control to wait for to complete */
+	data->count++;
+}
+
+
+/* wait for up to the maximum number of seconds allowed
+ or until all nodes we expect a response from has replied
+*/
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data)
+{
+ while (data->count > 0) {
+ tevent_loop_once(ctdb->ev);
+ }
+ if (data->fail_count != 0) {
+ if (!data->dont_log_errors) {
+ DEBUG(DEBUG_ERR,("Async wait failed - fail_count=%u\n",
+ data->fail_count));
+ }
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+  perform a simple control on the listed nodes
+  The control cannot return data
+
+  nodes is a talloc array of PNNs (its element count is recovered from
+  talloc_get_size).  Sends the same control to every node in parallel,
+  then blocks until all have completed.  Returns 0 only if every node
+  succeeded; -1 otherwise.
+ */
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+			      enum ctdb_controls opcode,
+			      uint32_t *nodes,
+			      uint64_t srvid,
+			      struct timeval timeout,
+			      bool dont_log_errors,
+			      TDB_DATA data,
+			      client_async_callback client_callback,
+			      client_async_callback fail_callback,
+			      void *callback_data)
+{
+	struct client_async_data *async_data;
+	struct ctdb_client_control_state *state;
+	int j, num_nodes;
+
+	async_data = talloc_zero(ctdb, struct client_async_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+	async_data->dont_log_errors = dont_log_errors;
+	async_data->callback = client_callback;
+	async_data->fail_callback = fail_callback;
+	async_data->callback_data = callback_data;
+	async_data->opcode = opcode;
+
+	/* element count is derived from the talloc allocation size */
+	num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
+
+	/* loop over all nodes and send an async control to each of them */
+	for (j=0; j<num_nodes; j++) {
+		uint32_t pnn = nodes[j];
+
+		/* states are allocated under async_data, so freeing
+		 * async_data below also cancels any outstanding sends */
+		state = ctdb_control_send(ctdb, pnn, srvid, opcode,
+					  0, data, async_data, &timeout, NULL);
+		if (state == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
+			talloc_free(async_data);
+			return -1;
+		}
+
+		ctdb_client_async_add(async_data, state);
+	}
+
+	if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+		talloc_free(async_data);
+		return -1;
+	}
+
+	talloc_free(async_data);
+	return 0;
+}
+
+/*
+ * Build a talloc'd array (owned by mem_ctx) of the PNNs in a VNN map,
+ * optionally leaving out this node's own PNN.
+ */
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+			       struct ctdb_vnn_map *vnn_map,
+			       TALLOC_CTX *mem_ctx,
+			       bool include_self)
+{
+	unsigned int i, count;
+	uint32_t *pnns;
+	bool skip_self = !include_self;
+
+	/* First pass: count the entries that will be kept */
+	count = 0;
+	for (i = 0; i < vnn_map->size; i++) {
+		if (skip_self && vnn_map->map[i] == ctdb->pnn) {
+			continue;
+		}
+		count++;
+	}
+
+	pnns = talloc_array(mem_ctx, uint32_t, count);
+	CTDB_NO_MEMORY_FATAL(ctdb, pnns);
+
+	/* Second pass: copy the surviving PNNs */
+	count = 0;
+	for (i = 0; i < vnn_map->size; i++) {
+		if (skip_self && vnn_map->map[i] == ctdb->pnn) {
+			continue;
+		}
+		pnns[count++] = vnn_map->map[i];
+	}
+
+	return pnns;
+}
+
+/* Get list of nodes not including those with flags specified by mask.
+ * Returns a talloc'd PNN array owned by mem_ctx. */
+static uint32_t *list_of_nodes(struct ctdb_context *ctdb,
+			       struct ctdb_node_map_old *node_map,
+			       TALLOC_CTX *mem_ctx,
+			       uint32_t mask,
+			       bool include_self)
+{
+	unsigned int i, j, num_nodes;
+	uint32_t exclude_pnn;
+	uint32_t *nodes;
+
+	/* CTDB_UNKNOWN_PNN never matches a real node, so using it as
+	 * the exclusion value effectively disables self-exclusion */
+	exclude_pnn = include_self ? CTDB_UNKNOWN_PNN : ctdb->pnn;
+
+	/* First pass: count the nodes that survive both filters */
+	for (i=num_nodes=0;i<node_map->num;i++) {
+		if (node_map->nodes[i].flags & mask) {
+			continue;
+		}
+		if (node_map->nodes[i].pnn == exclude_pnn) {
+			continue;
+		}
+		num_nodes++;
+	}
+
+	nodes = talloc_array(mem_ctx, uint32_t, num_nodes);
+	CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+	/* Second pass: fill in the PNNs */
+	for (i=j=0;i<node_map->num;i++) {
+		if (node_map->nodes[i].flags & mask) {
+			continue;
+		}
+		if (node_map->nodes[i].pnn == exclude_pnn) {
+			continue;
+		}
+		nodes[j++] = node_map->nodes[i].pnn;
+	}
+
+	return nodes;
+}
+
+/* PNNs of all nodes that are not inactive (see NODE_FLAGS_INACTIVE) */
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+			       struct ctdb_node_map_old *node_map,
+			       TALLOC_CTX *mem_ctx,
+			       bool include_self)
+{
+	return list_of_nodes(ctdb,
+			     node_map,
+			     mem_ctx,
+			     NODE_FLAGS_INACTIVE,
+			     include_self);
+}
+
+/* PNNs of all nodes that are not disconnected */
+uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
+				  struct ctdb_node_map_old *node_map,
+				  TALLOC_CTX *mem_ctx,
+				  bool include_self)
+{
+	return list_of_nodes(ctdb,
+			     node_map,
+			     mem_ctx,
+			     NODE_FLAGS_DISCONNECTED,
+			     include_self);
+}
+
+/*
+  get capabilities of a remote node (async send half; pair with
+  ctdb_ctrl_getcapabilities_recv)
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getcapabilities_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	return ctdb_control_send(ctdb, destnode, 0,
+				 CTDB_CONTROL_GET_CAPABILITIES, 0, tdb_null,
+				 mem_ctx, &timeout, NULL);
+}
+
+/* Receive half of the get-capabilities control.  Stores the 32-bit
+ * capability mask via *capabilities (if non-NULL) and returns 0, or
+ * returns -1 on failure or a malformed reply. */
+int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *capabilities)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA outdata;
+
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
+	if ( (ret != 0) || (res != 0) ) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n"));
+		return -1;
+	}
+
+	if (capabilities) {
+		/* Validate the payload before dereferencing it, in the
+		 * same way get_capabilities_callback() does */
+		if (outdata.dsize != sizeof(uint32_t) ||
+		    outdata.dptr == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv bad reply size %u\n",
+					 (unsigned)outdata.dsize));
+			return -1;
+		}
+		*capabilities = *((uint32_t *)outdata.dptr);
+	}
+
+	return 0;
+}
+
+/* Synchronous wrapper: fetch the capability mask from destnode.
+ * Returns 0 on success, -1 on any failure. */
+int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities)
+{
+	struct ctdb_client_control_state *state;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	int ret;
+
+	if (tmp_ctx == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " out of memory\n"));
+		return -1;
+	}
+
+	state = ctdb_ctrl_getcapabilities_send(ctdb, tmp_ctx, timeout, destnode);
+	if (state == NULL) {
+		/* don't hand a NULL state to the recv function */
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	ret = ctdb_ctrl_getcapabilities_recv(ctdb, tmp_ctx, state, capabilities);
+	talloc_free(tmp_ctx);
+	return ret;
+}
+
+/* Per-node success callback for ctdb_get_capabilities(): records the
+ * returned 32-bit capability mask in the caps array slot for the
+ * responding PNN.  Malformed replies and out-of-range PNNs are logged
+ * and dropped, leaving that slot's retrieved flag false. */
+static void get_capabilities_callback(struct ctdb_context *ctdb,
+				      uint32_t node_pnn, int32_t res,
+				      TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_node_capabilities *caps =
+		talloc_get_type(callback_data,
+				struct ctdb_node_capabilities);
+
+	if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
+		DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
+		return;
+	}
+
+	/* caps is indexed by PNN; bounds-check against its allocation */
+	if (node_pnn >= talloc_array_length(caps)) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " unexpected PNN %u\n", node_pnn));
+		return;
+	}
+
+	caps[node_pnn].retrieved = true;
+	caps[node_pnn].capabilities = *((uint32_t *)outdata.dptr);
+}
+
+/* Query the capabilities of every active node in parallel.
+ *
+ * Returns a talloc array (owned by mem_ctx) indexed by PNN; slots for
+ * nodes that did not answer keep retrieved == false.  Returns NULL if
+ * the parallel control failed overall.  The temporary nodes list is
+ * also allocated under mem_ctx and left for the caller's context to
+ * clean up. */
+struct ctdb_node_capabilities *
+ctdb_get_capabilities(struct ctdb_context *ctdb,
+		      TALLOC_CTX *mem_ctx,
+		      struct timeval timeout,
+		      struct ctdb_node_map_old *nodemap)
+{
+	uint32_t *nodes;
+	uint32_t i, res;
+	struct ctdb_node_capabilities *ret;
+
+	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+
+	ret = talloc_array(mem_ctx, struct ctdb_node_capabilities,
+			   nodemap->num);
+	CTDB_NO_MEMORY_NULL(ctdb, ret);
+	/* Prepopulate the expected PNNs */
+	for (i = 0; i < talloc_array_length(ret); i++) {
+		ret[i].retrieved = false;
+	}
+
+	/* res holds the int return of the async control; -1 from it is
+	 * still non-zero here, so the error check below works */
+	res = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
+					nodes, 0, timeout,
+					false, tdb_null,
+					get_capabilities_callback, NULL,
+					ret);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to read node capabilities.\n"));
+		TALLOC_FREE(ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Look up the capability mask recorded for a PNN.  Returns a pointer
+ * into the caps array, or NULL if the PNN is out of range or no reply
+ * was received from that node.
+ */
+uint32_t *
+ctdb_get_node_capabilities(struct ctdb_node_capabilities *caps,
+			   uint32_t pnn)
+{
+	if (pnn >= talloc_array_length(caps)) {
+		return NULL;
+	}
+	if (!caps[pnn].retrieved) {
+		return NULL;
+	}
+
+	return &caps[pnn].capabilities;
+}
+
+/*
+ * True iff the node's recorded capability mask contains every bit in
+ * capabilities_required.  False when no mask was retrieved for pnn.
+ */
+bool ctdb_node_has_capabilities(struct ctdb_node_capabilities *caps,
+				uint32_t pnn,
+				uint32_t capabilities_required)
+{
+	uint32_t *capp = ctdb_get_node_capabilities(caps, pnn);
+
+	if (capp == NULL) {
+		return false;
+	}
+
+	return (*capp & capabilities_required) == capabilities_required;
+}
+
+/*
+  recovery daemon ping to main daemon (keepalive sent to the local
+  node; returns 0 on success, -1 on failure)
+ */
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
+{
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null,
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send recd ping\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  tell the main daemon how long it took to lock the reclock file.
+  The latency is shipped as a raw double in the control payload.
+  NOTE(review): the timeout parameter is unused - the control is sent
+  with a NULL timeout; confirm whether that is intentional.
+ */
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA data;
+
+	data.dptr = (uint8_t *)&latency;
+	data.dsize = sizeof(latency);
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data,
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Push a ban state (raw struct ctdb_ban_state payload) to destnode.
+ * Returns 0 on success, -1 on failure. */
+int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout,
+		      uint32_t destnode, struct ctdb_ban_state *bantime)
+{
+	int ret;
+	TDB_DATA data;
+	int32_t res;
+
+	data.dsize = sizeof(*bantime);
+	data.dptr = (uint8_t *)bantime;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_SET_BAN_STATE, 0, data,
+			   NULL, NULL, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Async send half of CTDB_CONTROL_UPDATE_RECORD: marshall a single
+ * record (key, header, data) for ctdb_db and send it to destnode.
+ * Returns the control state for the recv half, or NULL on error. */
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_client_control_state *handle;
+	struct ctdb_marshall_buffer *m;
+	struct ctdb_marshall_buffer *m2;
+	struct ctdb_rec_data_old *rec;
+	TDB_DATA outdata;
+
+	m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer);
+	if (m == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n"));
+		return NULL;
+	}
+
+	m->db_id = ctdb_db->db_id;
+
+	rec = ctdb_marshall_record(m, 0, key, header, data);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n"));
+		talloc_free(m);
+		return NULL;
+	}
+	/* Grow the buffer via a temporary pointer: the original code
+	 * overwrote m with the realloc result, so on failure it called
+	 * talloc_free(NULL) and orphaned the old buffer */
+	m2 = talloc_realloc_size(mem_ctx, m,
+				 rec->length + offsetof(struct ctdb_marshall_buffer, data));
+	if (m2 == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n"));
+		talloc_free(m);
+		return NULL;
+	}
+	m = m2;
+	m->count++;
+	/* rec is a talloc child of m, so its memory is still valid
+	 * after the realloc above */
+	memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length);
+
+
+	outdata.dptr = (uint8_t *)m;
+	outdata.dsize = talloc_get_size(m);
+
+	handle = ctdb_control_send(ctdb, destnode, 0,
+				   CTDB_CONTROL_UPDATE_RECORD, 0, outdata,
+				   mem_ctx, &timeout, NULL);
+	talloc_free(m);
+	return handle;
+}
+
+/* Receive half of the update-record control.  The control returns no
+ * payload; 0 means the record was accepted, -1 means failure. */
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control_recv(ctdb, state, state, NULL, &res, NULL);
+	if ( (ret != 0) || (res != 0) ){
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Synchronous wrapper: marshall and push one record to destnode.
+ * Returns 0 on success, -1 on failure. */
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout, destnode, ctdb_db, key, header, data);
+	if (state == NULL) {
+		/* send already logged the error; don't pass NULL on */
+		return -1;
+	}
+	return ctdb_ctrl_updaterecord_recv(ctdb, state);
+}
diff --git a/ctdb/server/ctdb_cluster_mutex.c b/ctdb/server/ctdb_cluster_mutex.c
new file mode 100644
index 0000000..2fbe301
--- /dev/null
+++ b/ctdb/server/ctdb_cluster_mutex.c
@@ -0,0 +1,382 @@
+/*
+ CTDB cluster mutex handling
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Martin Schwenke 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/strv.h"
+#include "lib/util/strv_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/blocking.h"
+
+#include "ctdb_private.h"
+
+#include "ctdb_cluster_mutex.h"
+
+/* State for one mutex-helper child process.  Freeing the handle kills
+ * the child (see cluster_mutex_destructor) and so releases the lock. */
+struct ctdb_cluster_mutex_handle {
+	struct ctdb_context *ctdb;
+	cluster_mutex_handler_t handler;	/* gets the first status byte */
+	void *private_data;
+	cluster_mutex_lost_handler_t lost_handler;	/* helper died later */
+	void *lost_data;
+	int fd[2];		/* pipe: [0] parent read end, [1] child stdout */
+	struct tevent_timer *te;	/* timeout timer, NULL when no timeout */
+	struct tevent_fd *fde;
+	pid_t child;		/* helper pid */
+	struct timeval start_time;	/* for latency reporting */
+	bool have_response;	/* first status byte already consumed */
+};
+
+/* Fired when the helper has not produced a status byte within the
+ * configured timeout: report status '2' (timeout) to the handler */
+static void cluster_mutex_timeout(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *private_data)
+{
+	struct ctdb_cluster_mutex_handle *h =
+		talloc_get_type(private_data, struct ctdb_cluster_mutex_handle);
+	double latency = timeval_elapsed(&h->start_time);
+
+	if (h->handler != NULL) {
+		h->handler('2', latency, h->private_data);
+	}
+}
+
+
+/* When the handle is freed it causes any child holding the mutex to
+ * be killed, thus freeing the mutex */
+static int cluster_mutex_destructor(struct ctdb_cluster_mutex_handle *h)
+{
+	/* NOTE(review): fd[0] is only invalidated here, not closed -
+	 * presumably the actual close happens via
+	 * tevent_fd_set_auto_close() on h->fde, which is a talloc child
+	 * of this handle; confirm */
+	if (h->fd[0] != -1) {
+		h->fd[0] = -1;
+	}
+	ctdb_kill(h->ctdb, h->child, SIGTERM);
+	return 0;
+}
+
+/* Called when the helper child writes to (or closes) its end of the
+   pipe.  The first byte received is the lock status and is passed to
+   h->handler exactly once; any later EOF means the helper exited and
+   is reported via h->lost_handler.
+*/
+static void cluster_mutex_handler(struct tevent_context *ev,
+				  struct tevent_fd *fde,
+				  uint16_t flags, void *private_data)
+{
+	struct ctdb_cluster_mutex_handle *h=
+		talloc_get_type(private_data, struct ctdb_cluster_mutex_handle);
+	double latency = timeval_elapsed(&h->start_time);
+	char c = '0';
+	int ret;
+
+	/* Got response from child process so abort timeout */
+	TALLOC_FREE(h->te);
+
+	/* ret == 1: got a status byte; ret == 0: EOF (helper exited) */
+	ret = sys_read(h->fd[0], &c, 1);
+
+	/* Don't call the handler more than once.  It only exists to
+	 * process the initial response from the helper. */
+	if (h->have_response) {
+		/* Only deal with EOF due to process exit.  Silently
+		 * ignore any other output. */
+		if (ret == 0) {
+			if (h->lost_handler != NULL) {
+				h->lost_handler(h->lost_data);
+			}
+		}
+		return;
+	}
+	h->have_response = true;
+
+	/* If the child wrote status then just pass it to the handler.
+	 * If no status was written then this is an unexpected error
+	 * so pass generic error code '3' to handler. */
+	if (h->handler != NULL) {
+		h->handler(ret == 1 ? c : '3', latency, h->private_data);
+	}
+}
+
+/* Cached absolute path of the mutex helper binary; resolved once on
+ * first use by cluster_mutex_helper_args_file() */
+static char cluster_mutex_helper[PATH_MAX+1] = "";
+
+/* Build argv for the default (fcntl-based) mutex helper:
+ * { helper-path, argstring, NULL }.  The helper path is taken from
+ * $CTDB_CLUSTER_MUTEX_HELPER or defaults to
+ * CTDB_HELPER_BINDIR/ctdb_mutex_fcntl_helper, and is cached in the
+ * static cluster_mutex_helper buffer after the first call.
+ *
+ * NOTE(review): misconfiguration (path too long, missing or
+ * non-executable helper) calls exit(1) rather than returning false -
+ * this kills the whole daemon; presumably deliberate fail-fast
+ * behaviour at startup, confirm. */
+static bool cluster_mutex_helper_args_file(TALLOC_CTX *mem_ctx,
+					   const char *argstring,
+					   char ***argv)
+{
+	struct stat st;
+	size_t size = sizeof(cluster_mutex_helper);
+	const char *t;
+	char **args = NULL;
+	int ret;
+
+	/* Path already resolved on a previous call */
+	if (cluster_mutex_helper[0] != '\0') {
+		goto helper_done;
+	}
+
+	t = getenv("CTDB_CLUSTER_MUTEX_HELPER");
+	if (t != NULL) {
+		size_t len;
+
+		len = strlcpy(cluster_mutex_helper, t, size);
+		if (len >= size) {
+			DBG_ERR("error: CTDB_CLUSTER_MUTEX_HELPER too long\n");
+			exit(1);
+		}
+	} else {
+		ret = snprintf(cluster_mutex_helper,
+			       size,
+			       "%s/%s",
+			       CTDB_HELPER_BINDIR,
+			       "ctdb_mutex_fcntl_helper");
+		if (ret < 0 || (size_t)ret >= size) {
+			D_ERR("Unable to set cluster mutex helper - "
+			      "path too long\n");
+			exit(1);
+		}
+	}
+
+	ret = stat(cluster_mutex_helper, &st);
+	if (ret != 0) {
+		D_ERR("Unable to set cluster mutex helper \"%s\" - %s\n",
+		      cluster_mutex_helper,
+		      strerror(errno));
+		exit(1);
+	}
+
+	if ((st.st_mode & S_IXUSR) == 0) {
+		D_ERR("Unable to set cluster_mutex helper \"%s\" - "
+		      "not executable\n",
+		      cluster_mutex_helper);
+		exit(1);
+	}
+
+	D_NOTICE("Set cluster mutex helper to \"%s\"\n", cluster_mutex_helper);
+
+helper_done:
+
+	/* Array includes default helper, file and NULL */
+	args = talloc_array(mem_ctx, char *, 3);
+	if (args == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	/* argv[0] points at the static buffer, not talloc memory */
+	args[0] = cluster_mutex_helper;
+
+	args[1] = talloc_strdup(args, argstring);
+	if (args[1] == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	args[2] = NULL;
+
+	*argv = args;
+	return true;
+}
+
+/* Build argv from a user-supplied command line: argstring is split on
+ * spaces/tabs into an argv vector (no quoting support).  The strv
+ * buffer backing the strings is reparented to the returned array. */
+static bool cluster_mutex_helper_args_cmd(TALLOC_CTX *mem_ctx,
+					  const char *argstring,
+					  char ***argv)
+{
+	int i, ret, n;
+	char **args = NULL;
+	char *strv = NULL;
+	char *t = NULL;
+
+	ret = strv_split(mem_ctx, &strv, argstring, " \t");
+	if (ret != 0) {
+		D_ERR("Unable to parse mutex helper command \"%s\" (%s)\n",
+		      argstring,
+		      strerror(ret));
+		return false;
+	}
+	n = strv_count(strv);
+	if (n == 0) {
+		D_ERR("Mutex helper command is empty \"%s\"\n", argstring);
+		return false;
+	}
+
+	/* Extra slot for NULL */
+	args = talloc_array(mem_ctx, char *, n + 1);
+	if (args == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	/* keep the split strings alive as long as the argv array */
+	talloc_steal(args, strv);
+
+	t = NULL;
+	for (i = 0 ; i < n; i++) {
+		t = strv_next(strv, t);
+		args[i] = t;
+	}
+
+	args[n] = NULL;
+
+	*argv = args;
+	return true;
+}
+
+/* Build the helper argv.  A leading '!' in argstring means "run the
+ * rest as a command line"; otherwise argstring is treated as an
+ * argument for the default file-based helper. */
+static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx,
+				      const char *argstring,
+				      char ***argv)
+{
+	if (argstring != NULL && argstring[0] == '!') {
+		return cluster_mutex_helper_args_cmd(mem_ctx,
+						     &argstring[1],
+						     argv);
+	}
+
+	return cluster_mutex_helper_args_file(mem_ctx, argstring, argv);
+}
+
+/* Start an attempt to take the cluster mutex.
+ *
+ * Forks a helper process (chosen via argstring, see
+ * cluster_mutex_helper_args) whose stdout is connected to a pipe back
+ * to the daemon.  The helper's first status byte is delivered to
+ * handler; if the helper later exits, lost_handler is called.  If
+ * timeout is non-zero and no status arrives in time, handler gets
+ * status '2'.  Freeing the returned handle kills the helper and so
+ * releases the mutex.  Returns NULL on any setup failure. */
+struct ctdb_cluster_mutex_handle *
+ctdb_cluster_mutex(TALLOC_CTX *mem_ctx,
+		   struct ctdb_context *ctdb,
+		   const char *argstring,
+		   int timeout,
+		   cluster_mutex_handler_t handler,
+		   void *private_data,
+		   cluster_mutex_lost_handler_t lost_handler,
+		   void *lost_data)
+{
+	struct ctdb_cluster_mutex_handle *h;
+	char **args;
+	sigset_t sigset_term;
+	int ret;
+
+	h = talloc(mem_ctx, struct ctdb_cluster_mutex_handle);
+	if (h == NULL) {
+		DBG_ERR("out of memory\n");
+		return NULL;
+	}
+
+	h->start_time = timeval_current();
+	h->fd[0] = -1;
+	h->fd[1] = -1;
+	h->have_response = false;
+
+	ret = pipe(h->fd);
+	if (ret != 0) {
+		talloc_free(h);
+		DBG_ERR("Failed to open pipe\n");
+		return NULL;
+	}
+	set_close_on_exec(h->fd[0]);
+
+	/* Create arguments for lock helper */
+	if (!cluster_mutex_helper_args(h, argstring, &args)) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		return NULL;
+	}
+
+	/* Block SIGTERM across the fork so the child can restore the
+	 * default disposition before it can receive one */
+	sigemptyset(&sigset_term);
+	sigaddset(&sigset_term, SIGTERM);
+	ret = sigprocmask(SIG_BLOCK, &sigset_term, NULL);
+	if (ret != 0) {
+		DBG_WARNING("Failed to block SIGTERM (%d)\n", errno);
+	}
+
+	h->child = ctdb_fork(ctdb);
+	if (h->child == (pid_t)-1) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+		}
+		return NULL;
+	}
+
+	if (h->child == 0) {
+		/* Child: reset SIGTERM to default, unblock it, wire
+		 * stdout to the pipe and exec the helper */
+		struct sigaction sa = {
+			.sa_handler = SIG_DFL,
+		};
+
+		ret = sigaction(SIGTERM, &sa, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to reset signal handler (%d)\n",
+				    errno);
+		}
+
+		ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+		}
+
+		/* Make stdout point to the pipe */
+		close(STDOUT_FILENO);
+		dup2(h->fd[1], STDOUT_FILENO);
+		close(h->fd[1]);
+
+		execv(args[0], args);
+
+		/* Only happens on error */
+		DBG_ERR("execv() failed\n");
+		_exit(1);
+	}
+
+	/* Parent */
+
+	ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+	if (ret != 0) {
+		DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+	}
+
+	DBG_DEBUG("Created PIPE FD:%d\n", h->fd[0]);
+	set_close_on_exec(h->fd[0]);
+
+	close(h->fd[1]);
+	h->fd[1] = -1;
+
+	/* From here on, freeing h kills the helper child */
+	talloc_set_destructor(h, cluster_mutex_destructor);
+
+	if (timeout != 0) {
+		h->te = tevent_add_timer(ctdb->ev, h,
+					 timeval_current_ofs(timeout, 0),
+					 cluster_mutex_timeout, h);
+	} else {
+		h->te = NULL;
+	}
+
+	h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+			       cluster_mutex_handler, (void *)h);
+
+	if (h->fde == NULL) {
+		talloc_free(h);
+		return NULL;
+	}
+	/* the fde owns closing fd[0] (see destructor) */
+	tevent_fd_set_auto_close(h->fde);
+
+	h->ctdb = ctdb;
+	h->handler = handler;
+	h->private_data = private_data;
+	h->lost_handler = lost_handler;
+	h->lost_data = lost_data;
+
+	return h;
+}
diff --git a/ctdb/server/ctdb_cluster_mutex.h b/ctdb/server/ctdb_cluster_mutex.h
new file mode 100644
index 0000000..4587290
--- /dev/null
+++ b/ctdb/server/ctdb_cluster_mutex.h
@@ -0,0 +1,51 @@
+/*
+ CTDB cluster mutex handling
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Martin Schwenke 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_CLUSTER_MUTEX_H__
+#define __CTDB_CLUSTER_MUTEX_H__
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+struct ctdb_cluster_mutex_handle;
+
+/* Called once with the helper's first status byte and the elapsed time
+ * in seconds.  '2' is reported on timeout and '3' when the helper
+ * produced no status byte; other values come from the helper itself. */
+typedef void (*cluster_mutex_handler_t) (
+	char status,
+	double latency,
+	void *private_data);
+
+/* Called if the helper process exits after the initial status byte was
+ * delivered, i.e. the mutex has been lost */
+typedef void (*cluster_mutex_lost_handler_t) (void *private_data);
+
+struct ctdb_cluster_mutex_handle *
+ctdb_cluster_mutex(TALLOC_CTX *mem_ctx,
+		   struct ctdb_context *ctdb,
+		   const char *argstring,
+		   int timeout,
+		   cluster_mutex_handler_t handler,
+		   void *private_data,
+		   cluster_mutex_lost_handler_t lost_handler,
+		   void *lost_data);
+
+#endif /* __CTDB_CLUSTER_MUTEX_H__ */
diff --git a/ctdb/server/ctdb_config.c b/ctdb/server/ctdb_config.c
new file mode 100644
index 0000000..3f61fda
--- /dev/null
+++ b/ctdb/server/ctdb_config.c
@@ -0,0 +1,183 @@
+/*
+ CTDB daemon config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "lib/util/debug.h"
+
+#include "common/conf.h"
+#include "common/logging_conf.h"
+#include "common/path.h"
+
+#include "cluster/cluster_conf.h"
+#include "database/database_conf.h"
+#include "event/event_conf.h"
+#include "failover/failover_conf.h"
+#include "legacy_conf.h"
+
+#include "ctdb_config.h"
+
+/* Global parsed configuration, populated by ctdbd_config_load() */
+struct ctdb_config ctdb_config;
+
+/* Bind each [section]option in the config system to the corresponding
+ * field of the global ctdb_config, so conf_load() fills it directly */
+static void setup_config_pointers(struct conf_context *conf)
+{
+	/*
+	 * Cluster
+	 */
+
+	conf_assign_string_pointer(conf,
+				   CLUSTER_CONF_SECTION,
+				   CLUSTER_CONF_TRANSPORT,
+				   &ctdb_config.transport);
+	conf_assign_string_pointer(conf,
+				   CLUSTER_CONF_SECTION,
+				   CLUSTER_CONF_NODE_ADDRESS,
+				   &ctdb_config.node_address);
+	conf_assign_string_pointer(conf,
+				   CLUSTER_CONF_SECTION,
+				   CLUSTER_CONF_CLUSTER_LOCK,
+				   &ctdb_config.cluster_lock);
+	conf_assign_string_pointer(conf,
+				   CLUSTER_CONF_SECTION,
+				   CLUSTER_CONF_RECOVERY_LOCK,
+				   &ctdb_config.recovery_lock);
+	conf_assign_integer_pointer(conf,
+				    CLUSTER_CONF_SECTION,
+				    CLUSTER_CONF_LEADER_TIMEOUT,
+				    &ctdb_config.leader_timeout);
+	conf_assign_boolean_pointer(conf,
+				    CLUSTER_CONF_SECTION,
+				    CLUSTER_CONF_LEADER_CAPABILITY,
+				    &ctdb_config.leader_capability);
+
+	/*
+	 * Database
+	 */
+
+	conf_assign_string_pointer(conf,
+				   DATABASE_CONF_SECTION,
+				   DATABASE_CONF_VOLATILE_DB_DIR,
+				   &ctdb_config.dbdir_volatile);
+	conf_assign_string_pointer(conf,
+				   DATABASE_CONF_SECTION,
+				   DATABASE_CONF_PERSISTENT_DB_DIR,
+				   &ctdb_config.dbdir_persistent);
+	conf_assign_string_pointer(conf,
+				   DATABASE_CONF_SECTION,
+				   DATABASE_CONF_STATE_DB_DIR,
+				   &ctdb_config.dbdir_state);
+	conf_assign_string_pointer(conf,
+				   DATABASE_CONF_SECTION,
+				   DATABASE_CONF_LOCK_DEBUG_SCRIPT,
+				   &ctdb_config.lock_debug_script);
+	conf_assign_boolean_pointer(conf,
+				    DATABASE_CONF_SECTION,
+				    DATABASE_CONF_TDB_MUTEXES,
+				    &ctdb_config.tdb_mutexes);
+
+	/*
+	 * Event
+	 */
+	conf_assign_string_pointer(conf,
+				   EVENT_CONF_SECTION,
+				   EVENT_CONF_DEBUG_SCRIPT,
+				   &ctdb_config.event_debug_script);
+
+	/*
+	 * Failover
+	 */
+	conf_assign_boolean_pointer(conf,
+				    FAILOVER_CONF_SECTION,
+				    FAILOVER_CONF_DISABLED,
+				    &ctdb_config.failover_disabled);
+
+	/*
+	 * Legacy
+	 */
+
+	conf_assign_boolean_pointer(conf,
+				    LEGACY_CONF_SECTION,
+				    LEGACY_CONF_REALTIME_SCHEDULING,
+				    &ctdb_config.realtime_scheduling);
+	conf_assign_boolean_pointer(conf,
+				    LEGACY_CONF_SECTION,
+				    LEGACY_CONF_LMASTER_CAPABILITY,
+				    &ctdb_config.lmaster_capability);
+	conf_assign_boolean_pointer(conf,
+				    LEGACY_CONF_SECTION,
+				    LEGACY_CONF_START_AS_STOPPED,
+				    &ctdb_config.start_as_stopped);
+	conf_assign_boolean_pointer(conf,
+				    LEGACY_CONF_SECTION,
+				    LEGACY_CONF_START_AS_DISABLED,
+				    &ctdb_config.start_as_disabled);
+	conf_assign_string_pointer(conf,
+				   LEGACY_CONF_SECTION,
+				   LEGACY_CONF_SCRIPT_LOG_LEVEL,
+				   &ctdb_config.script_log_level);
+}
+
+/* Initialise the config system, register all sections, bind the global
+ * ctdb_config fields and load the configuration file.
+ *
+ * A missing config file is not an error (defaults apply).  On success
+ * returns 0 and hands ownership of the conf context to the caller via
+ * *result; on failure returns an errno value and frees everything. */
+int ctdbd_config_load(TALLOC_CTX *mem_ctx,
+		      struct conf_context **result)
+{
+	struct conf_context *conf = NULL;
+	int ret = 0;
+	char *conf_file = NULL;
+
+	ret = conf_init(mem_ctx, &conf);
+	if (ret != 0) {
+		return ret;
+	}
+
+	logging_conf_init(conf, NULL);
+	cluster_conf_init(conf);
+	database_conf_init(conf);
+	event_conf_init(conf);
+	failover_conf_init(conf);
+	legacy_conf_init(conf);
+
+	setup_config_pointers(conf);
+
+	if (! conf_valid(conf)) {
+		ret = EINVAL;
+		goto fail;
+	}
+
+	conf_file = path_config(conf);
+	if (conf_file == NULL) {
+		D_ERR("Memory allocation error\n");
+		ret = ENOMEM;
+		goto fail;
+	}
+	ret = conf_load(conf, conf_file, true);
+	/* Configuration file does not need to exist */
+	if (ret != 0 && ret != ENOENT) {
+		D_ERR("Failed to load configuration file %s\n", conf_file);
+		goto fail;
+	}
+
+	talloc_free(conf_file);
+	*result = conf;
+
+	return 0;
+
+fail:
+	talloc_free(conf);
+	return ret;
+}
diff --git a/ctdb/server/ctdb_config.h b/ctdb/server/ctdb_config.h
new file mode 100644
index 0000000..7ccda7d
--- /dev/null
+++ b/ctdb/server/ctdb_config.h
@@ -0,0 +1,59 @@
+/*
+ CTDB daemon config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_CONFIG_H__
+#define __CTDB_CONFIG_H__
+
+#include "common/conf.h"
+
+/* All values parsed from ctdb.conf; see ctdbd_config_load().  String
+ * fields are owned by the conf context, not by this struct. */
+struct ctdb_config {
+	/* Cluster */
+	const char *transport;
+	const char *node_address;
+	const char *cluster_lock;
+	const char *recovery_lock;
+	int leader_timeout;
+	bool leader_capability;
+
+	/* Database */
+	const char *dbdir_volatile;
+	const char *dbdir_persistent;
+	const char *dbdir_state;
+	const char *lock_debug_script;
+	bool tdb_mutexes;
+
+	/* Event */
+	const char *event_debug_script;
+
+	/* Failover */
+	bool failover_disabled;
+
+	/* Legacy */
+	bool realtime_scheduling;
+	bool lmaster_capability;
+	bool start_as_stopped;
+	bool start_as_disabled;
+	const char *script_log_level;
+};
+
+extern struct ctdb_config ctdb_config;
+
+int ctdbd_config_load(TALLOC_CTX *mem_ctx, struct conf_context **conf);
+
+#endif /* __CTDB_CONFIG_H__ */
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
new file mode 100644
index 0000000..422c4cf
--- /dev/null
+++ b/ctdb/server/ctdb_control.c
@@ -0,0 +1,1097 @@
+/*
+ ctdb_control protocol code
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/talloc_report.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_private.h"
+
+#include "common/reqid.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+
/*
 * Tracks one outstanding control sent by ctdb_daemon_send_control().
 * Looked up by reqid when the CTDB_REPLY_CONTROL packet arrives; its
 * destructor removes the reqid mapping.
 */
struct ctdb_control_state {
	struct ctdb_context *ctdb;
	uint32_t reqid;			/* key into ctdb->idr */
	ctdb_control_callback_fn_t callback;	/* invoked on reply or timeout */
	void *private_data;		/* passed through to callback; may own this state */
	unsigned flags;			/* CTDB_CTRL_FLAG_* from the request */
};
+
+
+/*
+ dump talloc memory hierarchy, returning it as a blob to the client
+ */
+int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+ char *report;
+ size_t reportlen;
+
+ report = talloc_report_str(outdata, NULL);
+ if (report == NULL) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " talloc_report_str failed\n"));
+ return -1;
+ }
+ reportlen = talloc_get_size(report);
+
+ if (reportlen > 0) {
+ reportlen -= 1; /* strip trailing zero */
+ }
+
+ outdata->dptr = (uint8_t *)report;
+ outdata->dsize = reportlen;
+ return 0;
+}
+
+static int32_t control_not_implemented(const char *unsupported,
+ const char *alternate)
+{
+ if (alternate == NULL) {
+ DEBUG(DEBUG_ERR,
+ ("Control %s is not implemented any more\n",
+ unsupported));
+ } else {
+ DEBUG(DEBUG_ERR,
+ ("Control %s is not implemented any more, use %s instead\n",
+ unsupported, alternate));
+ }
+ return -1;
+}
+
/*
 * State for an in-flight ECHO_DATA control.  Owns the original request
 * packet (c) so the reply can be sent from a timer callback after the
 * requested delay.
 */
struct ctdb_echo_data_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control_old *c;	/* request we will reply to; stolen via talloc_move */
	struct ctdb_echo_data *data;	/* pulled copy of the client's payload */
};

static void ctdb_echo_data_timeout(
	struct tevent_context *ev,
	struct tevent_timer *te,
	struct timeval now,
	void *private_data);
+
/*
 * CTDB_CONTROL_ECHO_DATA handler: echo the client's payload back to it.
 *
 * The payload is unmarshalled into state->data and the reply is sent
 * asynchronously from ctdb_echo_data_timeout().  'delay' is currently
 * always 0, so the timer merely defers the reply to the next event-loop
 * iteration.  On success *async_reply is set and ownership of the
 * request packet 'c' moves to 'state', so the caller must not reply or
 * free it.  Returns 0 on success, -1 on unmarshalling/allocation error.
 */
static int32_t ctdb_control_echo_data(
	struct ctdb_context *ctdb,
	struct ctdb_req_control_old *c,
	TDB_DATA indata,
	bool *async_reply)
{
	struct ctdb_echo_data_state *state = NULL;
	struct tevent_timer *te = NULL;
	uint32_t delay = 0;
	size_t np = 0;
	int ret;

	/* CTDB_NO_MEMORY returns -1 from this function on allocation failure */
	state = talloc_zero(ctdb, struct ctdb_echo_data_state);
	CTDB_NO_MEMORY(ctdb, state);
	state->ctdb = ctdb;

	ret = ctdb_echo_data_pull(
		indata.dptr, indata.dsize, state, &state->data, &np);
	if (ret != 0) {
		DBG_DEBUG("ctdb_echo_data_pull failed: %s\n",
			  strerror(ret));
		TALLOC_FREE(state);
		return -1;
	}

	/* timer is a child of state, so freeing state cancels the reply */
	te = tevent_add_timer(
		ctdb->ev,
		state,
		timeval_current_ofs_msec(delay),
		ctdb_echo_data_timeout,
		state);
	if (te == NULL) {
		DBG_DEBUG("tevent_add_timer failed\n");
		TALLOC_FREE(state);
		return -1;
	}

	/* take ownership of the request so it survives until the timer fires */
	state->c = talloc_move(state, &c);
	*async_reply = true;

	return 0;
}
+
/*
 * Timer callback for ECHO_DATA: marshal the saved payload and send the
 * deferred reply.  On marshalling-buffer allocation failure the reply
 * is silently dropped (the client's control will time out).  The state
 * (and with it the request packet) is freed in all paths.
 */
static void ctdb_echo_data_timeout(
	struct tevent_context *ev,
	struct tevent_timer *te,
	struct timeval now,
	void *private_data)
{
	struct ctdb_echo_data_state *state = talloc_get_type_abort(
		private_data, struct ctdb_echo_data_state);
	size_t len = ctdb_echo_data_len(state->data);
	uint8_t *buf = NULL;
	size_t np;
	TDB_DATA data;

	DBG_DEBUG("reqid=%"PRIu32" len=%zu\n", state->c->hdr.reqid, len);

	buf = talloc_array(state, uint8_t, len);
	if (buf == NULL) {
		DBG_WARNING("talloc_array(%zu) failed\n", len);
		goto done;
	}
	/* push into a buffer sized by ctdb_echo_data_len(), so it cannot fail */
	ctdb_echo_data_push(state->data, buf, &np);
	data = (TDB_DATA) { .dptr = buf, .dsize = np };

	ctdb_request_control_reply(state->ctdb, state->c, &data, 0, NULL);

done:
	TALLOC_FREE(state);
}
+
+static int ctdb_control_disable_node(struct ctdb_context *ctdb)
+{
+ struct ctdb_node *node;
+
+ node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+ if (node == NULL) {
+ /* Can't happen */
+ DBG_ERR("Unable to find current node\n");
+ return -1;
+ }
+
+ D_ERR("Disable node\n");
+ node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
+
+ return 0;
+}
+
+static int ctdb_control_enable_node(struct ctdb_context *ctdb)
+{
+ struct ctdb_node *node;
+
+ node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+ if (node == NULL) {
+ /* Can't happen */
+ DBG_ERR("Unable to find current node\n");
+ return -1;
+ }
+
+ D_ERR("Enable node\n");
+ node->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED;
+
+ return 0;
+}
+
/*
 * Process a single control request: dispatch on the opcode to the
 * appropriate handler.
 *
 * Return value is the control status sent back to the requester
 * (negative on error; some controls return positive data such as a
 * pid or pnn in the status).  Handlers may place reply payload in
 * *outdata and an error string in *errormsg.  A handler that sets
 * *async_reply = true takes ownership of 'c' and must eventually call
 * ctdb_request_control_reply() itself; the caller then sends nothing.
 *
 * CHECK_CONTROL_DATA_SIZE()/CHECK_CONTROL_MIN_DATA_SIZE() validate
 * indata.dsize and return from this function on mismatch.
 */
static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
				     struct ctdb_req_control_old *c,
				     TDB_DATA indata,
				     TDB_DATA *outdata, uint32_t srcnode,
				     const char **errormsg,
				     bool *async_reply)
{
	uint32_t opcode = c->opcode;
	uint64_t srvid = c->srvid;
	uint32_t client_id = c->client_id;
	/* static: GET_DEBUG returns a pointer to this in *outdata, so it
	 * must outlive this call */
	static int level = DEBUG_ERR;

	switch (opcode) {
	case CTDB_CONTROL_PROCESS_EXISTS: {
		CHECK_CONTROL_DATA_SIZE(sizeof(pid_t));
		return ctdb_control_process_exists(ctdb, *(pid_t *)indata.dptr);
	}

	case CTDB_CONTROL_SET_DEBUG: {
		/* union avoids a strict-aliasing violation when reading
		 * an int32_t out of the packet buffer */
		union {
			uint8_t *ptr;
			int32_t *level;
		} debug;
		CHECK_CONTROL_DATA_SIZE(sizeof(int32_t));
		debug.ptr = indata.dptr;
		debuglevel_set(*debug.level);
		return 0;
	}

	case CTDB_CONTROL_GET_DEBUG: {
		CHECK_CONTROL_DATA_SIZE(0);
		level = debuglevel_get();
		outdata->dptr = (uint8_t *)&(level);
		outdata->dsize = sizeof(DEBUGLEVEL);
		return 0;
	}

	case CTDB_CONTROL_STATISTICS: {
		/* refresh the volatile fields before handing out a
		 * pointer to the live statistics structure */
		CHECK_CONTROL_DATA_SIZE(0);
		ctdb->statistics.memory_used = talloc_total_size(NULL);
		ctdb->statistics.num_clients = ctdb->num_clients;
		ctdb->statistics.frozen = (ctdb_db_all_frozen(ctdb) ? 1 : 0);
		ctdb->statistics.recovering = (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE);
		ctdb->statistics.statistics_current_time = timeval_current();

		outdata->dptr = (uint8_t *)&ctdb->statistics;
		outdata->dsize = sizeof(ctdb->statistics);
		return 0;
	}

	case CTDB_CONTROL_GET_ALL_TUNABLES: {
		CHECK_CONTROL_DATA_SIZE(0);
		outdata->dptr = (uint8_t *)&ctdb->tunable;
		outdata->dsize = sizeof(ctdb->tunable);
		return 0;
	}

	case CTDB_CONTROL_DUMP_MEMORY: {
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_dump_memory(ctdb, outdata);
	}

	case CTDB_CONTROL_STATISTICS_RESET: {
		struct ctdb_db_context *ctdb_db;

		CHECK_CONTROL_DATA_SIZE(0);
		ZERO_STRUCT(ctdb->statistics);
		for (ctdb_db = ctdb->db_list;
		     ctdb_db != NULL;
		     ctdb_db = ctdb_db->next) {
			ctdb_db_statistics_reset(ctdb_db);
		}
		ctdb->statistics.statistics_start_time = timeval_current();
		return 0;
	}

	case CTDB_CONTROL_GETVNNMAP:
		return ctdb_control_getvnnmap(ctdb, opcode, indata, outdata);

	case CTDB_CONTROL_GET_DBMAP:
		return ctdb_control_getdbmap(ctdb, opcode, indata, outdata);

	case CTDB_CONTROL_GET_NODEMAPv4:
		return control_not_implemented("GET_NODEMAPv4", "GET_NODEMAP");

	case CTDB_CONTROL_GET_NODEMAP:
		return ctdb_control_getnodemap(ctdb, opcode, indata, outdata);

	case CTDB_CONTROL_GET_NODES_FILE:
		return ctdb_control_getnodesfile(ctdb, opcode, indata, outdata);

	case CTDB_CONTROL_RELOAD_NODES_FILE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_reload_nodes_file(ctdb, opcode);

	case CTDB_CONTROL_SET_DB_STICKY: {
		uint32_t db_id;
		struct ctdb_db_context *ctdb_db;

		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
		db_id = *(uint32_t *)indata.dptr;
		ctdb_db = find_ctdb_db(ctdb, db_id);
		if (ctdb_db == NULL) return -1;
		return ctdb_set_db_sticky(ctdb, ctdb_db);
	}

	case CTDB_CONTROL_SETVNNMAP:
		return ctdb_control_setvnnmap(ctdb, opcode, indata, outdata);

	case CTDB_CONTROL_PULL_DB:
		return control_not_implemented("PULL_DB", NULL);

	case CTDB_CONTROL_SET_DMASTER:
		return control_not_implemented("SET_DMASTER", NULL);

	case CTDB_CONTROL_PUSH_DB:
		return control_not_implemented("PUSH_DB", NULL);

	case CTDB_CONTROL_GET_RECMODE: {
		/* recovery mode is returned as the control status */
		return ctdb->recovery_mode;
	}

	case CTDB_CONTROL_SET_RECMASTER:
		return control_not_implemented("SET_RECMASTER", NULL);

	case CTDB_CONTROL_GET_RECMASTER:
		return control_not_implemented("GET_RECMASTER", NULL);

	case CTDB_CONTROL_GET_PID:
		return getpid();

	case CTDB_CONTROL_GET_PNN:
		return ctdb->pnn;

	case CTDB_CONTROL_PING:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb->num_clients;

	case CTDB_CONTROL_GET_RUNSTATE:
		CHECK_CONTROL_DATA_SIZE(0);
		outdata->dptr = (uint8_t *)&ctdb->runstate;
		outdata->dsize = sizeof(uint32_t);
		return 0;


	case CTDB_CONTROL_SET_DB_READONLY: {
		uint32_t db_id;
		struct ctdb_db_context *ctdb_db;

		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
		db_id = *(uint32_t *)indata.dptr;
		ctdb_db = find_ctdb_db(ctdb, db_id);
		if (ctdb_db == NULL) return -1;
		return ctdb_set_db_readonly(ctdb, ctdb_db);
	}
	case CTDB_CONTROL_GET_DBNAME: {
		uint32_t db_id;
		struct ctdb_db_context *ctdb_db;

		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
		db_id = *(uint32_t *)indata.dptr;
		ctdb_db = find_ctdb_db(ctdb, db_id);
		if (ctdb_db == NULL) return -1;
		outdata->dptr = discard_const(ctdb_db->db_name);
		outdata->dsize = strlen(ctdb_db->db_name)+1;
		return 0;
	}

	case CTDB_CONTROL_GETDBPATH: {
		uint32_t db_id;
		struct ctdb_db_context *ctdb_db;

		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
		db_id = *(uint32_t *)indata.dptr;
		ctdb_db = find_ctdb_db(ctdb, db_id);
		if (ctdb_db == NULL) return -1;
		outdata->dptr = discard_const(ctdb_db->db_path);
		outdata->dsize = strlen(ctdb_db->db_path)+1;
		return 0;
	}

	case CTDB_CONTROL_DB_ATTACH:
		return ctdb_control_db_attach(ctdb,
					      indata,
					      outdata,
					      0,
					      srcnode,
					      client_id,
					      c,
					      async_reply);

	case CTDB_CONTROL_DB_ATTACH_PERSISTENT:
		return ctdb_control_db_attach(ctdb,
					      indata,
					      outdata,
					      CTDB_DB_FLAGS_PERSISTENT,
					      srcnode,
					      client_id,
					      c,
					      async_reply);

	case CTDB_CONTROL_DB_ATTACH_REPLICATED:
		return ctdb_control_db_attach(ctdb,
					      indata,
					      outdata,
					      CTDB_DB_FLAGS_REPLICATED,
					      srcnode,
					      client_id,
					      c,
					      async_reply);

	case CTDB_CONTROL_SET_CALL:
		return control_not_implemented("SET_CALL", NULL);

	case CTDB_CONTROL_TRAVERSE_START:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
		return ctdb_control_traverse_start(ctdb, indata, outdata, srcnode, client_id);

	case CTDB_CONTROL_TRAVERSE_START_EXT:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start_ext));
		return ctdb_control_traverse_start_ext(ctdb, indata, outdata, srcnode, client_id);

	case CTDB_CONTROL_TRAVERSE_ALL:
		return ctdb_control_traverse_all(ctdb, indata, outdata);

	case CTDB_CONTROL_TRAVERSE_ALL_EXT:
		return ctdb_control_traverse_all_ext(ctdb, indata, outdata);

	case CTDB_CONTROL_TRAVERSE_DATA:
		return ctdb_control_traverse_data(ctdb, indata, outdata);

	case CTDB_CONTROL_TRAVERSE_KILL:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
		return ctdb_control_traverse_kill(ctdb, indata, outdata, srcnode);

	case CTDB_CONTROL_REGISTER_SRVID:
		return daemon_register_message_handler(ctdb, client_id, srvid);

	case CTDB_CONTROL_DEREGISTER_SRVID:
		return daemon_deregister_message_handler(ctdb, client_id, srvid);

	case CTDB_CONTROL_CHECK_SRVIDS:
		return control_not_implemented("CHECK_SRVIDS", NULL);

	case CTDB_CONTROL_ENABLE_SEQNUM:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_ltdb_enable_seqnum(ctdb, *(uint32_t *)indata.dptr);

	case CTDB_CONTROL_UPDATE_SEQNUM:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_ltdb_update_seqnum(ctdb, *(uint32_t *)indata.dptr, srcnode);

	case CTDB_CONTROL_FREEZE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_freeze(ctdb, c, async_reply);

	case CTDB_CONTROL_THAW:
		return control_not_implemented("THAW", NULL);

	case CTDB_CONTROL_SET_RECMODE:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);

	case CTDB_CONTROL_GET_MONMODE:
		return control_not_implemented("GET_MONMODE", NULL);

	case CTDB_CONTROL_ENABLE_MONITOR:
		return control_not_implemented("ENABLE_MONITOR", NULL);

	case CTDB_CONTROL_RUN_EVENTSCRIPTS:
		return control_not_implemented("RUN_EVENTSCRIPTS", NULL);

	case CTDB_CONTROL_DISABLE_MONITOR:
		return control_not_implemented("DISABLE_MONITOR", NULL);

	case CTDB_CONTROL_SHUTDOWN:
		/* normally does not return at all */
		DEBUG(DEBUG_NOTICE,("Received SHUTDOWN command.\n"));
		ctdb_shutdown_sequence(ctdb, 0);
		/* In case above returns due to duplicate shutdown */
		return 0;

	case CTDB_CONTROL_TAKEOVER_IPv4:
		return control_not_implemented("TAKEOVER_IPv4", "TAKEOVER_IP");

	case CTDB_CONTROL_TAKEOVER_IP:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
		return ctdb_control_takeover_ip(ctdb, c, indata, async_reply);

	case CTDB_CONTROL_RELEASE_IPv4:
		return control_not_implemented("RELEASE_IPv4", "RELEASE_IP");

	case CTDB_CONTROL_RELEASE_IP:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
		return ctdb_control_release_ip(ctdb, c, indata, async_reply);

	case CTDB_CONTROL_IPREALLOCATED:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_ipreallocated(ctdb, c, async_reply);

	case CTDB_CONTROL_GET_PUBLIC_IPSv4:
		return control_not_implemented("GET_PUBLIC_IPSv4",
					       "GET_PUBLIC_IPS");

	case CTDB_CONTROL_GET_PUBLIC_IPS:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_get_public_ips(ctdb, c, outdata);

	case CTDB_CONTROL_TCP_CLIENT:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_client(ctdb, client_id, indata);

	case CTDB_CONTROL_STARTUP:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_startup(ctdb, srcnode);

	case CTDB_CONTROL_TCP_ADD:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_add(ctdb, indata, false);

	case CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_add(ctdb, indata, true);

	case CTDB_CONTROL_TCP_REMOVE:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_remove(ctdb, indata);

	case CTDB_CONTROL_SET_TUNABLE:
		return ctdb_control_set_tunable(ctdb, indata);

	case CTDB_CONTROL_GET_TUNABLE:
		return ctdb_control_get_tunable(ctdb, indata, outdata);

	case CTDB_CONTROL_LIST_TUNABLES:
		return ctdb_control_list_tunables(ctdb, outdata);

	case CTDB_CONTROL_MODIFY_FLAGS:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_node_flag_change));
		return ctdb_control_modflags(ctdb, indata);

	case CTDB_CONTROL_KILL_TCP:
		return control_not_implemented("KILL_TCP", NULL);

	case CTDB_CONTROL_GET_TCP_TICKLE_LIST:
		CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
		return ctdb_control_get_tcp_tickle_list(ctdb, indata, outdata);

	case CTDB_CONTROL_SET_TCP_TICKLE_LIST:
		/* data size is verified in the called function */
		return ctdb_control_set_tcp_tickle_list(ctdb, indata);

	case CTDB_CONTROL_REGISTER_SERVER_ID:
		return control_not_implemented("REGISTER_SERVER_ID", NULL);

	case CTDB_CONTROL_UNREGISTER_SERVER_ID:
		return control_not_implemented("UNREGISTER_SERVER_ID", NULL);

	case CTDB_CONTROL_CHECK_SERVER_ID:
		return control_not_implemented("CHECK_SERVER_ID", NULL);

	case CTDB_CONTROL_GET_SERVER_ID_LIST:
		return control_not_implemented("SERVER_ID_LIST", NULL);

	case CTDB_CONTROL_PERSISTENT_STORE:
		return control_not_implemented("PERSISTENT_STORE", NULL);

	case CTDB_CONTROL_UPDATE_RECORD:
		return ctdb_control_update_record(ctdb, c, indata, async_reply);

	case CTDB_CONTROL_SEND_GRATUITOUS_ARP:
		return ctdb_control_send_gratious_arp(ctdb, indata);

	case CTDB_CONTROL_TRANSACTION_START:
		return control_not_implemented("TRANSACTION_START", NULL);

	case CTDB_CONTROL_TRANSACTION_COMMIT:
		return control_not_implemented("TRANSACTION_COMMIT", NULL);

	case CTDB_CONTROL_WIPE_DATABASE:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
		return ctdb_control_wipe_database(ctdb, indata);

	case CTDB_CONTROL_UPTIME:
		return ctdb_control_uptime(ctdb, outdata);

	case CTDB_CONTROL_START_RECOVERY:
		return ctdb_control_start_recovery(ctdb, c, async_reply);

	case CTDB_CONTROL_END_RECOVERY:
		return ctdb_control_end_recovery(ctdb, c, async_reply);

	case CTDB_CONTROL_TRY_DELETE_RECORDS:
		return ctdb_control_try_delete_records(ctdb, indata, outdata);

	case CTDB_CONTROL_ADD_PUBLIC_IP:
		return ctdb_control_add_public_address(ctdb, indata);

	case CTDB_CONTROL_DEL_PUBLIC_IP:
		return ctdb_control_del_public_address(ctdb, indata);

	case CTDB_CONTROL_GET_CAPABILITIES:
		return ctdb_control_get_capabilities(ctdb, outdata);

	case CTDB_CONTROL_START_PERSISTENT_UPDATE:
		return ctdb_control_start_persistent_update(ctdb, c, indata);

	case CTDB_CONTROL_CANCEL_PERSISTENT_UPDATE:
		return ctdb_control_cancel_persistent_update(ctdb, c, indata);

	case CTDB_CONTROL_TRANS2_COMMIT:
	case CTDB_CONTROL_TRANS2_COMMIT_RETRY:
		return control_not_implemented("TRANS2_COMMIT", "TRANS3_COMMIT");

	case CTDB_CONTROL_TRANS2_ERROR:
		return control_not_implemented("TRANS2_ERROR", NULL);

	case CTDB_CONTROL_TRANS2_FINISHED:
		return control_not_implemented("TRANS2_FINISHED", NULL);

	case CTDB_CONTROL_TRANS2_ACTIVE:
		return control_not_implemented("TRANS2_ACTIVE", NULL);

	case CTDB_CONTROL_TRANS3_COMMIT:
		return ctdb_control_trans3_commit(ctdb, c, indata, async_reply);

	case CTDB_CONTROL_RECD_PING:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_recd_ping(ctdb);

	case CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS:
		return control_not_implemented("GET_EVENT_SCRIPT_STATUS", NULL);

	case CTDB_CONTROL_RECD_RECLOCK_LATENCY:
		CHECK_CONTROL_DATA_SIZE(sizeof(double));
		CTDB_UPDATE_RECLOCK_LATENCY(ctdb, "recd reclock", reclock.recd, *((double *)indata.dptr));
		return 0;
	case CTDB_CONTROL_GET_RECLOCK_FILE:
		/* empty reply means no recovery lock is configured */
		CHECK_CONTROL_DATA_SIZE(0);
		if (ctdb->recovery_lock != NULL) {
			outdata->dptr = discard_const(ctdb->recovery_lock);
			outdata->dsize = strlen(ctdb->recovery_lock) + 1;
		}
		return 0;
	case CTDB_CONTROL_SET_RECLOCK_FILE:
		return control_not_implemented("SET_RECLOCK", NULL);

	case CTDB_CONTROL_STOP_NODE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_stop_node(ctdb);

	case CTDB_CONTROL_CONTINUE_NODE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_continue_node(ctdb);

	case CTDB_CONTROL_SET_NATGWSTATE:
		return control_not_implemented("SET_NATGWSTATE", NULL);

	case CTDB_CONTROL_SET_LMASTERROLE: {
		uint32_t lmasterrole;

		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		lmasterrole = *(uint32_t *)indata.dptr;
		if (lmasterrole == 0) {
			ctdb->capabilities &= ~CTDB_CAP_LMASTER;
		} else {
			ctdb->capabilities |= CTDB_CAP_LMASTER;
		}
		return 0;
	}

	case CTDB_CONTROL_SET_RECMASTERROLE: {
		uint32_t recmasterrole;

		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		recmasterrole = *(uint32_t *)indata.dptr;
		if (recmasterrole == 0) {
			ctdb->capabilities &= ~CTDB_CAP_RECMASTER;
		} else {
			ctdb->capabilities |= CTDB_CAP_RECMASTER;
		}
		return 0;
	}

	case CTDB_CONTROL_ENABLE_SCRIPT:
		return control_not_implemented("ENABLE_SCRIPT", NULL);

	case CTDB_CONTROL_DISABLE_SCRIPT:
		return control_not_implemented("DISABLE_SCRIPT", NULL);

	case CTDB_CONTROL_SET_BAN_STATE:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_ban_state));
		return ctdb_control_set_ban_state(ctdb, indata);

	case CTDB_CONTROL_GET_BAN_STATE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_get_ban_state(ctdb, outdata);

	case CTDB_CONTROL_SET_DB_PRIORITY:
		return control_not_implemented("SET_DB_PRIORITY", NULL);

	case CTDB_CONTROL_GET_DB_PRIORITY:
		return control_not_implemented("GET_DB_PRIORITY", NULL);

	case CTDB_CONTROL_TRANSACTION_CANCEL:
		return control_not_implemented("TRANSACTION_CANCEL", NULL);

	case CTDB_CONTROL_REGISTER_NOTIFY:
		return ctdb_control_register_notify(ctdb, client_id, indata);

	case CTDB_CONTROL_DEREGISTER_NOTIFY:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t));
		return ctdb_control_deregister_notify(ctdb, client_id, indata);

	case CTDB_CONTROL_GET_LOG:
		return control_not_implemented("GET_LOG", NULL);

	case CTDB_CONTROL_CLEAR_LOG:
		return control_not_implemented("CLEAR_LOG", NULL);

	case CTDB_CONTROL_GET_DB_SEQNUM:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t));
		return ctdb_control_get_db_seqnum(ctdb, indata, outdata);

	case CTDB_CONTROL_DB_SET_HEALTHY:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_set_healthy(ctdb, indata);

	case CTDB_CONTROL_DB_GET_HEALTH:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_get_health(ctdb, indata, outdata);

	case CTDB_CONTROL_GET_PUBLIC_IP_INFO:
		CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
		return ctdb_control_get_public_ip_info(ctdb, c, indata, outdata);

	case CTDB_CONTROL_GET_IFACES:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_get_ifaces(ctdb, c, outdata);

	case CTDB_CONTROL_SET_IFACE_LINK_STATE:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_iface));
		return ctdb_control_set_iface_link(ctdb, c, indata);

	case CTDB_CONTROL_GET_STAT_HISTORY:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_get_stat_history(ctdb, c, outdata);

	case CTDB_CONTROL_SCHEDULE_FOR_DELETION: {
		struct ctdb_control_schedule_for_deletion *d;
		/* variable-length payload: validate the fixed header
		 * first, then the full size including the key */
		size_t size = offsetof(struct ctdb_control_schedule_for_deletion, key);
		CHECK_CONTROL_MIN_DATA_SIZE(size);
		d = (struct ctdb_control_schedule_for_deletion *)indata.dptr;
		size += d->keylen;
		CHECK_CONTROL_DATA_SIZE(size);
		return ctdb_control_schedule_for_deletion(ctdb, indata);
	}
	case CTDB_CONTROL_GET_DB_STATISTICS:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_get_db_statistics(ctdb, *(uint32_t *)indata.dptr, outdata);

	case CTDB_CONTROL_RELOAD_PUBLIC_IPS:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_reload_public_ips(ctdb, c, async_reply);

	case CTDB_CONTROL_RECEIVE_RECORDS:
		return control_not_implemented("RECEIVE_RECORDS", NULL);

	case CTDB_CONTROL_DB_DETACH:
		return ctdb_control_db_detach(ctdb, indata, client_id);

	case CTDB_CONTROL_DB_FREEZE:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_freeze(ctdb, c, *(uint32_t *)indata.dptr,
					      async_reply);

	case CTDB_CONTROL_DB_THAW:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_thaw(ctdb, *(uint32_t *)indata.dptr);

	case CTDB_CONTROL_DB_TRANSACTION_START:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
		return ctdb_control_db_transaction_start(ctdb, indata);

	case CTDB_CONTROL_DB_TRANSACTION_COMMIT:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
		return ctdb_control_db_transaction_commit(ctdb, indata);

	case CTDB_CONTROL_DB_TRANSACTION_CANCEL:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_transaction_cancel(ctdb, indata);

	case CTDB_CONTROL_DB_PULL:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext));
		return ctdb_control_db_pull(ctdb, c, indata, outdata);

	case CTDB_CONTROL_DB_PUSH_START:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext));
		return ctdb_control_db_push_start(ctdb, indata);

	case CTDB_CONTROL_DB_PUSH_CONFIRM:
		CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
		return ctdb_control_db_push_confirm(ctdb, indata, outdata);

	case CTDB_CONTROL_DB_OPEN_FLAGS: {
		uint32_t db_id;
		struct ctdb_db_context *ctdb_db;
		int tdb_flags;

		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
		db_id = *(uint32_t *)indata.dptr;
		ctdb_db = find_ctdb_db(ctdb, db_id);
		if (ctdb_db == NULL) {
			return -1;
		}

		tdb_flags = tdb_get_flags(ctdb_db->ltdb->tdb);

		/* reply payload is talloc'd off outdata so it is freed
		 * with the reply */
		outdata->dptr = talloc_size(outdata, sizeof(tdb_flags));
		if (outdata->dptr == NULL) {
			return -1;
		}

		outdata->dsize = sizeof(tdb_flags);
		memcpy(outdata->dptr, &tdb_flags, outdata->dsize);
		return 0;
	}

	case CTDB_CONTROL_CHECK_PID_SRVID:
		CHECK_CONTROL_DATA_SIZE((sizeof(pid_t) + sizeof(uint64_t)));
		return ctdb_control_check_pid_srvid(ctdb, indata);

	case CTDB_CONTROL_TUNNEL_REGISTER:
		return ctdb_control_tunnel_register(ctdb, client_id, srvid);

	case CTDB_CONTROL_TUNNEL_DEREGISTER:
		return ctdb_control_tunnel_deregister(ctdb, client_id, srvid);

	case CTDB_CONTROL_VACUUM_FETCH:
		return ctdb_control_vacuum_fetch(ctdb, indata);

	case CTDB_CONTROL_DB_VACUUM: {
		struct ctdb_db_vacuum db_vacuum;

		/* ctdb_db_vacuum_len() gives the fixed wire size; the
		 * local struct is only used for that sizing call */
		CHECK_CONTROL_DATA_SIZE(ctdb_db_vacuum_len(&db_vacuum));
		return ctdb_control_db_vacuum(ctdb, c, indata, async_reply);
	}
	case CTDB_CONTROL_ECHO_DATA: {
		return ctdb_control_echo_data(ctdb, c, indata, async_reply);
	}

	case CTDB_CONTROL_DISABLE_NODE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_disable_node(ctdb);

	case CTDB_CONTROL_ENABLE_NODE:
		CHECK_CONTROL_DATA_SIZE(0);
		return ctdb_control_enable_node(ctdb);

	case CTDB_CONTROL_TCP_CLIENT_DISCONNECTED:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_client_disconnected(ctdb, client_id, indata);

	case CTDB_CONTROL_TCP_CLIENT_PASSED:
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
		return ctdb_control_tcp_client_passed(ctdb, client_id, indata);

	default:
		DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
		return -1;
	}
}
+
/*
  send a reply for a ctdb control

  The reply packet carries the payload (outdata, may be NULL) followed
  immediately by the error string (not NUL-terminated; its length is in
  r->errorlen).  Nothing is sent if the requester asked for NOREPLY.
  Failure to allocate the reply is logged and otherwise ignored - the
  requester's control will time out.
 */
void ctdb_request_control_reply(struct ctdb_context *ctdb, struct ctdb_req_control_old *c,
				TDB_DATA *outdata, int32_t status, const char *errormsg)
{
	struct ctdb_reply_control_old *r;
	size_t len;

	/* some controls send no reply */
	if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
		return;
	}

	len = offsetof(struct ctdb_reply_control_old, data) + (outdata?outdata->dsize:0);
	if (errormsg) {
		len += strlen(errormsg);
	}
	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CONTROL, len, struct ctdb_reply_control_old);
	if (r == NULL) {
		DEBUG(DEBUG_ERR,(__location__ "Unable to allocate transport - OOM or transport is down\n"));
		return;
	}

	r->hdr.destnode     = c->hdr.srcnode;
	r->hdr.reqid        = c->hdr.reqid;
	r->status           = status;
	r->datalen          = outdata?outdata->dsize:0;
	if (outdata && outdata->dsize) {
		memcpy(&r->data[0], outdata->dptr, outdata->dsize);
	}
	if (errormsg) {
		/* error text is appended directly after the payload */
		r->errorlen = strlen(errormsg);
		memcpy(&r->data[r->datalen], errormsg, r->errorlen);
	}

	/* the original opcode is echoed so per-opcode statistics work */
	ctdb_queue_packet_opcode(ctdb, &r->hdr, c->opcode);

	talloc_free(r);
}
+
+/*
+ called when a CTDB_REQ_CONTROL packet comes in
+*/
+void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct ctdb_req_control_old *c = (struct ctdb_req_control_old *)hdr;
+ TDB_DATA data, *outdata;
+ int32_t status;
+ bool async_reply = false;
+ const char *errormsg = NULL;
+
+ data.dptr = &c->data[0];
+ data.dsize = c->datalen;
+
+ outdata = talloc_zero(c, TDB_DATA);
+
+ status = ctdb_control_dispatch(ctdb, c, data, outdata, hdr->srcnode,
+ &errormsg, &async_reply);
+
+ if (!async_reply) {
+ ctdb_request_control_reply(ctdb, c, outdata, status, errormsg);
+ }
+}
+
/*
  called when a CTDB_REPLY_CONTROL packet comes in

  Looks up the pending control by reqid and invokes its callback with
  the reply status, payload and optional error string.  The state is
  reparented onto the packet so that it (and its reqid mapping, via the
  destructor) is released when the packet is freed by the transport
  layer after this handler returns.
*/
void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
{
	struct ctdb_reply_control_old *c = (struct ctdb_reply_control_old *)hdr;
	TDB_DATA data;
	struct ctdb_control_state *state;
	const char *errormsg = NULL;

	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_control_state);
	if (state == NULL) {
		/* reply for a control that already timed out or was freed */
		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_control\n",
				 ctdb->pnn, hdr->reqid));
		return;
	}

	if (hdr->reqid != state->reqid) {
		/* we found a record but it was the wrong one */
		DEBUG(DEBUG_ERR, ("Dropped orphaned control reply with reqid:%u\n", hdr->reqid));
		return;
	}

	data.dptr = &c->data[0];
	data.dsize = c->datalen;
	if (c->errorlen) {
		/* error text follows the payload in the packet and is
		 * not NUL-terminated, hence strndup */
		errormsg = talloc_strndup(state,
					  (char *)&c->data[c->datalen], c->errorlen);
	}

	/* make state a child of the packet, so it goes away when the packet
	   is freed. */
	talloc_steal(hdr, state);

	state->callback(ctdb, c->status, data, errormsg, state->private_data);
}
+
+static int ctdb_control_destructor(struct ctdb_control_state *state)
+{
+ reqid_remove(state->ctdb->idr, state->reqid);
+ return 0;
+}
+
/*
  handle a timeout of a control

  Invokes the callback with status -1 and a timeout message.  The state
  is first stolen onto a temporary context: the callback may free
  private_data (which may own state), so freeing via tmp_ctx afterwards
  avoids both a double free and a leak.
 */
static void ctdb_control_timeout(struct tevent_context *ev,
				 struct tevent_timer *te,
				 struct timeval t, void *private_data)
{
	struct ctdb_control_state *state = talloc_get_type(private_data, struct ctdb_control_state);
	TALLOC_CTX *tmp_ctx = talloc_new(ev);

	CTDB_INCREMENT_STAT(state->ctdb, timeouts.control);

	talloc_steal(tmp_ctx, state);

	state->callback(state->ctdb, -1, tdb_null,
			"ctdb_control timed out",
			state->private_data);
	talloc_free(tmp_ctx);
}
+
+
/*
  send a control message to a node

  destnode may be a PNN or one of the CTDB_BROADCAST_* values;
  broadcasts require CTDB_CTRL_FLAG_NOREPLY since multiple replies
  cannot be matched to one request.  Unless NOREPLY is set, 'callback'
  is invoked exactly once - with the reply, with a timeout error (if
  the ControlTimeout tunable is non-zero), or immediately with an error
  for a disconnected destination.  The pending state is made a child of
  private_data (when given), so freeing private_data cancels interest
  in the reply.  Returns 0 on success (including the
  disconnected-destination case) and -1 on hard failure.
 */
int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
			     uint64_t srvid, uint32_t opcode, uint32_t client_id,
			     uint32_t flags,
			     TDB_DATA data,
			     ctdb_control_callback_fn_t callback,
			     void *private_data)
{
	struct ctdb_req_control_old *c;
	struct ctdb_control_state *state;
	size_t len;

	if (ctdb->methods == NULL) {
		DEBUG(DEBUG_INFO,(__location__ " Failed to send control. Transport is DOWN\n"));
		return -1;
	}

	if (((destnode == CTDB_BROADCAST_ACTIVE) ||
	     (destnode == CTDB_BROADCAST_ALL) ||
	     (destnode == CTDB_BROADCAST_CONNECTED)) &&
	    !(flags & CTDB_CTRL_FLAG_NOREPLY)) {
		DEBUG(DEBUG_CRIT,("Attempt to broadcast control without NOREPLY\n"));
		return -1;
	}

	if (destnode != CTDB_BROADCAST_ACTIVE &&
	    destnode != CTDB_BROADCAST_ALL &&
	    destnode != CTDB_BROADCAST_CONNECTED &&
	    (!ctdb_validate_pnn(ctdb, destnode) ||
	     (ctdb->nodes[destnode]->flags & NODE_FLAGS_DISCONNECTED))) {
		/* fail the control synchronously rather than queueing a
		 * packet that can never be delivered */
		if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) {
			callback(ctdb, -1, tdb_null, "ctdb_control to disconnected node", private_data);
		}
		return 0;
	}

	/* the state is made a child of private_data if possible. This means any reply
	   will be discarded if the private_data goes away */
	state = talloc(private_data?private_data:ctdb, struct ctdb_control_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->reqid = reqid_new(ctdb->idr, state);
	state->callback = callback;
	state->private_data = private_data;
	state->ctdb = ctdb;
	state->flags = flags;

	/* destructor removes the reqid mapping when state is freed */
	talloc_set_destructor(state, ctdb_control_destructor);

	len = offsetof(struct ctdb_req_control_old, data) + data.dsize;
	c = ctdb_transport_allocate(ctdb, state, CTDB_REQ_CONTROL, len,
				    struct ctdb_req_control_old);
	CTDB_NO_MEMORY(ctdb, c);
	talloc_set_name_const(c, "ctdb_req_control packet");

	c->hdr.destnode     = destnode;
	c->hdr.reqid        = state->reqid;
	c->opcode           = opcode;
	c->client_id        = client_id;
	c->flags            = flags;
	c->srvid            = srvid;
	c->datalen          = data.dsize;
	if (data.dsize) {
		memcpy(&c->data[0], data.dptr, data.dsize);
	}

	/* the transport copies the packet, so c can be freed below */
	ctdb_queue_packet(ctdb, &c->hdr);

	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
		/* no reply expected, so no state to keep */
		talloc_free(state);
		return 0;
	}

	if (ctdb->tunable.control_timeout) {
		tevent_add_timer(ctdb->ev, state,
				 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
				 ctdb_control_timeout, state);
	}

	talloc_free(c);
	return 0;
}
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
new file mode 100644
index 0000000..eb9d634
--- /dev/null
+++ b/ctdb/server/ctdb_daemon.c
@@ -0,0 +1,2248 @@
+/*
+ ctdb daemon code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "system/time.h"
+
+#include <talloc.h>
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/blocking.h"
+#include "lib/util/become_daemon.h"
+
+#include "version.h"
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/pidfile.h"
+#include "common/sock_io.h"
+
+/*
+ * Node in the doubly-linked list (ctdb->client_pids) that maps a
+ * connected client's unix peer PID to its ctdb_client structure.
+ */
+struct ctdb_client_pid_list {
+	struct ctdb_client_pid_list *next, *prev; /* DLIST links */
+	struct ctdb_context *ctdb;
+	pid_t pid;			/* peer PID of the client process */
+	struct ctdb_client *client;	/* the client this entry refers to */
+};
+
+/* Path of the daemon PID file (set from configuration) and its context */
+const char *ctdbd_pidfile = NULL;
+static struct pidfile_context *ctdbd_pidfile_ctx = NULL;
+
+static void daemon_incoming_packet(void *, struct ctdb_req_header *);
+
+/*
+ * PID of the main ctdbd process; used by handlers inherited across
+ * fork() to detect whether they are running in a child.
+ * NOTE(review): identifiers starting with a double underscore are
+ * reserved for the implementation — consider renaming.
+ */
+static pid_t __ctdbd_pid;
+
+/*
+ * atexit()-style shutdown notice. Forked children inherit this handler,
+ * so only the main daemon process (matching __ctdbd_pid) logs.
+ */
+static void print_exit_message(void)
+{
+	if (getpid() != __ctdbd_pid) {
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
+
+	/* Wait a second to allow pending log messages to be flushed */
+	sleep(1);
+}
+
+#ifdef HAVE_GETRUSAGE
+
+/* Snapshot of CPU accounting used to compute utilisation between checks */
+struct cpu_check_threshold_data {
+	unsigned short percent;		/* utilisation computed at this sample */
+	struct timeval timeofday;	/* wall-clock time of the sample */
+	struct timeval ru_time;		/* user+system CPU time consumed */
+};
+
+/*
+ * Periodic (60s) self-check of the daemon's CPU utilisation.
+ *
+ * Computes the percentage of CPU (user+system) consumed since the
+ * previous sample and logs a warning when it crosses the threshold
+ * (default 90%, overridable via CTDB_TEST_CPU_USAGE_THRESHOLD).
+ * Static locals carry the threshold and previous sample between
+ * invocations, so this is only safe as a single recurring timer.
+ */
+static void ctdb_cpu_check_threshold(struct tevent_context *ev,
+				     struct tevent_timer *te,
+				     struct timeval tv,
+				     void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(
+		private_data, struct ctdb_context);
+	uint32_t interval = 60;
+
+	static unsigned short threshold = 0;
+	static struct cpu_check_threshold_data prev = {
+		.percent = 0,
+		.timeofday = { .tv_sec = 0 },
+		.ru_time = { .tv_sec = 0 },
+	};
+
+	struct rusage usage;
+	struct cpu_check_threshold_data curr = {
+		.percent = 0,
+	};
+	int64_t ru_time_diff, timeofday_diff;
+	bool first;
+	int ret;
+
+	/*
+	 * Cache the threshold so that we don't waste time checking
+	 * the environment variable every time
+	 */
+	if (threshold == 0) {
+		const char *t;
+
+		threshold = 90;
+
+		t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
+		if (t != NULL) {
+			int th;
+
+			th = atoi(t);
+			if (th <= 0 || th > 100) {
+				DBG_WARNING("Failed to parse env var: %s\n", t);
+			} else {
+				threshold = th;
+			}
+		}
+	}
+
+	ret = getrusage(RUSAGE_SELF, &usage);
+	if (ret != 0) {
+		DBG_WARNING("rusage() failed: %d\n", ret);
+		goto next;
+	}
+
+	/* Sum the system and user CPU usage */
+	curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
+
+	curr.timeofday = tv;
+
+	first = timeval_is_zero(&prev.timeofday);
+	if (first) {
+		/* No previous values recorded so no calculation to do */
+		goto done;
+	}
+
+	timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
+	if (timeofday_diff <= 0) {
+		/*
+		 * Time went backwards or didn't progress so no (sane)
+		 * calculation can be done
+		 */
+		goto done;
+	}
+
+	ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
+
+	/* Percentage of wall-clock time spent on CPU since last sample */
+	curr.percent = ru_time_diff * 100 / timeofday_diff;
+
+	if (curr.percent >= threshold) {
+		/* Log only if the utilisation changes */
+		if (curr.percent != prev.percent) {
+			D_WARNING("WARNING: CPU utilisation %hu%% >= "
+				  "threshold (%hu%%)\n",
+				  curr.percent,
+				  threshold);
+		}
+	} else {
+		/* Log if the utilisation falls below the threshold */
+		if (prev.percent >= threshold) {
+			D_WARNING("WARNING: CPU utilisation %hu%% < "
+				  "threshold (%hu%%)\n",
+				  curr.percent,
+				  threshold);
+		}
+	}
+
+done:
+	prev = curr;
+
+next:
+	/* Re-arm the one-shot tevent timer for the next interval */
+	tevent_add_timer(ctdb->ev, ctdb,
+			 timeval_current_ofs(interval, 0),
+			 ctdb_cpu_check_threshold,
+			 ctdb);
+}
+
+/* Kick off the recurring CPU utilisation check, firing immediately */
+static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
+{
+	struct timeval now = timeval_current();
+
+	tevent_add_timer(ctdb->ev, ctdb, now,
+			 ctdb_cpu_check_threshold, ctdb);
+}
+#endif /* HAVE_GETRUSAGE */
+
+/*
+ * One-second dummy timer tick. Re-arms itself only in the main daemon
+ * process so that forked children stop ticking.
+ */
+static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
+			   struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data,
+						    struct ctdb_context);
+
+	if (getpid() == ctdb->ctdbd_pid) {
+		tevent_add_timer(ctdb->ev, ctdb,
+				 timeval_current_ofs(1, 0),
+				 ctdb_time_tick, ctdb);
+	}
+}
+
+/* Used to trigger a dummy event once per second, to make
+ * detection of hangs more reliable.
+ */
+/*
+ * Start the once-per-second dummy tick used to make event-loop hang
+ * detection more reliable.
+ */
+static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
+{
+	struct timeval when = timeval_current_ofs(1, 0);
+
+	tevent_add_timer(ctdb->ev, ctdb, when, ctdb_time_tick, ctdb);
+}
+
+/*
+ * Start all of the daemon's recurring background activities once the
+ * event loop is up.
+ */
+static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
+{
+	/* start monitoring for connected/disconnected nodes */
+	ctdb_start_keepalive(ctdb);
+
+	/* start periodic update of tcp tickle lists */
+	ctdb_start_tcp_tickle_update(ctdb);
+
+	/* start listening for recovery daemon pings */
+	ctdb_control_recd_ping(ctdb);
+
+	/* start listening to timer ticks */
+	ctdb_start_time_tickd(ctdb);
+
+#ifdef HAVE_GETRUSAGE
+	/* start the CPU utilisation self-check */
+	ctdb_start_cpu_check_threshold(ctdb);
+#endif /* HAVE_GETRUSAGE */
+}
+
+/*
+ * Install a SIG_IGN disposition for the given signal, with the signal
+ * itself blocked while (not) handling it.
+ */
+static void ignore_signal(int signum)
+{
+	struct sigaction act = {
+		.sa_handler = SIG_IGN,
+	};
+
+	sigemptyset(&act.sa_mask);
+	sigaddset(&act.sa_mask, signum);
+	sigaction(signum, &act, NULL);
+}
+
+
+/*
+ send a packet to a client
+ */
+/*
+ * send a packet to a client
+ *
+ * Returns the result of ctdb_queue_send(), or -1 after KILLING the
+ * client connection when its message queue is too deep.  NOTE: on the
+ * -1 path 'client' has been talloc_free()d — callers must not touch
+ * it afterwards.
+ */
+static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
+{
+	CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
+	if (hdr->operation == CTDB_REQ_MESSAGE) {
+		/* drop slow consumers rather than queueing unboundedly */
+		if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
+			DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
+			talloc_free(client);
+			return -1;
+		}
+	}
+	return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
+}
+
+/*
+ message handler for when we are in daemon mode. This redirects the message
+ to the right client
+ */
+/*
+ * message handler for when we are in daemon mode. This redirects the message
+ * to the right client
+ *
+ * Wraps the payload in a CTDB_REQ_MESSAGE packet and queues it to the
+ * client registered for this srvid.
+ */
+static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
+				   void *private_data)
+{
+	struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
+	struct ctdb_req_message_old *r;
+	int len;
+
+	/* construct a message to send to the client containing the data */
+	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+	r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
+			       len, struct ctdb_req_message_old);
+	CTDB_NO_MEMORY_VOID(client->ctdb, r);
+
+	talloc_set_name_const(r, "req_message packet");
+
+	r->srvid         = srvid;
+	r->datalen       = data.dsize;
+	/*
+	 * Guard the copy: memcpy() from a NULL dptr is undefined
+	 * behaviour even for a zero-length message (matches the
+	 * pattern used elsewhere in this file).
+	 */
+	if (data.dsize > 0) {
+		memcpy(&r->data[0], data.dptr, data.dsize);
+	}
+
+	daemon_queue_send(client, &r->hdr);
+
+	talloc_free(r);
+}
+
+/*
+ this is called when the ctdb daemon received a ctdb request to
+ set the srvid from the client
+ */
+/*
+ * Called when the daemon receives a request from a local client to
+ * register a message handler for the given srvid.  Returns 0 on
+ * success, non-zero on failure.
+ */
+int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+	struct ctdb_client *client;
+	int ret;
+
+	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
+		return -1;
+	}
+
+	ret = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
+			     client);
+	if (ret == 0) {
+		DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
+				  (unsigned long long)srvid));
+	} else {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
+				 (unsigned long long)srvid));
+	}
+
+	return ret;
+}
+
+/*
+ this is called when the ctdb daemon received a ctdb request to
+ remove a srvid from the client
+ */
+/*
+ * Called when the daemon receives a request from a local client to
+ * remove a previously registered srvid handler.
+ */
+int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+	struct ctdb_client *client;
+
+	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
+		return -1;
+	}
+
+	return srvid_deregister(ctdb->srv, srvid, client);
+}
+
+/*
+ * Deliver an incoming tunnel packet to the local client registered for
+ * this tunnel_id.  'data' carries a serialized ctdb_req_tunnel_old
+ * packet; it is re-wrapped in a freshly allocated packet and queued to
+ * the client.
+ */
+void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
+			   void *private_data)
+{
+	struct ctdb_client *client =
+		talloc_get_type_abort(private_data, struct ctdb_client);
+	struct ctdb_req_tunnel_old *c, *pkt;
+	size_t len;
+
+	pkt = (struct ctdb_req_tunnel_old *)data.dptr;
+
+	len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
+	c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
+			       len, struct ctdb_req_tunnel_old);
+	if (c == NULL) {
+		DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
+		return;
+	}
+
+	talloc_set_name_const(c, "req_tunnel packet");
+
+	c->tunnel_id = tunnel_id;
+	c->flags = pkt->flags;
+	c->datalen = pkt->datalen;
+	memcpy(c->data, pkt->data, pkt->datalen);
+
+	daemon_queue_send(client, &c->hdr);
+
+	talloc_free(c);
+}
+
+/*
+ destroy a ctdb_client
+*/
+/*
+ * destroy a ctdb_client
+ *
+ * Talloc destructor: unhooks the client from takeover tracking and the
+ * reqid map.  If the client disconnects while it still holds persistent
+ * updates or an active transaction commit, a recovery is forced to get
+ * the databases back to a consistent state.
+ */
+static int ctdb_client_destructor(struct ctdb_client *client)
+{
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_takeover_client_destructor_hook(client);
+	reqid_remove(client->ctdb->idr, client->client_id);
+	client->ctdb->num_clients--;
+
+	if (client->num_persistent_updates != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
+		client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+	/* client->db_id is only set while a transaction commit is active */
+	ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
+	if (ctdb_db) {
+		DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
+				  "commit active. Forcing recovery.\n"));
+		client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+		/*
+		 * trans3 transaction state:
+		 *
+		 * The destructor sets the pointer to NULL.
+		 */
+		talloc_free(ctdb_db->persistent_state);
+	}
+
+	return 0;
+}
+
+
+/*
+ this is called when the ctdb daemon received a ctdb request message
+ from a local client over the unix domain socket
+ */
+/*
+ * Handle a CTDB_REQ_MESSAGE received from a local client over the unix
+ * domain socket: deliver locally when the destination is this node,
+ * otherwise forward it to the remote node.
+ */
+static void daemon_request_message_from_client(struct ctdb_client *client,
+					       struct ctdb_req_message_old *c)
+{
+	struct ctdb_context *ctdb = client->ctdb;
+	TDB_DATA data;
+	int res;
+
+	if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+		c->hdr.destnode = ctdb_get_pnn(ctdb);
+	}
+
+	/* maybe the message is for another client on this node */
+	if (c->hdr.destnode == ctdb_get_pnn(ctdb)) {
+		ctdb_request_message(ctdb, (struct ctdb_req_header *)c);
+		return;
+	}
+
+	/* its for a remote node */
+	data.dptr = &c->data[0];
+	data.dsize = c->datalen;
+	res = ctdb_daemon_send_message(ctdb, c->hdr.destnode,
+				       c->srvid, data);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
+				 c->hdr.destnode));
+	}
+}
+
+
+/* Per-call state for a client-originated CTDB_REQ_CALL */
+struct daemon_call_state {
+	struct ctdb_client *client;	/* originating local client */
+	uint32_t reqid;			/* reqid to echo back in the reply */
+	struct ctdb_call *call;
+	struct timeval start_time;	/* for call latency statistics */
+
+	/* readonly request ? */
+	uint32_t readonly_fetch;	/* 1 if remapped to FETCH_WITH_HEADER */
+	uint32_t client_callid;		/* call id the client originally asked for */
+};
+
+/*
+ complete a call from a client
+*/
+/*
+ * complete a call from a client
+ *
+ * Builds a CTDB_REPLY_CALL packet from the finished call state and
+ * queues it back to the originating client.  If the client requested a
+ * readonly FETCH (which was remapped to FETCH_WITH_HEADER on the way
+ * in), the extra ctdb_ltdb_header is stripped from the reply first.
+ */
+static void daemon_call_from_client_callback(struct ctdb_call_state *state)
+{
+	struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
+							   struct daemon_call_state);
+	struct ctdb_reply_call_old *r;
+	int res;
+	uint32_t length;
+	struct ctdb_client *client = dstate->client;
+	struct ctdb_db_context *ctdb_db = state->ctdb_db;
+
+	/* reparent so dstate/call survive the freeing of 'state' below */
+	talloc_steal(client, dstate);
+	talloc_steal(dstate, dstate->call);
+
+	res = ctdb_daemon_call_recv(state, dstate->call);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
+		CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+
+		CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
+		return;
+	}
+
+	length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
+	/* If the client asked for readonly FETCH, we remapped this to
+	   FETCH_WITH_HEADER when calling the daemon. So we must
+	   strip the extra header off the reply data before passing
+	   it back to the client.
+	*/
+	if (dstate->readonly_fetch
+	    && dstate->client_callid == CTDB_FETCH_FUNC) {
+		length -= sizeof(struct ctdb_ltdb_header);
+	}
+
+	r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
+			       length, struct ctdb_reply_call_old);
+	if (r == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
+		CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+		CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
+		return;
+	}
+	r->hdr.reqid        = dstate->reqid;
+	r->status           = dstate->call->status;
+
+	if (dstate->readonly_fetch
+	    && dstate->client_callid == CTDB_FETCH_FUNC) {
+		/* client only asked for a FETCH so we must strip off
+		   the extra ctdb_ltdb header
+		*/
+		r->datalen          = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+		memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
+	} else {
+		r->datalen          = dstate->call->reply_data.dsize;
+		memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
+	}
+
+	res = daemon_queue_send(client, &r->hdr);
+	if (res == -1) {
+		/* client is dead - return immediately */
+		return;
+	}
+	if (res != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
+	}
+	CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
+	CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+	talloc_free(dstate);
+}
+
+/* Identifies the originating client by id so a requeued packet can be
+ * safely dropped if the client has since disconnected. */
+struct ctdb_daemon_packet_wrap {
+	struct ctdb_context *ctdb;
+	uint32_t client_id;
+};
+
+/*
+ a wrapper to catch disconnected clients
+ */
+/*
+ * a wrapper to catch disconnected clients
+ *
+ * Re-resolves the client from its id before re-processing a requeued
+ * packet; drops the packet if the client has gone away in the meantime.
+ */
+static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
+{
+	struct ctdb_client *client;
+	struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
+							    struct ctdb_daemon_packet_wrap);
+	if (w == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
+		return;
+	}
+
+	client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+				 w->client_id));
+		talloc_free(w);
+		return;
+	}
+	talloc_free(w);
+
+	/* process it */
+	daemon_incoming_packet(client, hdr);
+}
+
+/* One deferred fetch request waiting for an in-flight fetch to finish */
+struct ctdb_deferred_fetch_call {
+	struct ctdb_deferred_fetch_call *next, *prev;
+	struct ctdb_req_call_old *c;		/* the deferred request packet */
+	struct ctdb_daemon_packet_wrap *w;	/* identifies the client */
+};
+
+/* Queue of deferred fetches for one key, stored in the per-db rb-tree */
+struct ctdb_deferred_fetch_queue {
+	struct ctdb_deferred_fetch_call *deferred_calls;
+};
+
+/* Context for re-injecting one deferred call via a zero timeout event */
+struct ctdb_deferred_requeue {
+	struct ctdb_deferred_fetch_call *dfc;
+	struct ctdb_client *client;
+};
+
+/* called from a timer event and starts reprocessing the deferred call.*/
+/* called from a timer event and starts reprocessing the deferred call.*/
+static void reprocess_deferred_call(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
+	struct ctdb_client *client = dfr->client;
+
+	/* hand packet ownership to the client before re-dispatching it */
+	talloc_steal(client, dfr->dfc->c);
+	daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
+	talloc_free(dfr);
+}
+
+/* the referral context is destroyed either after a timeout or when the initial
+ fetch-lock has finished.
+ at this stage, immediately start reprocessing the queued up deferred
+ calls so they get reprocessed immediately (and since we are dmaster at
+ this stage, trigger the waiting smbd processes to pick up and acquire the
+ record right away.
+*/
+static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
+{
+
+	/* need to reprocess the packets from the queue explicitly instead of
+	   just using a normal destructor since we need to
+	   call the clients in the same order as the requests queued up
+	*/
+	while (dfq->deferred_calls != NULL) {
+		struct ctdb_client *client;
+		struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
+		struct ctdb_deferred_requeue *dfr;
+
+		DLIST_REMOVE(dfq->deferred_calls, dfc);
+
+		/* client may have disconnected while the fetch was in flight */
+		client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
+		if (client == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+					 dfc->w->client_id));
+			continue;
+		}
+
+		/* process it by pushing it back onto the eventloop */
+		dfr = talloc(client, struct ctdb_deferred_requeue);
+		if (dfr == NULL) {
+			DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
+			continue;
+		}
+
+		dfr->dfc    = talloc_steal(dfr, dfc);
+		dfr->client = client;
+
+		/* zero timeout: fires on the next event-loop iteration */
+		tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
+				 reprocess_deferred_call, dfr);
+	}
+
+	return 0;
+}
+
+/* insert the new deferral context into the rb tree.
+ there should never be a pre-existing context here, but check for it
+ warn and destroy the previous context if there is already a deferral context
+ for this key.
+*/
+/*
+ * rb-tree insert callback for a new deferral context.  A pre-existing
+ * context for the same key should never happen; if one is found it is
+ * logged and destroyed before the new one takes its place.
+ */
+static void *insert_dfq_callback(void *parm, void *data)
+{
+	void *old = data;
+
+	if (old == NULL) {
+		return parm;
+	}
+
+	DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", old, parm));
+	talloc_free(old);
+	return parm;
+}
+
+/* if the original fetch-lock did not complete within a reasonable time,
+ free the context and context for all deferred requests to cause them to be
+ re-inserted into the event system.
+*/
+/*
+ * Timeout for a deferral context: freeing it runs the destructor,
+ * which re-injects all queued deferred calls into the event loop.
+ */
+static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
+			struct timeval t, void *private_data)
+{
+	void *dfq = private_data;
+
+	talloc_free(dfq);
+}
+
+/* This function is used in the local daemon to register a KEY in a database
+ for being "fetched"
+ While the remote fetch is in-flight, any further attempts to re-fetch the
+ same record will be deferred until the fetch completes.
+*/
+/* This function is used in the local daemon to register a KEY in a database
+   for being "fetched"
+   While the remote fetch is in-flight, any further attempts to re-fetch the
+   same record will be deferred until the fetch completes.
+   Returns 0 on success, -1 on allocation failure.
+*/
+static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+	uint32_t *k;
+	struct ctdb_deferred_fetch_queue *dfq;
+
+	k = ctdb_key_to_idkey(call, call->key);
+	if (k == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	/* dfq is a child of 'call', so it also dies when the fetch finishes */
+	dfq  = talloc(call, struct ctdb_deferred_fetch_queue);
+	if (dfq == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
+		talloc_free(k);
+		return -1;
+	}
+	dfq->deferred_calls = NULL;
+
+	trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
+
+	/* freeing dfq (by timeout or fetch completion) replays the queue */
+	talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
+
+	/* If the fetch hasn't completed in 30 seconds, just tear it all down
+	   and let it try again as the events are reissued */
+	tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
+			 dfq_timeout, dfq);
+
+	talloc_free(k);
+	return 0;
+}
+
+/* check if this is a duplicate request to a fetch already in-flight
+ if it is, make this call deferred to be reprocessed later when
+ the in-flight fetch completes.
+*/
+/* check if this is a duplicate request to a fetch already in-flight
+   if it is, make this call deferred to be reprocessed later when
+   the in-flight fetch completes.
+   Returns 0 when the call was deferred, -1 when there is no in-flight
+   fetch for this key (or on allocation failure) and the caller should
+   process the call normally.
+*/
+static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
+{
+	uint32_t *k;
+	struct ctdb_deferred_fetch_queue *dfq;
+	struct ctdb_deferred_fetch_call *dfc;
+
+	k = ctdb_key_to_idkey(c, key);
+	if (k == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	/* no queue registered means no fetch in flight for this key */
+	dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
+	if (dfq == NULL) {
+		talloc_free(k);
+		return -1;
+	}
+
+
+	talloc_free(k);
+
+	dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
+	if (dfc == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
+		return -1;
+	}
+
+	dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
+	if (dfc->w == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
+		talloc_free(dfc);
+		return -1;
+	}
+
+	/* take ownership of the packet so it survives until replay */
+	dfc->c = talloc_steal(dfc, c);
+	dfc->w->ctdb = ctdb_db->ctdb;
+	dfc->w->client_id = client->client_id;
+
+	DLIST_ADD_END(dfq->deferred_calls, dfc);
+
+	return 0;
+}
+
+
+/*
+ this is called when the ctdb daemon received a ctdb request call
+ from a local client over the unix domain socket
+ */
+/*
+  this is called when the ctdb daemon received a ctdb request call
+  from a local client over the unix domain socket
+
+  Locks and fetches the record, handles fetch-collapse deferral,
+  readonly-delegation revocation, and finally dispatches the call
+  either locally (when this node is dmaster) or to the remote dmaster.
+ */
+static void daemon_request_call_from_client(struct ctdb_client *client,
+					    struct ctdb_req_call_old *c)
+{
+	struct ctdb_call_state *state;
+	struct ctdb_db_context *ctdb_db;
+	struct daemon_call_state *dstate;
+	struct ctdb_call *call;
+	struct ctdb_ltdb_header header;
+	TDB_DATA key, data;
+	int ret;
+	struct ctdb_context *ctdb = client->ctdb;
+	struct ctdb_daemon_packet_wrap *w;
+
+	CTDB_INCREMENT_STAT(ctdb, total_calls);
+	CTDB_INCREMENT_STAT(ctdb, pending_calls);
+
+	ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
+			  c->db_id));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/*
+		 * this is just a warning, as the tdb should be empty anyway,
+		 * and only persistent databases can be unhealthy, which doesn't
+		 * use this code path
+		 */
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	key.dptr = c->data;
+	key.dsize = c->keylen;
+
+	w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
+	CTDB_NO_MEMORY_VOID(ctdb, w);
+
+	w->ctdb = ctdb;
+	w->client_id = client->client_id;
+
+	/* -2 means the lock is contended and the packet was requeued via
+	   daemon_incoming_packet_wrap; it will be retried later */
+	ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
+					   (struct ctdb_req_header *)c, &data,
+					   daemon_incoming_packet_wrap, w, true);
+	if (ret == -2) {
+		/* will retry later */
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	talloc_free(w);
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+
+	/* check if this fetch request is a duplicate for a
+	   request we already have in flight. If so defer it until
+	   the first request completes.
+	*/
+	if (ctdb->tunable.fetch_collapse == 1) {
+		if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
+			ret = ctdb_ltdb_unlock(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+			}
+			CTDB_DECREMENT_STAT(ctdb, pending_calls);
+			talloc_free(data.dptr);
+			return;
+		}
+	}
+
+	/* Dont do READONLY if we don't have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	/* a completed revoke: clear the readonly flags and drop the
+	   delegation tracking record */
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		header.flags &= ~CTDB_REC_RO_FLAGS;
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * had completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	/* we are dmaster, the client wants a writable copy, and read-only
+	   delegations exist: start revoking them and defer this call */
+	if ((header.dmaster == ctdb->pnn)
+	    && (!(c->flags & CTDB_WANT_READONLY))
+	    && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	dstate = talloc(client, struct daemon_call_state);
+	if (dstate == NULL) {
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+	dstate->start_time = timeval_current();
+	dstate->client = client;
+	dstate->reqid  = c->hdr.reqid;
+	talloc_steal(dstate, data.dptr);
+
+	call = dstate->call = talloc_zero(dstate, struct ctdb_call);
+	if (call == NULL) {
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
+		return;
+	}
+
+	dstate->readonly_fetch = 0;
+	call->call_id = c->callid;
+	call->key = key;
+	call->call_data.dptr = c->data + c->keylen;
+	call->call_data.dsize = c->calldatalen;
+	call->flags = c->flags;
+
+	if (c->flags & CTDB_WANT_READONLY) {
+		/* client wants readonly record, so translate this into a
+		   fetch with header. remember what the client asked for
+		   so we can remap the reply back to the proper format for
+		   the client in the reply
+		 */
+		dstate->client_callid = call->call_id;
+		call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+		dstate->readonly_fetch = 1;
+	}
+
+	if (header.dmaster == ctdb->pnn) {
+		state = ctdb_call_local_send(ctdb_db, call, &header, &data);
+	} else {
+		state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
+		if (ctdb->tunable.fetch_collapse == 1) {
+			/* This request triggered a remote fetch-lock.
+			   set up a deferral for this key so any additional
+			   fetch-locks are deferred until the current one
+			   finishes.
+			 */
+			setup_deferred_fetch_locks(ctdb_db, call);
+		}
+	}
+
+	ret = ctdb_ltdb_unlock(ctdb_db, key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
+		return;
+	}
+	talloc_steal(state, dstate);
+	talloc_steal(client, state);
+
+	state->async.fn = daemon_call_from_client_callback;
+	state->async.private_data = dstate;
+}
+
+
+static void daemon_request_control_from_client(struct ctdb_client *client,
+ struct ctdb_req_control_old *c);
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+ struct ctdb_req_tunnel_old *c);
+
+/* data contains a packet from the client */
+/* data contains a packet from the client
+
+   Validates magic/version and dispatches by operation.  The packet is
+   parented to a temporary context; handlers that need to keep it past
+   this call steal it, otherwise the talloc_free() below releases it.
+ */
+static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
+{
+	struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
+	TALLOC_CTX *tmp_ctx;
+	struct ctdb_context *ctdb = client->ctdb;
+
+	/* place the packet as a child of a tmp_ctx. We then use
+	   talloc_free() below to free it. If any of the calls want
+	   to keep it, then they will steal it somewhere else, and the
+	   talloc_free() will be a no-op */
+	tmp_ctx = talloc_new(client);
+	talloc_steal(tmp_ctx, hdr);
+
+	if (hdr->ctdb_magic != CTDB_MAGIC) {
+		ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
+		goto done;
+	}
+
+	if (hdr->ctdb_version != CTDB_PROTOCOL) {
+		ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+		goto done;
+	}
+
+	switch (hdr->operation) {
+	case CTDB_REQ_CALL:
+		CTDB_INCREMENT_STAT(ctdb, client.req_call);
+		daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
+		break;
+
+	case CTDB_REQ_MESSAGE:
+		CTDB_INCREMENT_STAT(ctdb, client.req_message);
+		daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
+		break;
+
+	case CTDB_REQ_CONTROL:
+		CTDB_INCREMENT_STAT(ctdb, client.req_control);
+		daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
+		break;
+
+	case CTDB_REQ_TUNNEL:
+		CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
+		daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
+		break;
+
+	default:
+		DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
+			 hdr->operation));
+	}
+
+done:
+	talloc_free(tmp_ctx);
+}
+
+/*
+ called when the daemon gets a incoming packet
+ */
+/*
+  called when the daemon gets a incoming packet
+
+  'data' is a talloc-allocated buffer handed over by the queue; the
+  packet handler (or the error paths here) is responsible for freeing
+  it.  cnt == 0 means the client hung up.
+ */
+static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+	struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
+	struct ctdb_req_header *hdr;
+
+	if (cnt == 0) {
+		talloc_free(client);
+		return;
+	}
+
+	CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
+
+	if (cnt < sizeof(*hdr)) {
+		ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
+			       (unsigned)cnt);
+		/* free the buffer like the other error paths do,
+		   instead of leaking it (previously a bare return) */
+		goto err_out;
+	}
+	hdr = (struct ctdb_req_header *)data;
+
+	if (hdr->ctdb_magic != CTDB_MAGIC) {
+		ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
+		goto err_out;
+	}
+
+	if (hdr->ctdb_version != CTDB_PROTOCOL) {
+		ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+		goto err_out;
+	}
+
+	DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
+		 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+		 hdr->srcnode, hdr->destnode));
+
+	/* it is the responsibility of the incoming packet function to free 'data' */
+	daemon_incoming_packet(client, hdr);
+	return;
+
+err_out:
+	TALLOC_FREE(data);
+}
+
+
+/* Talloc destructor: unlink this entry from the daemon's client pid list */
+static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
+{
+	struct ctdb_context *ctdb = client_pid->ctdb;
+
+	if (ctdb->client_pids != NULL) {
+		DLIST_REMOVE(ctdb->client_pids, client_pid);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a new client id for 'client', guaranteeing it is neither
+ * REQID_INVALID nor 0 (id 0 is reserved to mean "control came from
+ * another daemon", see e.g. ctdb_control_db_attach()).
+ *
+ * Returns 0 and sets *out on success, EINVAL on failure.
+ */
+static int get_new_client_id(struct reqid_context *idr,
+			     struct ctdb_client *client,
+			     uint32_t *out)
+{
+	uint32_t client_id;
+
+	client_id = reqid_new(idr, client);
+	/*
+	 * Some places in the code (e.g. ctdb_control_db_attach(),
+	 * ctdb_control_db_detach()) assign a special meaning to
+	 * client_id 0.  The assumption is that if client_id is 0 then
+	 * the control has come from another daemon.  Therefore, we
+	 * should never return client_id == 0.
+	 */
+	if (client_id == 0) {
+		/*
+		 * Don't leak ID 0.  This is safe because the ID keeps
+		 * increasing.  A test will be added to ensure that
+		 * this doesn't change.
+		 */
+		reqid_remove(idr, 0);
+
+		client_id = reqid_new(idr, client);
+	}
+
+	if (client_id == REQID_INVALID) {
+		return EINVAL;
+	}
+
+	if (client_id == 0) {
+		/* Every other ID must have been used and we can't use 0 */
+		reqid_remove(idr, 0);
+		return EINVAL;
+	}
+
+	*out = client_id;
+	return 0;
+}
+
+/*
+ * accept() handler for the unix domain listener: sets up a new
+ * ctdb_client (non-blocking fd, client id, pid tracking, packet queue)
+ * for each connecting local client.
+ */
+static void ctdb_accept_client(struct tevent_context *ev,
+			       struct tevent_fd *fde, uint16_t flags,
+			       void *private_data)
+{
+	struct sockaddr_un addr;
+	socklen_t len;
+	int fd;
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	struct ctdb_client *client;
+	struct ctdb_client_pid_list *client_pid;
+	pid_t peer_pid = 0;
+	int ret;
+
+	memset(&addr, 0, sizeof(addr));
+	len = sizeof(addr);
+	fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
+	if (fd == -1) {
+		return;
+	}
+	smb_set_close_on_exec(fd);
+
+	ret = set_blocking(fd, false);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " failed to set socket non-blocking (%s)\n",
+		       strerror(errno)));
+		close(fd);
+		return;
+	}
+
+	set_close_on_exec(fd);
+
+	DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
+
+	client = talloc_zero(ctdb, struct ctdb_client);
+	if (client == NULL) {
+		/* previously dereferenced without a check - crash on OOM */
+		DEBUG(DEBUG_ERR,("Failed to allocate client structure\n"));
+		close(fd);
+		return;
+	}
+	if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
+		DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
+	}
+
+	client->ctdb = ctdb;
+	client->fd = fd;
+
+	ret = get_new_client_id(ctdb->idr, client, &client->client_id);
+	if (ret != 0) {
+		DBG_ERR("Unable to get client ID (%d)\n", ret);
+		close(fd);
+		talloc_free(client);
+		return;
+	}
+
+	client->pid = peer_pid;
+
+	client_pid = talloc(client, struct ctdb_client_pid_list);
+	if (client_pid == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
+		close(fd);
+		talloc_free(client);
+		return;
+	}
+	client_pid->ctdb   = ctdb;
+	client_pid->pid    = peer_pid;
+	client_pid->client = client;
+
+	DLIST_ADD(ctdb->client_pids, client_pid);
+
+	client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
+					 ctdb_daemon_read_cb, client,
+					 "client-%u", client->pid);
+
+	/* destructors handle reqid/pid-list cleanup when the client goes */
+	talloc_set_destructor(client, ctdb_client_destructor);
+	talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
+	ctdb->num_clients++;
+}
+
+
+
+/*
+ * Create a unix domain socket, bind it, secure it and listen. Return
+ * the file descriptor for the socket.
+ */
+static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ int ret;
+
+ ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (ctdb->daemon.sd == -1) {
+ return -1;
+ }
+
+ strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+ if (! sock_clean(ctdb->daemon.name)) {
+ return -1;
+ }
+
+ set_close_on_exec(ctdb->daemon.sd);
+
+ ret = set_blocking(ctdb->daemon.sd, false);
+ if (ret != 0) {
+ DBG_ERR("Failed to set socket non-blocking (%s)\n",
+ strerror(errno));
+ goto failed;
+ }
+
+ ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr));
+ if (ret == -1) {
+ D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name);
+ goto failed;
+ }
+
+ if (!test_mode_enabled) {
+ ret = chown(ctdb->daemon.name, geteuid(), getegid());
+ if (ret != 0 && !test_mode_enabled) {
+ D_ERR("Unable to secure (chown) ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+ }
+
+ ret = chmod(ctdb->daemon.name, 0700);
+ if (ret != 0) {
+ D_ERR("Unable to secure (chmod) ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+
+
+ ret = listen(ctdb->daemon.sd, 100);
+ if (ret != 0) {
+ D_ERR("Unable to listen on ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+
+ D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name);
+ return 0;
+
+failed:
+ close(ctdb->daemon.sd);
+ ctdb->daemon.sd = -1;
+ return -1;
+}
+
+struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn)
+{
+ struct ctdb_node *node = NULL;
+ unsigned int i;
+
+ if (pnn == CTDB_CURRENT_NODE) {
+ pnn = ctdb->pnn;
+ }
+
+ /* Always found: PNN correctly set just before this is called */
+ for (i = 0; i < ctdb->num_nodes; i++) {
+ node = ctdb->nodes[i];
+ if (pnn == node->pnn) {
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
+static void initialise_node_flags (struct ctdb_context *ctdb)
+{
+ struct ctdb_node *node = NULL;
+
+ node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+ /*
+ * PNN correctly set just before this is called so always
+ * found but keep static analysers happy...
+ */
+ if (node == NULL) {
+ DBG_ERR("Unable to find current node\n");
+ return;
+ }
+
+ node->flags &= ~NODE_FLAGS_DISCONNECTED;
+
+ /* do we start out in DISABLED mode? */
+ if (ctdb->start_as_disabled != 0) {
+ D_ERR("This node is configured to start in DISABLED state\n");
+ node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
+ }
+ /* do we start out in STOPPED mode? */
+ if (ctdb->start_as_stopped != 0) {
+ D_ERR("This node is configured to start in STOPPED state\n");
+ node->flags |= NODE_FLAGS_STOPPED;
+ }
+}
+
+static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ if (status != 0) {
+ ctdb_die(ctdb, "Failed to run setup event");
+ }
+ ctdb_run_notification_script(ctdb, "setup");
+
+ /* Start the recovery daemon */
+ if (ctdb_start_recoverd(ctdb) != 0) {
+ DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
+ exit(11);
+ }
+
+ ctdb_start_periodic_events(ctdb);
+
+ ctdb_wait_for_first_recovery(ctdb);
+}
+
/* Timestamps bracketing the tevent wait, used by ctdb_tevent_trace()
 * to detect both long-running event handlers and long stretches with
 * no events at all. */
static struct timeval tevent_before_wait_ts;
static struct timeval tevent_after_wait_ts;

/* Initialise both trace timestamps to "now" so the first interval
 * computations do not see a bogus huge gap. */
static void ctdb_tevent_trace_init(void)
{
	struct timeval now;

	now = timeval_current();

	tevent_before_wait_ts = now;
	tevent_after_wait_ts = now;
}
+
/* tevent trace callback: warn when event handling blocked the loop for
 * more than 3 seconds, or when no event fired for more than 3 seconds. */
static void ctdb_tevent_trace(enum tevent_trace_point tp,
			      void *private_data)
{
	struct timeval diff;
	struct timeval now;
	struct ctdb_context *ctdb =
		talloc_get_type(private_data, struct ctdb_context);

	/* Only trace in the main daemon, not in forked children that
	 * inherited the tevent context */
	if (getpid() != ctdb->ctdbd_pid) {
		return;
	}

	now = timeval_current();

	switch (tp) {
	case TEVENT_TRACE_BEFORE_WAIT:
		/* Interval since we last left the wait = time spent in
		 * event handlers */
		diff = timeval_until(&tevent_after_wait_ts, &now);
		if (diff.tv_sec > 3) {
			DEBUG(DEBUG_ERR,
			      ("Handling event took %ld seconds!\n",
			       (long)diff.tv_sec));
		}
		tevent_before_wait_ts = now;
		break;

	case TEVENT_TRACE_AFTER_WAIT:
		/* Interval spent inside the wait = time with no events */
		diff = timeval_until(&tevent_before_wait_ts, &now);
		if (diff.tv_sec > 3) {
			DEBUG(DEBUG_ERR,
			      ("No event for %ld seconds!\n",
			       (long)diff.tv_sec));
		}
		tevent_after_wait_ts = now;
		break;

	default:
		/* Do nothing for future tevent trace points */ ;
	}
}
+
/* atexit() handler: freeing the pidfile context removes the PID file. */
static void ctdb_remove_pidfile(void)
{
	TALLOC_FREE(ctdbd_pidfile_ctx);
}
+
+static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
+{
+ if (ctdbd_pidfile != NULL) {
+ int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
+ &ctdbd_pidfile_ctx);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Failed to create PID file %s\n",
+ ctdbd_pidfile));
+ exit(11);
+ }
+
+ DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
+ atexit(ctdb_remove_pidfile);
+ }
+}
+
+static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
+{
+ unsigned int i, j, count;
+
+ /* initialize the vnn mapping table, skipping any deleted nodes */
+ ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+ CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
+
+ count = 0;
+ for (i = 0; i < ctdb->num_nodes; i++) {
+ if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
+ count++;
+ }
+ }
+
+ ctdb->vnn_map->generation = INVALID_GENERATION;
+ ctdb->vnn_map->size = count;
+ ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
+ CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
+
+ for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+ continue;
+ }
+ ctdb->vnn_map->map[j] = i;
+ j++;
+ }
+}
+
+static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
+{
+ if (ctdb->address == NULL) {
+ ctdb_fatal(ctdb,
+ "Can not determine PNN - node address is not set\n");
+ }
+
+ ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
+ if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
+ ctdb_fatal(ctdb,
+ "Can not determine PNN - unknown node address\n");
+ }
+
+ D_NOTICE("PNN is %u\n", ctdb->pnn);
+}
+
+static void stdin_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags,
+ void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type_abort(
+ private_data, struct ctdb_context);
+ ssize_t nread;
+ char c;
+
+ nread = read(STDIN_FILENO, &c, 1);
+ if (nread != 1) {
+ D_ERR("stdin closed, exiting\n");
+ talloc_free(fde);
+ ctdb_shutdown_sequence(ctdb, EPIPE);
+ }
+}
+
+static int setup_stdin_handler(struct ctdb_context *ctdb)
+{
+ struct tevent_fd *fde;
+ struct stat st;
+ int ret;
+
+ ret = fstat(STDIN_FILENO, &st);
+ if (ret != 0) {
+ /* Problem with stdin, ignore... */
+ DBG_INFO("Can't fstat() stdin\n");
+ return 0;
+ }
+
+ if (!S_ISFIFO(st.st_mode)) {
+ DBG_INFO("Not a pipe...\n");
+ return 0;
+ }
+
+ fde = tevent_add_fd(ctdb->ev,
+ ctdb,
+ STDIN_FILENO,
+ TEVENT_FD_READ,
+ stdin_handler,
+ ctdb);
+ if (fde == NULL) {
+ return ENOMEM;
+ }
+
+ DBG_INFO("Set up stdin handler\n");
+ return 0;
+}
+
+static void fork_only(void)
+{
+ pid_t pid;
+
+ pid = fork();
+ if (pid == -1) {
+ D_ERR("Fork failed (errno=%d)\n", errno);
+ exit(1);
+ }
+
+ if (pid != 0) {
+ /* Parent simply exits... */
+ exit(0);
+ }
+}
+
/* SIGHUP hook: forward the signal to the recovery daemon (if running)
 * and reopen this daemon's log files. */
static void sighup_hook(void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type_abort(private_data,
							  struct ctdb_context);

	if (ctdb->recoverd_pid > 0) {
		kill(ctdb->recoverd_pid, SIGHUP);
	}
	ctdb_event_reopen_logs(ctdb);
}
+
+/*
+ start the protocol going as a daemon
+*/
int ctdb_start_daemon(struct ctdb_context *ctdb,
		      bool interactive,
		      bool test_mode_enabled)
{
	bool status;
	int ret;
	struct tevent_fd *fde;

	/* Fork if not interactive */
	if (!interactive) {
		if (test_mode_enabled) {
			/* Keep stdin open */
			fork_only();
		} else {
			/* Fork, close stdin, start a session */
			become_daemon(true, false, false);
		}
	}

	ignore_signal(SIGPIPE);
	ignore_signal(SIGUSR1);

	ctdb->ctdbd_pid = getpid();
	DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
			  SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
	ctdb_create_pidfile(ctdb);

	/* create a unix domain stream socket to listen to */
	ret = ux_socket_bind(ctdb, test_mode_enabled);
	if (ret != 0) {
		D_ERR("Cannot continue. Exiting!\n");
		exit(10);
	}

	/* Make sure we log something when the daemon terminates.
	 * This must be the first exit handler to run (so the last to
	 * be registered).
	 */
	__ctdbd_pid = getpid();
	atexit(print_exit_message);

	if (ctdb->do_setsched) {
		/* try to set us up as realtime */
		if (!set_scheduler()) {
			exit(1);
		}
		DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
	}

	/* The main event loop; trace callbacks warn about stalls */
	ctdb->ev = tevent_context_init(NULL);
	if (ctdb->ev == NULL) {
		DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
		exit(1);
	}
	tevent_loop_allow_nesting(ctdb->ev);
	ctdb_tevent_trace_init();
	tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);

	/* Reopen logs (here and in recoverd) on SIGHUP */
	status = logging_setup_sighup_handler(ctdb->ev,
					      ctdb,
					      sighup_hook,
					      ctdb);
	if (!status) {
		D_ERR("Failed to set up signal handler for SIGHUP\n");
		exit(1);
	}

	/* set up a handler to pick up sigchld */
	if (ctdb_init_sigchld(ctdb) == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
		exit(1);
	}

	if (!interactive) {
		ctdb_set_child_logging(ctdb);
	}

	/* Exit if stdin is closed */
	if (test_mode_enabled) {
		ret = setup_stdin_handler(ctdb);
		if (ret != 0) {
			DBG_ERR("Failed to setup stdin handler\n");
			exit(1);
		}
	}

	/* srvid contexts for message and tunnel dispatch */
	TALLOC_FREE(ctdb->srv);
	if (srvid_init(ctdb, &ctdb->srv) != 0) {
		DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
		exit(1);
	}

	TALLOC_FREE(ctdb->tunnels);
	if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
		exit(1);
	}

	/* initialize statistics collection */
	ctdb_statistics_init(ctdb);

	/* force initial recovery for election */
	ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;

	if (ctdb_start_eventd(ctdb) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
		exit(1);
	}

	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
	ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
	if (ret != 0) {
		ctdb_die(ctdb, "Failed to run init event\n");
	}
	ctdb_run_notification_script(ctdb, "init");

	/* NOTE(review): if ctdb->transport is neither "tcp" nor "ib",
	 * ret still holds 0 from the init event above; an unknown
	 * transport is only caught by the ctdb->methods NULL check
	 * further down - confirm this is intentional */
	if (strcmp(ctdb->transport, "tcp") == 0) {
		ret = ctdb_tcp_init(ctdb);
	}
#ifdef USE_INFINIBAND
	if (strcmp(ctdb->transport, "ib") == 0) {
		ret = ctdb_ibw_init(ctdb);
	}
#endif
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
		return -1;
	}

	if (ctdb->methods == NULL) {
		DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
		ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
	}

	/* Initialise the transport. This sets the node address if it
	 * was not set via the command-line. */
	if (ctdb->methods->initialise(ctdb) != 0) {
		ctdb_fatal(ctdb, "transport failed to initialise");
	}

	/* The node address is now known, so the PNN can be derived and
	 * the node's initial flags set */
	ctdb_set_my_pnn(ctdb);

	initialise_node_flags(ctdb);

	ret = ctdb_set_public_addresses(ctdb, true);
	if (ret == -1) {
		D_ERR("Unable to setup public IP addresses\n");
		exit(1);
	}

	ctdb_initialise_vnn_map(ctdb);

	/* attach to existing databases */
	if (ctdb_attach_databases(ctdb) != 0) {
		ctdb_fatal(ctdb, "Failed to attach to databases\n");
	}

	/* start frozen, then let the first election sort things out */
	if (!ctdb_blocking_freeze(ctdb)) {
		ctdb_fatal(ctdb, "Failed to get initial freeze\n");
	}

	/* now start accepting clients, only can do this once frozen */
	fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
			    ctdb_accept_client, ctdb);
	if (fde == NULL) {
		ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
	}
	tevent_fd_set_auto_close(fde);

	/* Start the transport */
	if (ctdb->methods->start(ctdb) != 0) {
		DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
		ctdb_fatal(ctdb, "transport failed to start");
	}

	/* Recovery daemon and timed events are started from the
	 * callback, only after the setup event completes
	 * successfully.
	 */
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
	ret = ctdb_event_script_callback(ctdb,
					 ctdb,
					 ctdb_setup_event_callback,
					 ctdb,
					 CTDB_EVENT_SETUP,
					 "%s",
					 "");
	if (ret != 0) {
		DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
		exit(1);
	}

	lockdown_memory(ctdb->valgrinding);

	/* go into a wait loop to allow other nodes to complete */
	tevent_loop_wait(ctdb->ev);

	DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
	exit(1);
}
+
+/*
+ allocate a packet for use in daemon<->daemon communication
+ */
struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
						 TALLOC_CTX *mem_ctx,
						 enum ctdb_operation operation,
						 size_t length, size_t slength,
						 const char *type)
{
	int size;
	struct ctdb_req_header *hdr;

	/* length is the wire length, slength the C struct size; the
	 * allocation is rounded up to the transport alignment */
	length = MAX(length, slength);
	size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);

	if (ctdb->methods == NULL) {
		/* Transport is down (e.g. during shutdown) - no way to
		 * allocate a transport-backed packet */
		DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
			 operation, (unsigned)length));
		return NULL;
	}

	hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
	if (hdr == NULL) {
		DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
			 operation, (unsigned)length));
		return NULL;
	}
	talloc_set_name_const(hdr, type);
	/* Only the struct part is zeroed; the payload area beyond
	 * slength is left for the caller to fill */
	memset(hdr, 0, slength);
	hdr->length = length;
	hdr->operation = operation;
	hdr->ctdb_magic = CTDB_MAGIC;
	hdr->ctdb_version = CTDB_PROTOCOL;
	hdr->generation = ctdb->vnn_map->generation;
	hdr->srcnode = ctdb->pnn;

	return hdr;
}
+
/* One in-flight control issued by a local client: links the client,
 * the original request and - when the destination is a known node -
 * membership of that node's pending_controls list. */
struct daemon_control_state {
	struct daemon_control_state *next, *prev;	/* node->pending_controls links */
	struct ctdb_client *client;	/* local client awaiting the reply */
	struct ctdb_req_control_old *c;	/* original request (talloc child of this) */
	uint32_t reqid;			/* request id echoed back in the reply */
	struct ctdb_node *node;		/* destination node, or NULL if not a valid PNN */
};
+
+/*
+ callback when a control reply comes in
+ */
static void daemon_control_callback(struct ctdb_context *ctdb,
				    int32_t status, TDB_DATA data,
				    const char *errormsg,
				    void *private_data)
{
	struct daemon_control_state *state = talloc_get_type(private_data,
							     struct daemon_control_state);
	struct ctdb_client *client = state->client;
	struct ctdb_reply_control_old *r;
	size_t len;
	int ret;

	/* construct a message to send to the client containing the data */
	len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
	if (errormsg) {
		/* error text travels in the same buffer, appended
		 * directly after the data payload */
		len += strlen(errormsg);
	}
	r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
			       struct ctdb_reply_control_old);
	CTDB_NO_MEMORY_VOID(ctdb, r);

	r->hdr.reqid = state->reqid;
	r->status = status;
	r->datalen = data.dsize;
	r->errorlen = 0;
	memcpy(&r->data[0], data.dptr, data.dsize);
	if (errormsg) {
		r->errorlen = strlen(errormsg);
		memcpy(&r->data[r->datalen], errormsg, r->errorlen);
	}

	/* On successful queueing the state (and the reply packet, a
	 * talloc child of it) is released; if queueing failed the
	 * state is kept and is cleaned up with the client */
	ret = daemon_queue_send(client, &r->hdr);
	if (ret != -1) {
		talloc_free(state);
	}
}
+
+/*
+ fail all pending controls to a disconnected node
+ */
+void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
+{
+ struct daemon_control_state *state;
+ while ((state = node->pending_controls)) {
+ DLIST_REMOVE(node->pending_controls, state);
+ daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
+ "node is disconnected", state);
+ }
+}
+
+/*
+ destroy a daemon_control_state
+ */
+static int daemon_control_destructor(struct daemon_control_state *state)
+{
+ if (state->node) {
+ DLIST_REMOVE(state->node->pending_controls, state);
+ }
+ return 0;
+}
+
/*
  this is called when the ctdb daemon received a ctdb request control
  from a local client over the unix domain socket
 */
static void daemon_request_control_from_client(struct ctdb_client *client,
					       struct ctdb_req_control_old *c)
{
	TDB_DATA data;
	int res;
	struct daemon_control_state *state;
	TALLOC_CTX *tmp_ctx = talloc_new(client);

	/* Resolve CTDB_CURRENT_NODE to our own PNN before routing */
	if (c->hdr.destnode == CTDB_CURRENT_NODE) {
		c->hdr.destnode = client->ctdb->pnn;
	}

	state = talloc(client, struct daemon_control_state);
	CTDB_NO_MEMORY_VOID(client->ctdb, state);

	state->client = client;
	state->c = talloc_steal(state, c);
	state->reqid = c->hdr.reqid;
	/* Track the control on the destination node's pending list so
	 * it can be failed if the node disconnects; an invalid PNN is
	 * left for ctdb_daemon_send_control() to reject */
	if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
		state->node = client->ctdb->nodes[c->hdr.destnode];
		DLIST_ADD(state->node->pending_controls, state);
	} else {
		state->node = NULL;
	}

	talloc_set_destructor(state, daemon_control_destructor);

	/* NOREPLY: no reply will ever be delivered, so move the state
	 * onto tmp_ctx, which is freed at the end of this function */
	if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
		talloc_steal(tmp_ctx, state);
	}

	data.dptr = &c->data[0];
	data.dsize = c->datalen;
	res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
				       c->srvid, c->opcode, client->client_id,
				       c->flags,
				       data, daemon_control_callback,
				       state);
	if (res != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
			 c->hdr.destnode));
	}

	talloc_free(tmp_ctx);
}
+
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+ struct ctdb_req_tunnel_old *c)
+{
+ TDB_DATA data;
+ int ret;
+
+ if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
+ DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
+ c->hdr.destnode));
+ return;
+ }
+
+ ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
+ c->tunnel_id));
+ return;
+ }
+
+ data = (TDB_DATA) {
+ .dsize = c->datalen,
+ .dptr = &c->data[0],
+ };
+
+ ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
+ c->tunnel_id, c->flags, data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
+ c->hdr.destnode));
+ }
+}
+
+/*
+ register a call function
+*/
+int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
+ ctdb_fn_t fn, int id)
+{
+ struct ctdb_registered_call *call;
+ struct ctdb_db_context *ctdb_db;
+
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) {
+ return -1;
+ }
+
+ call = talloc(ctdb_db, struct ctdb_registered_call);
+ call->fn = fn;
+ call->id = id;
+
+ DLIST_ADD(ctdb_db->calls, call);
+ return 0;
+}
+
+
+
/*
  this local messaging handler is ugly, but is needed to prevent
  recursion in ctdb_send_message() when the destination node is the
  same as the source node
 */
struct ctdb_local_message {
	struct ctdb_context *ctdb;
	uint64_t srvid;		/* message id to dispatch locally */
	TDB_DATA data;		/* private copy; dptr is talloc'd off this struct */
};
+
+static void ctdb_local_message_trigger(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_local_message *m = talloc_get_type(
+ private_data, struct ctdb_local_message);
+
+ srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
+ talloc_free(m);
+}
+
/* Queue a message addressed to this node for delivery via a
 * zero-timeout tevent timer, avoiding direct recursion back into the
 * dispatch path. */
static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
{
	struct ctdb_local_message *m;
	m = talloc(ctdb, struct ctdb_local_message);
	CTDB_NO_MEMORY(ctdb, m);

	m->ctdb = ctdb;
	m->srvid = srvid;
	m->data = data;
	/* Private copy - the caller's buffer may be gone by the time
	 * the timer fires */
	m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
	if (m->data.dptr == NULL) {
		/* NOTE(review): a zero-length message could also land
		 * here if talloc_memdup(..., NULL, 0) returns NULL -
		 * confirm whether empty messages are ever sent */
		talloc_free(m);
		return -1;
	}

	/* this needs to be done as an event to prevent recursion */
	tevent_add_timer(ctdb->ev, m, timeval_zero(),
			 ctdb_local_message_trigger, m);
	return 0;
}
+
/*
  send a ctdb message to a node (possibly ourselves)

  Returns 0 on success, -1 if the transport is down or allocation
  fails.
*/
int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
			     uint64_t srvid, TDB_DATA data)
{
	struct ctdb_req_message_old *r;
	int len;

	if (ctdb->methods == NULL) {
		DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
		return -1;
	}

	/* see if this is a message to ourselves */
	if (pnn == ctdb->pnn) {
		/* delivered via a zero-timeout event to avoid recursion */
		return ctdb_local_message(ctdb, srvid, data);
	}

	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
				    struct ctdb_req_message_old);
	CTDB_NO_MEMORY(ctdb, r);

	r->hdr.destnode = pnn;
	r->srvid = srvid;
	r->datalen = data.dsize;
	memcpy(&r->data[0], data.dptr, data.dsize);

	/* queueing copies the packet, so it can be freed right after */
	ctdb_queue_packet(ctdb, &r->hdr);

	talloc_free(r);
	return 0;
}
+
+
+
/* One "death notification" registered by a client: when the client
 * disappears, `data` is broadcast with message id `srvid` (see
 * ctdb_client_notify_destructor()). */
struct ctdb_client_notify_list {
	struct ctdb_client_notify_list *next, *prev;	/* client->notify links */
	struct ctdb_context *ctdb;
	uint64_t srvid;		/* message id to broadcast on client death */
	TDB_DATA data;		/* payload to broadcast (copy owned by this entry) */
};
+
+
+static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
+{
+ int ret;
+
+ DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
+
+ ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
+ }
+
+ return 0;
+}
+
+int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+ struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ struct ctdb_client_notify_list *nl;
+
+ DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
+
+ if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
+ DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+ return -1;
+ }
+
+ if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
+ DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
+ return -1;
+ }
+
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+ return -1;
+ }
+
+ for(nl=client->notify; nl; nl=nl->next) {
+ if (nl->srvid == notify->srvid) {
+ break;
+ }
+ }
+ if (nl != NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
+ return -1;
+ }
+
+ nl = talloc(client, struct ctdb_client_notify_list);
+ CTDB_NO_MEMORY(ctdb, nl);
+ nl->ctdb = ctdb;
+ nl->srvid = notify->srvid;
+ nl->data.dsize = notify->len;
+ nl->data.dptr = talloc_memdup(nl, notify->notify_data,
+ nl->data.dsize);
+ CTDB_NO_MEMORY(ctdb, nl->data.dptr);
+
+ DLIST_ADD(client->notify, nl);
+ talloc_set_destructor(nl, ctdb_client_notify_destructor);
+
+ return 0;
+}
+
+int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+ uint64_t srvid = *(uint64_t *)indata.dptr;
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ struct ctdb_client_notify_list *nl;
+
+ DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+ return -1;
+ }
+
+ for(nl=client->notify; nl; nl=nl->next) {
+ if (nl->srvid == srvid) {
+ break;
+ }
+ }
+ if (nl == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
+ return -1;
+ }
+
+ DLIST_REMOVE(client->notify, nl);
+ talloc_set_destructor(nl, NULL);
+ talloc_free(nl);
+
+ return 0;
+}
+
+struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
+{
+ struct ctdb_client_pid_list *client_pid;
+
+ for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
+ if (client_pid->pid == pid) {
+ return client_pid->client;
+ }
+ }
+ return NULL;
+}
+
+
+/* This control is used by samba when probing if a process (of a samba daemon)
+ exists on the node.
+ Samba does this when it needs/wants to check if a subrecord in one of the
+ databases is still valid, or if it is stale and can be removed.
+ If the node is in unhealthy or stopped state we just kill of the samba
+ process holding this sub-record and return to the calling samba that
+ the process does not exist.
+ This allows us to forcefully recall subrecords registered by samba processes
+ on banned and stopped nodes.
+*/
+int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
+{
+ struct ctdb_client *client;
+
+ client = ctdb_find_client_by_pid(ctdb, pid);
+ if (client == NULL) {
+ return -1;
+ }
+
+ if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
+ DEBUG(DEBUG_NOTICE,
+ ("Killing client with pid:%d on banned/stopped node\n",
+ (int)pid));
+ talloc_free(client);
+ return -1;
+ }
+
+ return kill(pid, 0);
+}
+
+int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
+ TDB_DATA indata)
+{
+ struct ctdb_client_pid_list *client_pid;
+ pid_t pid;
+ uint64_t srvid;
+ int ret;
+
+ pid = *(pid_t *)indata.dptr;
+ srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
+
+ for (client_pid = ctdb->client_pids;
+ client_pid != NULL;
+ client_pid = client_pid->next) {
+ if (client_pid->pid == pid) {
+ ret = srvid_exists(ctdb->srv, srvid,
+ client_pid->client);
+ if (ret == 0) {
+ return 0;
+ }
+ }
+ }
+
+ return -1;
+}
+
+int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+ struct ctdb_node_map_old *node_map = NULL;
+
+ CHECK_CONTROL_DATA_SIZE(0);
+
+ node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
+ if (node_map == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
+ return -1;
+ }
+
+ outdata->dptr = (unsigned char *)node_map;
+ outdata->dsize = talloc_get_size(outdata->dptr);
+
+ return 0;
+}
+
/* Run the ordered daemon shutdown: stop recoverd, keepalives and
 * monitoring, run the "shutdown" event, stop the event daemon, shut
 * the transport down and exit with the given code.  Re-entry while a
 * shutdown is already in progress is a no-op. */
void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
{
	if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
		DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
		return;
	}

	DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
	/* Teardown is ordered: stop things that generate work before
	 * the services they depend on */
	ctdb_stop_recoverd(ctdb);
	ctdb_stop_keepalive(ctdb);
	ctdb_stop_monitoring(ctdb);
	ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
	ctdb_stop_eventd(ctdb);
	if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
		ctdb->methods->shutdown(ctdb);
	}

	DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
	exit(exit_code);
}
+
+/* When forking the main daemon and the child process needs to connect
+ * back to the daemon as a client process, this function can be used
+ * to change the ctdb context from daemon into client mode. The child
+ * process must be created using ctdb_fork() and not fork() -
+ * ctdb_fork() does some necessary housekeeping.
+ */
+int switch_from_server_to_client(struct ctdb_context *ctdb)
+{
+ int ret;
+
+ if (ctdb->daemon.sd != -1) {
+ close(ctdb->daemon.sd);
+ ctdb->daemon.sd = -1;
+ }
+
+ /* get a new event context */
+ ctdb->ev = tevent_context_init(ctdb);
+ if (ctdb->ev == NULL) {
+ DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ctdb->ev);
+
+ /* Connect to main CTDB daemon */
+ ret = ctdb_socket_connect(ctdb);
+ if (ret != 0) {
+ DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
+ return -1;
+ }
+
+ ctdb->can_send_controls = true;
+
+ return 0;
+}
diff --git a/ctdb/server/ctdb_fork.c b/ctdb/server/ctdb_fork.c
new file mode 100644
index 0000000..1065423
--- /dev/null
+++ b/ctdb/server/ctdb_fork.c
@@ -0,0 +1,216 @@
+/*
+ functions to track and manage processes
+
+ Copyright (C) Ronnie Sahlberg 2012
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/wait.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+void ctdb_track_child(struct ctdb_context *ctdb, pid_t pid)
+{
+ char *process;
+
+ /* Only CTDB main daemon should track child processes */
+ if (getpid() != ctdb->ctdbd_pid) {
+ return;
+ }
+
+ process = talloc_asprintf(ctdb->child_processes, "process:%d", (int)pid);
+ trbt_insert32(ctdb->child_processes, pid, process);
+}
+
+/*
+ * This function forks a child process and drops the realtime
+ * scheduler for the child process.
+ */
/* Fork a child process.  In the child: the inherited tevent context
 * and daemon sockets are torn down, realtime scheduling is dropped and
 * 0 is returned.  In the parent: the child is registered for tracking
 * and its pid is returned.  Returns -1 on fork failure. */
pid_t ctdb_fork(struct ctdb_context *ctdb)
{
	pid_t pid;
	struct timeval before;
	double delta_t;

	before = timeval_current();

	pid = fork();
	if (pid == -1) {
		DEBUG(DEBUG_ERR,
		      (__location__ " fork() failed (%s)\n", strerror(errno)));
		return -1;
	}
	if (pid == 0) {
		/* Close the Unix Domain socket and the TCP socket.
		 * This ensures that none of the child processes will
		 * look like the main daemon when it is not running.
		 * tevent needs to be stopped before closing sockets.
		 */
		if (ctdb->ev != NULL) {
			talloc_free(ctdb->ev);
			ctdb->ev = NULL;
		}
		if (ctdb->daemon.sd != -1) {
			close(ctdb->daemon.sd);
			ctdb->daemon.sd = -1;
		}
		if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
			ctdb->methods->shutdown(ctdb);
		}

		/* The child does not need to be realtime */
		if (ctdb->do_setsched) {
			reset_scheduler();
		}
		ctdb->can_send_controls = false;

		return 0;
	}

	/* Parent: warn if fork() itself stalled for a long time */
	delta_t = timeval_elapsed(&before);
	if (delta_t > 3.0) {
		DEBUG(DEBUG_WARNING, ("fork() took %lf seconds\n", delta_t));
	}

	ctdb_track_child(ctdb, pid);
	return pid;
}
+
+/*
+ * vfork + exec
+ */
+pid_t ctdb_vfork_exec(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
+ const char *helper, int helper_argc,
+ const char **helper_argv)
+{
+ pid_t pid;
+ struct timeval before;
+ double delta_t;
+ char **argv;
+ int i;
+
+ argv = talloc_array(mem_ctx, char *, helper_argc + 1);
+ if (argv == NULL) {
+ DEBUG(DEBUG_ERR, ("Memory allocation error\n"));
+ return -1;
+ }
+
+ argv[0] = discard_const(helper);
+ for (i=0; i<helper_argc; i++) {
+ argv[i+1] = discard_const(helper_argv[i]);
+ }
+
+ before = timeval_current();
+
+ pid = vfork();
+ if (pid == -1) {
+ DEBUG(DEBUG_ERR, ("vfork() failed (%s)\n", strerror(errno)));
+ return -1;
+ }
+
+ if (pid == 0) {
+ execv(helper, argv);
+ _exit(1);
+ }
+
+ delta_t = timeval_elapsed(&before);
+ if (delta_t > 3.0) {
+ DEBUG(DEBUG_WARNING, ("vfork() took %lf seconds\n", delta_t));
+ }
+
+ ctdb_track_child(ctdb, pid);
+ return pid;
+}
+
+static void ctdb_sigchld_handler(struct tevent_context *ev,
+ struct tevent_signal *te, int signum, int count,
+ void *dont_care,
+ void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+ int status;
+ pid_t pid = -1;
+
+ while (pid != 0) {
+ pid = waitpid(-1, &status, WNOHANG);
+ if (pid == -1) {
+ DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
+ return;
+ }
+ if (pid > 0) {
+ char *process;
+
+ if (getpid() != ctdb->ctdbd_pid) {
+ continue;
+ }
+
+ process = trbt_lookup32(ctdb->child_processes, pid);
+ if (process == NULL) {
+ DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we didn not spawn with ctdb_fork\n", pid));
+ }
+
+ DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid, process));
+ talloc_free(process);
+ }
+ }
+}
+
+
+struct tevent_signal *
+ctdb_init_sigchld(struct ctdb_context *ctdb)
+{
+ struct tevent_signal *se;
+
+ ctdb->child_processes = trbt_create(ctdb, 0);
+
+ se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0, ctdb_sigchld_handler, ctdb);
+ return se;
+}
+
+int
+ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum)
+{
+ char *process;
+
+ if (signum == 0) {
+ return kill(pid, signum);
+ }
+
+ if (getpid() != ctdb->ctdbd_pid) {
+ return kill(pid, signum);
+ }
+
+ process = trbt_lookup32(ctdb->child_processes, pid);
+ if (process == NULL) {
+ DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process that does not exist\n", pid, signum));
+ return 0;
+ }
+
+ return kill(pid, signum);
+}
diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c
new file mode 100644
index 0000000..06aeacf
--- /dev/null
+++ b/ctdb/server/ctdb_freeze.c
@@ -0,0 +1,923 @@
+/*
+ ctdb freeze handling
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+
+#include "common/rb_tree.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/**
+ * Cancel a transaction on database
+ *
+ * TDB_NOLOCK is toggled around the cancel so tdb does not attempt to
+ * take its own locks (the database is held locked by the freeze
+ * machinery).  Always returns 0 so iteration over all databases
+ * continues even if one cancel fails.
+ */
+static int db_transaction_cancel_handler(struct ctdb_db_context *ctdb_db,
+					 void *private_data)
+{
+	struct tdb_context *tdb = ctdb_db->ltdb->tdb;
+
+	tdb_add_flags(tdb, TDB_NOLOCK);
+	if (tdb_transaction_cancel(tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to cancel transaction for db %s\n",
+				  ctdb_db->db_name));
+	}
+	tdb_remove_flags(tdb, TDB_NOLOCK);
+	return 0;
+}
+
+/**
+ * Start a transaction on database
+ *
+ * private_data points to a bool: when true, a transaction is already
+ * open on this database and is cancelled before the new one is
+ * started.  TDB_NOLOCK is set around the tdb calls so tdb skips its
+ * own locking (the db is held locked by the freeze machinery).
+ * Returns 0 on success, -1 if the transaction cannot be started.
+ */
+static int db_transaction_start_handler(struct ctdb_db_context *ctdb_db,
+					void *private_data)
+{
+	bool freeze_transaction_started = *(bool *)private_data;
+	int ret;
+
+	tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+	if (freeze_transaction_started) {
+		/* Cancel the stale transaction before starting anew */
+		ret = tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,
+			      ("Failed to cancel transaction for db %s\n",
+			       ctdb_db->db_name));
+		}
+	}
+	ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
+	tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to start transaction for db %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Commit a transaction on database
+ *
+ * private_data points to the number of currently healthy nodes, which
+ * is forwarded to ctdb_update_persistent_health() after a successful
+ * commit.  Returns 0 on success, -1 if the commit fails, or the
+ * (non-zero) result of the health update.
+ */
+static int db_transaction_commit_handler(struct ctdb_db_context *ctdb_db,
+					 void *private_data)
+{
+	unsigned int healthy_nodes = *(unsigned int *)private_data;
+	int ret;
+
+	/* TDB_NOLOCK: the db is already held locked while frozen */
+	tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+	ret = tdb_transaction_commit(ctdb_db->ltdb->tdb);
+	tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to commit transaction for db %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	ret = ctdb_update_persistent_health(ctdb_db->ctdb, ctdb_db, NULL,
+					    healthy_nodes);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to update persistent health for db %s\n",
+				  ctdb_db->db_name));
+	}
+	return ret;
+}
+
+/* a list of control requests waiting for db freeze */
+struct ctdb_db_freeze_waiter {
+	struct ctdb_db_freeze_waiter *next, *prev;	/* DLIST links */
+	struct ctdb_context *ctdb;
+	void *private_data;	/* deferred control, or owning freeze handle */
+	int32_t status;		/* -1 while pending, 0 once frozen */
+};
+
+/* a handle to a db freeze lock child process */
+struct ctdb_db_freeze_handle {
+	struct ctdb_db_context *ctdb_db;	/* database being frozen */
+	struct lock_request *lreq;		/* lock holding the db frozen */
+	struct ctdb_db_freeze_waiter *waiters;	/* controls awaiting the freeze */
+};
+
+/**
+ * Called when freeing database freeze handle
+ *
+ * Cancels any transaction still open on the database, resets its
+ * freeze state to CTDB_FREEZE_NONE, clears the invalid-records flag
+ * and releases the lock request that kept the db frozen.
+ */
+static int ctdb_db_freeze_handle_destructor(struct ctdb_db_freeze_handle *h)
+{
+	struct ctdb_db_context *ctdb_db = h->ctdb_db;
+
+	DEBUG(DEBUG_ERR, ("Release freeze handle for db %s\n",
+			  ctdb_db->db_name));
+
+	/* Cancel any pending transactions */
+	if (ctdb_db->freeze_transaction_started) {
+		db_transaction_cancel_handler(ctdb_db, NULL);
+		ctdb_db->freeze_transaction_started = false;
+	}
+	ctdb_db->freeze_mode = CTDB_FREEZE_NONE;
+	ctdb_db->freeze_handle = NULL;
+
+	/* Clear invalid records flag */
+	ctdb_db->invalid_records = false;
+
+	/* Releasing the lock request thaws the database */
+	talloc_free(h->lreq);
+	return 0;
+}
+
+/**
+ * Called when a database is frozen
+ *
+ * Lock completion callback for the per-db freeze.  If invoked while
+ * the db is already FROZEN, the lock child must have died, so the db
+ * is unfrozen by dropping the handle.  A failed lock drops the handle
+ * too.  On success the db transitions to CTDB_FREEZE_FROZEN and every
+ * queued waiter is released with status 0 (freeing a waiter runs its
+ * destructor, which sends the deferred reply).
+ */
+static void ctdb_db_freeze_handler(void *private_data, bool locked)
+{
+	struct ctdb_db_freeze_handle *h = talloc_get_type_abort(
+		private_data, struct ctdb_db_freeze_handle);
+	struct ctdb_db_freeze_waiter *w;
+
+	if (h->ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze db child died - unfreezing\n"));
+		h->ctdb_db->freeze_mode = CTDB_FREEZE_NONE;
+		talloc_free(h);
+		return;
+	}
+
+	if (!locked) {
+		DEBUG(DEBUG_ERR, ("Failed to get db lock for %s\n",
+				  h->ctdb_db->db_name));
+		h->ctdb_db->freeze_mode = CTDB_FREEZE_NONE;
+		talloc_free(h);
+		return;
+	}
+
+	h->ctdb_db->freeze_mode = CTDB_FREEZE_FROZEN;
+
+	/* notify the waiters */
+	while ((w = h->waiters) != NULL) {
+		w->status = 0;
+		DLIST_REMOVE(h->waiters, w);
+		talloc_free(w);
+	}
+}
+
+/**
+ * Start freeze process for a database
+ *
+ * No-op when the db is already frozen or a freeze is already in
+ * flight.  Otherwise stops vacuuming and issues an asynchronous lock
+ * on the whole db; ctdb_db_freeze_handler() completes the transition.
+ * Leaves the db in CTDB_FREEZE_PENDING until the lock is obtained.
+ */
+static void ctdb_start_db_freeze(struct ctdb_db_context *ctdb_db)
+{
+	struct ctdb_db_freeze_handle *h;
+
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		return;
+	}
+
+	if (ctdb_db->freeze_handle != NULL) {
+		/* freeze already in progress */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR, ("Freeze db: %s\n", ctdb_db->db_name));
+
+	/* don't wait for vacuuming to finish */
+	ctdb_stop_vacuuming(ctdb_db->ctdb);
+
+	h = talloc_zero(ctdb_db, struct ctdb_db_freeze_handle);
+	CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h);
+
+	h->ctdb_db = ctdb_db;
+	h->lreq = ctdb_lock_db(h, ctdb_db, false, ctdb_db_freeze_handler, h);
+	CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h->lreq);
+	talloc_set_destructor(h, ctdb_db_freeze_handle_destructor);
+
+	ctdb_db->freeze_handle = h;
+	ctdb_db->freeze_mode = CTDB_FREEZE_PENDING;
+}
+
+/**
+ * Reply to a waiter for db freeze
+ *
+ * Destructor for ctdb_db_freeze_waiter: when the waiter is freed the
+ * deferred control reply carrying the freeze status is sent.
+ */
+static int ctdb_db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w)
+{
+	/* 'c' pointer is talloc_memdup(), so cannot use talloc_get_type */
+	struct ctdb_req_control_old *c =
+		(struct ctdb_req_control_old *)w->private_data;
+
+	ctdb_request_control_reply(w->ctdb, c, NULL, w->status, NULL);
+	return 0;
+}
+
+/**
+ * freeze a database
+ *
+ * Returns 0 straight away when the db is already frozen, -1 for an
+ * unknown db_id.  Otherwise starts the freeze and queues this control
+ * as a waiter on the freeze handle; the reply is sent by the waiter
+ * destructor once the db is frozen (or the handle is torn down), so
+ * *async_reply is set.
+ */
+int32_t ctdb_control_db_freeze(struct ctdb_context *ctdb,
+			       struct ctdb_req_control_old *c,
+			       uint32_t db_id,
+			       bool *async_reply)
+{
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_db_freeze_waiter *w;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, ("Freeze db for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze db: %s frozen\n", ctdb_db->db_name));
+		return 0;
+	}
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	/* add ourselves to the list of waiters */
+	w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter);
+	CTDB_NO_MEMORY(ctdb, w);
+	w->ctdb = ctdb;
+	/* take ownership of the request so the reply can be deferred */
+	w->private_data = talloc_steal(w, c);
+	w->status = -1;
+	talloc_set_destructor(w, ctdb_db_freeze_waiter_destructor);
+	DLIST_ADD(ctdb_db->freeze_handle->waiters, w);
+
+	*async_reply = true;
+	return 0;
+}
+
+/**
+ * Thaw a database
+ *
+ * Frees the freeze handle - its destructor releases the db lock and
+ * cancels any pending transaction - then resends calls that were
+ * queued against the database while it was frozen.  Returns -1 for an
+ * unknown db_id.
+ */
+int32_t ctdb_control_db_thaw(struct ctdb_context *ctdb, uint32_t db_id)
+{
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, ("Thaw db for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR, ("Thaw db: %s generation %u\n", ctdb_db->db_name,
+			  ctdb_db->generation));
+
+	TALLOC_FREE(ctdb_db->freeze_handle);
+	ctdb_call_resend_db(ctdb_db);
+	return 0;
+}
+
+
+/*
+  a list of control requests waiting for a freeze lock child to get
+  the database locks
+ */
+struct ctdb_freeze_waiter {
+	struct ctdb_freeze_waiter *next, *prev;	/* DLIST links */
+	struct ctdb_context *ctdb;
+	struct ctdb_req_control_old *c;		/* control to reply to */
+	int32_t status;				/* -1 while pending, 0 once frozen */
+};
+
+/* a handle to a freeze lock child process */
+struct ctdb_freeze_handle {
+	struct ctdb_context *ctdb;
+	/* per-db progress; eventually num_locked + num_failed == num_total */
+	unsigned int num_total, num_locked, num_failed;
+	struct ctdb_freeze_waiter *waiters;	/* controls awaiting the freeze */
+};
+
+/* ctdb_db_iterator() callback: thaw one database by dropping its
+ * freeze handle (the handle destructor releases the db lock). */
+static int db_thaw(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	TALLOC_FREE(ctdb_db->freeze_handle);
+	return 0;
+}
+
+/*
+  destroy a freeze handle
+
+  Tears down a node-wide freeze: cancels any recovery transaction
+  still open, thaws every database (db_thaw drops the per-db freeze
+  handles) and resets the global freeze state.
+ */
+static int ctdb_freeze_handle_destructor(struct ctdb_freeze_handle *h)
+{
+	struct ctdb_context *ctdb = h->ctdb;
+
+	DEBUG(DEBUG_ERR,("Release freeze handle\n"));
+
+	/* cancel any pending transactions */
+	if (ctdb->freeze_transaction_started) {
+		ctdb_db_iterator(ctdb, db_transaction_cancel_handler, NULL);
+		ctdb->freeze_transaction_started = false;
+	}
+
+	ctdb_db_iterator(ctdb, db_thaw, NULL);
+
+	ctdb->freeze_mode = CTDB_FREEZE_NONE;
+	ctdb->freeze_handle = NULL;
+
+	return 0;
+}
+
+/*
+  called when the child writes its status to us
+
+  Completion callback for the all-databases freeze; 'locked' says
+  whether every database lock was obtained.  Being called while
+  already FROZEN means a lock child died, so everything is unfrozen
+  via the handle destructor.  On success the node becomes FROZEN and
+  all queued waiters are released with status 0 (their destructors
+  send the deferred replies).
+ */
+static void ctdb_freeze_lock_handler(void *private_data, bool locked)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(private_data,
+							     struct ctdb_freeze_handle);
+	struct ctdb_freeze_waiter *w;
+
+	if (h->ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_INFO,("freeze child died - unfreezing\n"));
+		talloc_free(h);
+		return;
+	}
+
+	if (!locked) {
+		DEBUG(DEBUG_ERR,("Failed to get locks in ctdb_freeze_child\n"));
+		/* we didn't get the locks - destroy the handle */
+		talloc_free(h);
+		return;
+	}
+
+	h->ctdb->freeze_mode = CTDB_FREEZE_FROZEN;
+
+	/* notify the waiters */
+	if (h != h->ctdb->freeze_handle) {
+		DEBUG(DEBUG_ERR,("lockwait finished but h is not linked\n"));
+	}
+	while ((w = h->waiters)) {
+		w->status = 0;
+		DLIST_REMOVE(h->waiters, w);
+		talloc_free(w);
+	}
+}
+
+/**
+ * When single database is frozen
+ *
+ * Destructor for the per-db waiters registered by db_freeze():
+ * records the outcome in the global freeze handle and, once every
+ * database has reported, completes the node-wide freeze - which is
+ * successful only when no database failed to lock.
+ */
+static int db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(
+		w->private_data, struct ctdb_freeze_handle);
+
+	if (w->status == 0) {
+		h->num_locked++;
+	} else {
+		h->num_failed++;
+	}
+
+	/* Invoke ctdb_freeze_lock_handler() only when the status of
+	 * every database is known. */
+	if (h->num_locked + h->num_failed == h->num_total) {
+		ctdb_freeze_lock_handler(h, h->num_failed == 0);
+	}
+	return 0;
+}
+
+/**
+ * Invalidate the records in the database.
+ * This only applies to volatile databases.
+ *
+ * ctdb_db_iterator() callback; private_data is unused.
+ */
+static int db_invalidate(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	if (ctdb_db_volatile(ctdb_db)) {
+		ctdb_db->invalid_records = true;
+	}
+
+	return 0;
+}
+
+/**
+ * Count the number of databases
+ *
+ * ctdb_db_iterator() callback: private_data points at the running
+ * counter to bump for each attached database.
+ */
+static int db_count(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	(*(unsigned int *)private_data)++;
+
+	return 0;
+}
+
+/**
+ * Freeze a single database
+ *
+ * ctdb_db_iterator() callback used when freezing all databases.
+ * Starts the per-db freeze and registers a waiter whose destructor
+ * (db_freeze_waiter_destructor) accounts the result in the global
+ * freeze handle.  A database that is already frozen completes its
+ * waiter immediately with status 0.
+ */
+static int db_freeze(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(
+		private_data, struct ctdb_freeze_handle);
+	struct ctdb_db_freeze_waiter *w;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter);
+	CTDB_NO_MEMORY(h->ctdb, w);
+	w->ctdb = h->ctdb;
+	w->private_data = h;
+	w->status = -1;
+	talloc_set_destructor(w, db_freeze_waiter_destructor);
+
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		/* Early return if already frozen */
+		w->status = 0;
+		talloc_free(w);
+		return 0;
+	}
+
+	DLIST_ADD(ctdb_db->freeze_handle->waiters, w);
+
+	return 0;
+}
+
+/*
+  start the freeze process for all databases
+  This is only called from ctdb_control_freeze(), which is called
+  only on node becoming INACTIVE. So mark the records invalid.
+ */
+static void ctdb_start_freeze(struct ctdb_context *ctdb)
+{
+	struct ctdb_freeze_handle *h;
+	int ret;
+
+	/* volatile databases get their records marked invalid */
+	ctdb_db_iterator(ctdb, db_invalidate, NULL);
+
+	if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		unsigned int count = 0;
+
+		/*
+		 * Check if all the databases are frozen
+		 *
+		 * It's possible that the databases can get attached after
+		 * initial freeze. This typically happens during startup as
+		 * CTDB will only attach persistent databases and go in to
+		 * startup freeze. The recovery master during recovery will
+		 * attach all the missing databases.
+		 */
+
+		h = ctdb->freeze_handle;
+		if (h == NULL) {
+			/* inconsistent state: FROZEN but no handle */
+			ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			return;
+		}
+
+		ret = ctdb_db_iterator(ctdb, db_count, &count);
+		if (ret != 0) {
+			TALLOC_FREE(ctdb->freeze_handle);
+			ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			return;
+		}
+
+		if (count != h->num_total) {
+			/* databases were attached since the initial freeze:
+			 * re-run db_freeze over all of them; the ones that
+			 * are already frozen complete immediately */
+			DEBUG(DEBUG_ERR, ("Freeze all: incremental\n"));
+
+			h->num_total = count;
+			h->num_locked = 0;
+			h->num_failed = 0;
+
+			ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+			ret = ctdb_db_iterator(ctdb, db_freeze, h);
+			if (ret != 0) {
+				TALLOC_FREE(ctdb->freeze_handle);
+				ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			}
+		}
+		return;
+	}
+
+	if (ctdb->freeze_handle != NULL) {
+		/* already trying to freeze */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR, ("Freeze all\n"));
+
+	/* Stop any vacuuming going on: we don't want to wait. */
+	ctdb_stop_vacuuming(ctdb);
+
+	/* create freeze lock children for each database */
+	h = talloc_zero(ctdb, struct ctdb_freeze_handle);
+	CTDB_NO_MEMORY_FATAL(ctdb, h);
+	h->ctdb = ctdb;
+	talloc_set_destructor(h, ctdb_freeze_handle_destructor);
+	ctdb->freeze_handle = h;
+
+	ret = ctdb_db_iterator(ctdb, db_count, &h->num_total);
+	if (ret != 0) {
+		talloc_free(h);
+		return;
+	}
+
+	ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+	ret = ctdb_db_iterator(ctdb, db_freeze, h);
+	if (ret != 0) {
+		talloc_free(h);
+		return;
+	}
+
+	/* no databases attached: nothing to wait for */
+	if (h->num_total == 0) {
+		ctdb->freeze_mode = CTDB_FREEZE_FROZEN;
+	}
+}
+
+/*
+  destroy a waiter for a freeze mode change
+
+  Sends the deferred control reply carrying the final freeze status.
+ */
+static int ctdb_freeze_waiter_destructor(struct ctdb_freeze_waiter *w)
+{
+	ctdb_request_control_reply(w->ctdb, w->c, NULL, w->status, NULL);
+	return 0;
+}
+
+/*
+  freeze all the databases
+  This control is only used when freezing database on node becoming INACTIVE.
+  So mark the records invalid in ctdb_start_freeze().
+
+  Returns 0 immediately when already frozen or there are no databases;
+  otherwise queues the control on the freeze handle and sets
+  *async_reply - the waiter destructor sends the reply later.
+ */
+int32_t ctdb_control_freeze(struct ctdb_context *ctdb,
+			    struct ctdb_req_control_old *c, bool *async_reply)
+{
+	struct ctdb_freeze_waiter *w;
+
+	ctdb_start_freeze(ctdb);
+
+	if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze all: frozen\n"));
+		/* we're already frozen */
+		return 0;
+	}
+
+	if (ctdb->freeze_handle == NULL) {
+		DEBUG(DEBUG_ERR,("No freeze lock handle when adding a waiter\n"));
+		return -1;
+	}
+
+	/* If there are no databases, we are done. */
+	if (ctdb->freeze_handle->num_total == 0) {
+		return 0;
+	}
+
+	/* add ourselves to list of waiters */
+	w = talloc(ctdb->freeze_handle, struct ctdb_freeze_waiter);
+	CTDB_NO_MEMORY(ctdb, w);
+	w->ctdb = ctdb;
+	w->c = talloc_steal(w, c);
+	w->status = -1;
+	talloc_set_destructor(w, ctdb_freeze_waiter_destructor);
+	DLIST_ADD(ctdb->freeze_handle->waiters, w);
+
+	/* we won't reply till later */
+	*async_reply = true;
+	return 0;
+}
+
+
+/*
+ * Freeze one database and block, spinning the event loop, until the
+ * freeze either completes or fails.  Returns 0 when frozen, -1 on
+ * failure.  private_data is the tevent context to run.
+ */
+static int db_freeze_block(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct tevent_context *ev = (struct tevent_context *)private_data;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	while (ctdb_db->freeze_mode == CTDB_FREEZE_PENDING) {
+		tevent_loop_once(ev);
+	}
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  block until we are frozen, used during daemon startup
+
+  Runs db_freeze_block() over every attached database; returns true
+  only if every database froze successfully.
+ */
+bool ctdb_blocking_freeze(struct ctdb_context *ctdb)
+{
+	return ctdb_db_iterator(ctdb, db_freeze_block, ctdb->ev) == 0;
+}
+
+/*
+  thaw the databases
+
+  Refused (-1) while recovery is active if check_recmode is set.
+  Cancels any open recovery transactions, thaws every database and
+  resends all calls queued while frozen.
+ */
+int32_t ctdb_control_thaw(struct ctdb_context *ctdb, bool check_recmode)
+{
+	if (check_recmode && ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
+		DEBUG(DEBUG_ERR, ("Failing to thaw databases while "
+				  "recovery is active\n"));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR,("Thawing all\n"));
+
+	/* cancel any pending transactions */
+	if (ctdb->freeze_transaction_started) {
+		ctdb_db_iterator(ctdb, db_transaction_cancel_handler, NULL);
+		ctdb->freeze_transaction_started = false;
+	}
+
+	ctdb_db_iterator(ctdb, db_thaw, NULL);
+	TALLOC_FREE(ctdb->freeze_handle);
+
+	ctdb_call_resend_all(ctdb);
+	return 0;
+}
+
+/**
+ * Database transaction wrappers
+ *
+ * These functions are wrappers around transaction start/cancel/commit handlers.
+ */
+
+struct db_start_transaction_state {
+	uint32_t transaction_id;	/* id to record on the database */
+	bool transaction_started;	/* caller believes a txn may be open */
+};
+
+/*
+ * Start a recovery transaction on a frozen database.  If both the
+ * caller and the database agree a transaction is already open, it is
+ * cancelled first by db_transaction_start_handler().  Returns -1 when
+ * the database is not frozen or the transaction cannot be started.
+ */
+static int db_start_transaction(struct ctdb_db_context *ctdb_db,
+				void *private_data)
+{
+	struct db_start_transaction_state *state =
+		(struct db_start_transaction_state *)private_data;
+	int ret;
+	bool transaction_started;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+		      ("Database %s not frozen, cannot start transaction\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	/* logical && rather than bitwise & - the operands are bools */
+	transaction_started = state->transaction_started &&
+			      ctdb_db->freeze_transaction_started;
+
+	ret = db_transaction_start_handler(ctdb_db,
+					   &transaction_started);
+	if (ret != 0) {
+		return -1;
+	}
+
+	ctdb_db->freeze_transaction_started = true;
+	ctdb_db->freeze_transaction_id = state->transaction_id;
+
+	return 0;
+}
+
+/*
+ * Cancel the recovery transaction on one database and clear the
+ * started flag.  private_data is passed through to the cancel
+ * handler (unused there).
+ */
+static int db_cancel_transaction(struct ctdb_db_context *ctdb_db,
+				 void *private_data)
+{
+	int ret = db_transaction_cancel_handler(ctdb_db, private_data);
+
+	if (ret == 0) {
+		ctdb_db->freeze_transaction_started = false;
+	}
+
+	return ret;
+}
+
+struct db_commit_transaction_state {
+	uint32_t transaction_id;	/* id the open transaction must match */
+	unsigned int healthy_nodes;	/* forwarded to persistent health update */
+};
+
+/*
+ * Commit the recovery transaction on one database.  Fails unless the
+ * database is frozen, a transaction is open and its id matches.  On
+ * success the database generation is set to the transaction id.
+ */
+static int db_commit_transaction(struct ctdb_db_context *ctdb_db,
+				 void *private_data)
+{
+	struct db_commit_transaction_state *state =
+		(struct db_commit_transaction_state *)private_data;
+	int ret;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+		      ("Database %s not frozen, cannot commit transaction\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR, ("Transaction not started on %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_transaction_id != state->transaction_id) {
+		DEBUG(DEBUG_ERR,
+		      ("Incorrect transaction commit id 0x%08x for %s\n",
+		       state->transaction_id, ctdb_db->db_name));
+		return -1;
+	}
+
+	ret = db_transaction_commit_handler(ctdb_db, &state->healthy_nodes);
+	if (ret != 0) {
+		return -1;
+	}
+
+	ctdb_db->freeze_transaction_started = false;
+	ctdb_db->freeze_transaction_id = 0;
+	ctdb_db->generation = state->transaction_id;
+	return 0;
+}
+
+/**
+ * Start a transaction on a database - used for db recovery
+ *
+ * indata carries a struct ctdb_transdb with the db id and the
+ * transaction id.  Returns -1 for an unknown db or if the transaction
+ * cannot be started.
+ */
+int32_t ctdb_control_db_transaction_start(struct ctdb_context *ctdb,
+					  TDB_DATA indata)
+{
+	struct ctdb_transdb *w =
+		(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	struct db_start_transaction_state state;
+
+	ctdb_db = find_ctdb_db(ctdb, w->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction start for unknown dbid 0x%08x\n",
+		       w->db_id));
+		return -1;
+	}
+
+	state.transaction_id = w->tid;
+	state.transaction_started = true;
+
+	return db_start_transaction(ctdb_db, &state);
+}
+
+/**
+ * Cancel a transaction on a database - used for db recovery
+ *
+ * indata carries only the uint32 db id.  Returns -1 for an unknown
+ * db, otherwise the result of db_cancel_transaction().
+ */
+int32_t ctdb_control_db_transaction_cancel(struct ctdb_context *ctdb,
+					   TDB_DATA indata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction cancel for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR, ("Recovery db transaction cancelled for %s\n",
+			  ctdb_db->db_name));
+
+	return db_cancel_transaction(ctdb_db, NULL);
+}
+
+/**
+ * Commit a transaction on a database - used for db recovery
+ *
+ * indata carries a struct ctdb_transdb with db id and transaction id.
+ * Counts the healthy nodes (flags == 0) so the commit handler can
+ * update the database's persistent health record.
+ */
+int32_t ctdb_control_db_transaction_commit(struct ctdb_context *ctdb,
+					   TDB_DATA indata)
+{
+	struct ctdb_transdb *w =
+		(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	struct db_commit_transaction_state state;
+	unsigned int healthy_nodes, i;
+
+	ctdb_db = find_ctdb_db(ctdb, w->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction commit for unknown dbid 0x%08x\n",
+		       w->db_id));
+		return -1;
+	}
+
+	/* a node with no flags set is healthy */
+	healthy_nodes = 0;
+	for (i=0; i < ctdb->num_nodes; i++) {
+		if (ctdb->nodes[i]->flags == 0) {
+			healthy_nodes += 1;
+		}
+	}
+
+	state.transaction_id = w->tid;
+	state.healthy_nodes = healthy_nodes;
+
+	return db_commit_transaction(ctdb_db, &state);
+}
+
+/*
+  wipe a database - only possible when in a frozen transaction
+
+  Requires the db to be frozen with an open transaction whose id
+  matches the request.  For volatile databases the delete and fetch
+  queues are recreated, presumably because queued entries refer to
+  the records just wiped (TODO confirm against vacuuming code).
+ */
+int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_transdb w = *(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, w.db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", w.db_id));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed transaction_start while not frozen\n"));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR,(__location__ " transaction not started\n"));
+		return -1;
+	}
+
+	if (w.tid != ctdb_db->freeze_transaction_id) {
+		DEBUG(DEBUG_ERR,(__location__ " incorrect transaction id 0x%x in commit\n", w.tid));
+		return -1;
+	}
+
+	if (tdb_wipe_all(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database for db '%s'\n",
+				 ctdb_db->db_name));
+		return -1;
+	}
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		/* recreate the per-db record queues from scratch */
+		talloc_free(ctdb_db->delete_queue);
+		talloc_free(ctdb_db->fetch_queue);
+		ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->delete_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+					  "the delete queue.\n"));
+			return -1;
+		}
+		ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->fetch_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+					  "the fetch queue.\n"));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* True only when the database freeze has fully completed (a PENDING
+ * freeze does not count). */
+bool ctdb_db_frozen(struct ctdb_db_context *ctdb_db)
+{
+	return ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN;
+}
+
+/* True only when the node-wide freeze of all databases has completed. */
+bool ctdb_db_all_frozen(struct ctdb_context *ctdb)
+{
+	return ctdb->freeze_mode == CTDB_FREEZE_FROZEN;
+}
+
+/*
+ * Decide whether the database may be accessed.
+ *
+ * An unfrozen database is always accessible.  A frozen database may
+ * only be accessed while a recovery transaction is open on it (needed
+ * during recovery); on an inactive node no transaction is started, so
+ * access is denied.
+ */
+bool ctdb_db_allow_access(struct ctdb_db_context *ctdb_db)
+{
+	return (ctdb_db->freeze_mode == CTDB_FREEZE_NONE) ||
+	       ctdb_db->freeze_transaction_started;
+}
diff --git a/ctdb/server/ctdb_keepalive.c b/ctdb/server/ctdb_keepalive.c
new file mode 100644
index 0000000..9155ade
--- /dev/null
+++ b/ctdb/server/ctdb_keepalive.c
@@ -0,0 +1,234 @@
+/*
+ monitoring links to all other nodes to detect dead nodes
+
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "version.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+
+/*
+ * Version advertised in keepalive packets: (major << 16) | minor,
+ * computed once and cached.  The CTDB_TEST_SAMBA_VERSION environment
+ * variable can override it for testing.
+ */
+static uint32_t keepalive_version(void)
+{
+	static uint32_t version = 0;
+
+	if (version == 0) {
+		const char *t;
+
+		version = (SAMBA_VERSION_MAJOR << 16) | SAMBA_VERSION_MINOR;
+
+		t = getenv("CTDB_TEST_SAMBA_VERSION");
+		if (t != NULL) {
+			char *endp = NULL;
+			long v;
+
+			/* strtol instead of atoi so that trailing junk
+			 * and out-of-range values are rejected, not
+			 * silently truncated */
+			errno = 0;
+			v = strtol(t, &endp, 10);
+			if (errno != 0 || endp == t || *endp != '\0' ||
+			    v <= 0) {
+				DBG_WARNING("Failed to parse env var: %s\n", t);
+			} else {
+				version = (uint32_t)v;
+			}
+		}
+	}
+
+	return version;
+}
+
+/* Whole seconds since the daemon started, as carried in keepalives. */
+static uint32_t keepalive_uptime(struct ctdb_context *ctdb)
+{
+	return tevent_timeval_current().tv_sec - ctdb->ctdbd_start_time.tv_sec;
+}
+
+/*
+  send a keepalive packet to the other node
+
+  Skipped (with an INFO log) when the transport is down.  The packet
+  carries our version and uptime so the receiver can detect
+  mixed-version clusters - see ctdb_request_keepalive().
+*/
+static void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
+{
+	struct ctdb_req_keepalive_old *r;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,
+		      ("Failed to send keepalive. Transport is DOWN\n"));
+		return;
+	}
+
+	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_KEEPALIVE,
+				    sizeof(struct ctdb_req_keepalive_old),
+				    struct ctdb_req_keepalive_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, r);
+	r->hdr.destnode  = destnode;
+	r->hdr.reqid     = 0;
+
+	r->version = keepalive_version();
+	r->uptime = keepalive_uptime(ctdb);
+
+	CTDB_INCREMENT_STAT(ctdb, keepalive_packets_sent);
+
+	ctdb_queue_packet(ctdb, &r->hdr);
+
+	talloc_free(r);
+}
+
+/*
+  see if any nodes are dead
+
+  Periodic timer: for each remote, non-deleted node, count intervals
+  with no received packets (rx_cnt); when the count reaches
+  keepalive_limit the node is declared dead.  A keepalive is sent to
+  every live remote node each interval, and the timer rearms itself.
+ */
+static void ctdb_check_for_dead_nodes(struct tevent_context *ev,
+				      struct tevent_timer *te,
+				      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	unsigned int i;
+
+	/* send a keepalive to all other nodes, unless they are
+	   deleted or this node itself */
+	for (i=0;i<ctdb->num_nodes;i++) {
+		struct ctdb_node *node = ctdb->nodes[i];
+
+		if (node->flags & NODE_FLAGS_DELETED) {
+			continue;
+		}
+
+		if (node->pnn == ctdb->pnn) {
+			continue;
+		}
+
+		if (node->flags & NODE_FLAGS_DISCONNECTED) {
+			/* it might have come alive again */
+			if (node->rx_cnt != 0) {
+				ctdb_node_connected(node);
+			}
+			continue;
+		}
+
+
+		/* no traffic since the last check increments the dead
+		   count; any traffic resets it */
+		if (node->rx_cnt == 0) {
+			node->dead_count++;
+		} else {
+			node->dead_count = 0;
+		}
+
+		node->rx_cnt = 0;
+
+		if (node->dead_count >= ctdb->tunable.keepalive_limit) {
+			DEBUG(DEBUG_NOTICE,("dead count reached for node %u\n", node->pnn));
+			ctdb_node_dead(node);
+			ctdb_send_keepalive(ctdb, node->pnn);
+			/* maybe tell the transport layer to kill the
+			   sockets as well?
+			*/
+			continue;
+		}
+
+		DEBUG(DEBUG_DEBUG,("sending keepalive to %u\n", node->pnn));
+		ctdb_send_keepalive(ctdb, node->pnn);
+
+		node->tx_cnt = 0;
+	}
+
+	/* rearm for the next keepalive interval */
+	tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+			 timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+			 ctdb_check_for_dead_nodes, ctdb);
+}
+
+
+/*
+ * Start periodic keepalive processing: allocate the talloc context
+ * that owns the timer (so ctdb_stop_keepalive() can cancel it) and
+ * schedule the first ctdb_check_for_dead_nodes() run.
+ */
+void ctdb_start_keepalive(struct ctdb_context *ctdb)
+{
+	struct tevent_timer *te;
+
+	ctdb->keepalive_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
+
+	te = tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+			      timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+			      ctdb_check_for_dead_nodes, ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+	DEBUG(DEBUG_NOTICE,("Keepalive monitoring has been started\n"));
+
+	if (ctdb->tunable.allow_mixed_versions == 1) {
+		DEBUG(DEBUG_WARNING,
+		      ("CTDB cluster with mixed versions configured\n"));
+	}
+}
+
+/* Stop keepalive processing: freeing the context cancels the pending
+ * timer; TALLOC_FREE also resets the pointer. */
+void ctdb_stop_keepalive(struct ctdb_context *ctdb)
+{
+	TALLOC_FREE(ctdb->keepalive_ctx);
+}
+
+/*
+ * Process an incoming keepalive packet.
+ *
+ * Unless mixed versions are allowed, verify the sender runs the same
+ * version.  On mismatch exactly one side shuts down: this node shuts
+ * down when its uptime is smaller than the peer's, or - on equal
+ * uptimes - when its version number is smaller.  A keepalive without
+ * version info (old header-only format) also triggers shutdown.
+ */
+void ctdb_request_keepalive(struct ctdb_context *ctdb,
+			    struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_keepalive_old *c =
+		(struct ctdb_req_keepalive_old *)hdr;
+	uint32_t my_version = keepalive_version();
+	uint32_t my_uptime = keepalive_uptime(ctdb);
+
+	/* Don't check anything if mixed versions are allowed */
+	if (ctdb->tunable.allow_mixed_versions == 1) {
+		return;
+	}
+
+	if (hdr->length == sizeof(struct ctdb_req_header)) {
+		/* Old keepalive */
+		goto fail1;
+	}
+
+	if (c->version != my_version) {
+		if (c->uptime > my_uptime) {
+			/* peer has been up longer - we shut down */
+			goto fail2;
+		} else if (c->uptime == my_uptime) {
+			if (c->version > my_version) {
+				/* tie-break: lower version shuts down */
+				goto fail2;
+			}
+		}
+	}
+
+	return;
+
+fail1:
+	DEBUG(DEBUG_ERR,
+	      ("Keepalive version missing from node %u\n", hdr->srcnode));
+	goto shutdown;
+
+fail2:
+	DEBUG(DEBUG_ERR,
+	      ("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
+	       my_version, c->version, hdr->srcnode));
+	goto shutdown;
+
+shutdown:
+	DEBUG(DEBUG_ERR,
+	      ("CTDB Cluster with mixed versions, cannot continue\n"));
+	ctdb_shutdown_sequence(ctdb, 0);
+}
diff --git a/ctdb/server/ctdb_lock.c b/ctdb/server/ctdb_lock.c
new file mode 100644
index 0000000..063ebfa
--- /dev/null
+++ b/ctdb/server/ctdb_lock.c
@@ -0,0 +1,996 @@
+/*
+ ctdb lock handling
+ provide API to do non-blocking locks for single or all databases
+
+ Copyright (C) Amitay Isaacs 2012
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Non-blocking Locking API
+ *
+ * 1. Create a child process to do blocking locks.
+ * 2. Once the locks are obtained, signal parent process via fd.
+ * 3. Invoke registered callback routine with locking status.
+ * 4. If the child process cannot get locks within certain time,
+ * execute an external script to debug.
+ *
+ * ctdb_lock_record() - get a lock on a record
+ * ctdb_lock_db() - get a lock on a DB
+ *
+ * auto_mark - whether to mark/unmark DBs in before/after callback
+ * = false is used for freezing databases for
+ * recovery since the recovery cannot start till
+ * databases are locked on all the nodes.
+ * = true is used for record locks.
+ */
+
+enum lock_type {
+ LOCK_RECORD,
+ LOCK_DB,
+};
+
+static const char * const lock_type_str[] = {
+ "lock_record",
+ "lock_db",
+};
+
+struct lock_request;
+
+/* lock_context is the common part for a lock request */
+/* lock_context is the common part for a lock request */
+struct lock_context {
+	struct lock_context *next, *prev;	/* pending/current list linkage */
+	enum lock_type type;			/* LOCK_RECORD or LOCK_DB */
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	TDB_DATA key;				/* record key (copy); empty for DB locks */
+	uint32_t priority;
+	bool auto_mark;				/* mark/unmark locks around the callback */
+	struct lock_request *request;		/* back-pointer; NULL once delivered */
+	pid_t child;				/* lock-helper pid, -1 if not started */
+	int fd[2];				/* pipe: child writes result byte to fd[1] */
+	struct tevent_fd *tfd;			/* read-side event on fd[0] */
+	struct tevent_timer *ttimer;		/* 10s "lock held too long" debug timer */
+	struct timeval start_time;		/* for latency statistics */
+	uint32_t key_hash;
+	bool can_schedule;
+};
+
+/* lock_request is the client specific part for a lock request */
+/* lock_request is the client specific part for a lock request */
+struct lock_request {
+	struct lock_context *lctx;	/* NULL after lock context is detached */
+	void (*callback)(void *, bool);	/* invoked with (private_data, locked) */
+	void *private_data;
+};
+
+
+/* Invoke handler for every attached database.  Stops at the first
+ * handler that returns non-zero and returns -1 in that case; returns 0
+ * when all databases were visited successfully.
+ */
+int ctdb_db_iterator(struct ctdb_context *ctdb, ctdb_db_handler_t handler,
+		     void *private_data)
+{
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+
+	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+		ret = handler(ctdb_db, private_data);
+		if (ret != 0) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * lock all databases - mark only
+ */
+/* Mark one database as locked in this process without actually taking
+ * the locks (the lock-helper child already holds them).  Marks both the
+ * transaction write lock and the allrecord lock.
+ */
+static int db_lock_mark_handler(struct ctdb_db_context *ctdb_db,
+				void *private_data)
+{
+	/* Local prototype: this tdb function is not in a public header */
+	int tdb_transaction_write_lock_mark(struct tdb_context *);
+
+	DEBUG(DEBUG_INFO, ("marking locked database %s\n", ctdb_db->db_name));
+
+	if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to mark (transaction lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to mark (all lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Public wrapper for db_lock_mark_handler(); refuses to mark a database
+ * that is not frozen, since only frozen databases are helper-locked.
+ */
+int ctdb_lockdb_mark(struct ctdb_db_context *ctdb_db)
+{
+	if (!ctdb_db_frozen(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Attempt to mark database locked when not frozen\n"));
+		return -1;
+	}
+
+	return db_lock_mark_handler(ctdb_db, NULL);
+}
+
+/*
+ * lock all databases - unmark only
+ */
+/* Reverse of db_lock_mark_handler(): clear the in-process "locked"
+ * markers for the transaction write lock and the allrecord lock.
+ */
+static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db,
+				  void *private_data)
+{
+	/* Local prototype: this tdb function is not in a public header */
+	int tdb_transaction_write_lock_unmark(struct tdb_context *);
+
+	DEBUG(DEBUG_INFO, ("unmarking locked database %s\n", ctdb_db->db_name));
+
+	if (tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to unmark (transaction lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to unmark (all lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Public wrapper for db_lock_unmark_handler(); like ctdb_lockdb_mark()
+ * it only operates on frozen databases.
+ */
+int ctdb_lockdb_unmark(struct ctdb_db_context *ctdb_db)
+{
+	if (!ctdb_db_frozen(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Attempt to unmark database locked when not frozen\n"));
+		return -1;
+	}
+
+	return db_lock_unmark_handler(ctdb_db, NULL);
+}
+
+static void ctdb_lock_schedule(struct ctdb_context *ctdb);
+
+/*
+ * Destructor to kill the child locking process
+ */
+/*
+ * Destructor to kill the child locking process
+ *
+ * Also detaches the context from the owning request, removes it from
+ * the pending/current list it is on, updates lock statistics, and kicks
+ * the scheduler so a queued request can take the freed slot.
+ */
+static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
+{
+	if (lock_ctx->request) {
+		lock_ctx->request->lctx = NULL;
+	}
+	if (lock_ctx->child > 0) {
+		/* Active lock: child was forked, context is on "current" */
+		ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGTERM);
+		if (lock_ctx->type == LOCK_RECORD) {
+			DLIST_REMOVE(lock_ctx->ctdb_db->lock_current, lock_ctx);
+		} else {
+			DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx);
+		}
+		if (lock_ctx->ctdb_db->lock_num_current == 0) {
+			ctdb_fatal(NULL, "Lock count is 0 before decrement\n");
+		}
+		lock_ctx->ctdb_db->lock_num_current--;
+		CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+		CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+	} else {
+		/* Not yet scheduled: context is still on "pending" */
+		if (lock_ctx->type == LOCK_RECORD) {
+			DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
+		} else {
+			DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
+		}
+		CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+		CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+	}
+
+	/* A slot may have been freed; try to start another lock child */
+	ctdb_lock_schedule(lock_ctx->ctdb);
+
+	return 0;
+}
+
+
+/*
+ * Destructor to remove lock request
+ */
+/*
+ * Destructor to remove lock request
+ *
+ * Frees the underlying lock context too, unless it was already detached
+ * (lctx == NULL) by process_callbacks() or the context destructor.
+ */
+static int ctdb_lock_request_destructor(struct lock_request *lock_request)
+{
+	if (lock_request->lctx == NULL) {
+		return 0;
+	}
+
+	lock_request->lctx->request = NULL;
+	TALLOC_FREE(lock_request->lctx);
+
+	return 0;
+}
+
+/*
+ * Process all the callbacks waiting for lock
+ *
+ * If lock has failed, callback is executed with locked=false
+ */
+/*
+ * Process all the callbacks waiting for lock
+ *
+ * If lock has failed, callback is executed with locked=false
+ *
+ * For auto_mark requests, the lock is marked in this process before the
+ * callback and unmarked afterwards, then the context is freed (which
+ * terminates the child and releases the real lock).  For non-auto_mark
+ * requests (recovery freezes) the context stays alive so the child keeps
+ * holding the lock.
+ */
+static void process_callbacks(struct lock_context *lock_ctx, bool locked)
+{
+	struct lock_request *request;
+	bool auto_mark = lock_ctx->auto_mark;
+
+	if (auto_mark && locked) {
+		switch (lock_ctx->type) {
+		case LOCK_RECORD:
+			tdb_chainlock_mark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+			break;
+
+		case LOCK_DB:
+			(void)ctdb_lockdb_mark(lock_ctx->ctdb_db);
+			break;
+		}
+	}
+
+	request = lock_ctx->request;
+	if (auto_mark) {
+		/* Since request may be freed in the callback, unset the lock
+		 * context, so request destructor will not free lock context.
+		 */
+		request->lctx = NULL;
+	}
+
+	/* Since request may be freed in the callback, unset the request */
+	lock_ctx->request = NULL;
+
+	request->callback(request->private_data, locked);
+
+	if (!auto_mark) {
+		return;
+	}
+
+	if (locked) {
+		switch (lock_ctx->type) {
+		case LOCK_RECORD:
+			tdb_chainlock_unmark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+			break;
+
+		case LOCK_DB:
+			ctdb_lockdb_unmark(lock_ctx->ctdb_db);
+			break;
+		}
+	}
+
+	/* Freeing the context kills the lock-helper child (destructor) */
+	talloc_free(lock_ctx);
+}
+
+
+/* Map an elapsed lock-acquisition time (seconds) to one of 11 latency
+ * histogram buckets: <1ms, <10ms, <100ms, <1s, then doubling up to
+ * <64s, with bucket 10 for anything slower.
+ */
+static int lock_bucket_id(double t)
+{
+	double ms = 1.e-3, s = 1;
+	int id;
+
+	if (t < 1*ms) {
+		id = 0;
+	} else if (t < 10*ms) {
+		id = 1;
+	} else if (t < 100*ms) {
+		id = 2;
+	} else if (t < 1*s) {
+		id = 3;
+	} else if (t < 2*s) {
+		id = 4;
+	} else if (t < 4*s) {
+		id = 5;
+	} else if (t < 8*s) {
+		id = 6;
+	} else if (t < 16*s) {
+		id = 7;
+	} else if (t < 32*s) {
+		id = 8;
+	} else if (t < 64*s) {
+		id = 9;
+	} else {
+		id = 10;
+	}
+
+	return id;
+}
+
+/*
+ * Callback routine when the required locks are obtained.
+ * Called from parent context
+ */
+/*
+ * Callback routine when the required locks are obtained.
+ * Called from parent context
+ *
+ * Reads the one-byte status written by the lock-helper child (0 means
+ * locked), updates statistics, and hands over to process_callbacks().
+ */
+static void ctdb_lock_handler(struct tevent_context *ev,
+			    struct tevent_fd *tfd,
+			    uint16_t flags,
+			    void *private_data)
+{
+	struct lock_context *lock_ctx;
+	char c;
+	bool locked;
+	double t;
+	int id;
+
+	lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+
+	/* cancel the timeout event */
+	TALLOC_FREE(lock_ctx->ttimer);
+
+	t = timeval_elapsed(&lock_ctx->start_time);
+	id = lock_bucket_id(t);
+
+	/* Read the status from the child process */
+	if (sys_read(lock_ctx->fd[0], &c, 1) != 1) {
+		/* Short read/EOF: treat as lock failure */
+		locked = false;
+	} else {
+		locked = (c == 0 ? true : false);
+	}
+
+	/* Update statistics */
+	CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
+	CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
+
+	if (locked) {
+		CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]);
+		CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db,
+				    lock_type_str[lock_ctx->type], locks.latency,
+				    lock_ctx->start_time);
+
+		CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t);
+		CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]);
+	} else {
+		CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_failed);
+		CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_failed);
+	}
+
+	process_callbacks(lock_ctx, locked);
+}
+
+/* Per-record entry in the lock-contention log hash, used to rate-limit
+ * "unable to get lock" warnings for the same key.
+ */
+struct lock_log_entry {
+	struct db_hash_context *lock_log;	/* owning hash, for self-removal */
+	TDB_DATA key;				/* record key (copy) */
+	unsigned long log_sec;			/* last logged elapsed time */
+	struct tevent_timer *timer;		/* 30s expiry of this entry */
+};
+
+/* db_hash fetch parser: the stored value is a pointer to a
+ * lock_log_entry; validate its size and hand it back via private_data.
+ */
+static int lock_log_fetch_parser(uint8_t *keybuf, size_t keylen,
+				 uint8_t *databuf, size_t datalen,
+				 void *private_data)
+{
+	struct lock_log_entry **entry =
+		(struct lock_log_entry **)private_data;
+
+	if (datalen != sizeof(struct lock_log_entry *)) {
+		return EINVAL;
+	}
+
+	*entry = talloc_get_type_abort(*(void **)databuf,
+				       struct lock_log_entry);
+	return 0;
+}
+
+/* Timer callback: expire a lock-log entry after its 30s window by
+ * deleting it from the hash and freeing it.
+ */
+static void lock_log_cleanup(struct tevent_context *ev,
+			     struct tevent_timer *ttimer,
+			     struct timeval current_time,
+			     void *private_data)
+{
+	struct lock_log_entry *entry = talloc_get_type_abort(
+		private_data, struct lock_log_entry);
+	int ret;
+
+	entry->timer = NULL;
+
+	ret = db_hash_delete(entry->lock_log, entry->key.dptr,
+			     entry->key.dsize);
+	if (ret != 0) {
+		/* Leave the entry allocated; hash still references it */
+		return;
+	}
+	talloc_free(entry);
+}
+
+/* Decide whether to suppress a contention warning for this key.
+ *
+ * Returns true (skip logging) only when an entry for the key exists and
+ * the elapsed time has not grown past the previously logged value.  In
+ * every other case - first sighting, stale/invalid entry, longer wait,
+ * or any internal failure - returns false so the warning is logged.
+ */
+static bool lock_log_skip(struct tevent_context *ev,
+			  struct db_hash_context *lock_log,
+			  TDB_DATA key, unsigned long elapsed_sec)
+{
+	struct lock_log_entry *entry = NULL;
+	int ret;
+
+	ret = db_hash_fetch(lock_log, key.dptr, key.dsize,
+			    lock_log_fetch_parser, &entry);
+	if (ret == ENOENT) {
+
+		/* First warning for this key: record it with a 30s expiry */
+		entry = talloc_zero(lock_log, struct lock_log_entry);
+		if (entry == NULL) {
+			goto fail;
+		}
+
+		entry->lock_log = lock_log;
+
+		entry->key.dptr = talloc_memdup(entry, key.dptr, key.dsize);
+		if (entry->key.dptr == NULL) {
+			talloc_free(entry);
+			goto fail;
+		}
+		entry->key.dsize = key.dsize;
+
+		entry->log_sec = elapsed_sec;
+		entry->timer = tevent_add_timer(ev, entry,
+						timeval_current_ofs(30, 0),
+						lock_log_cleanup, entry);
+		if (entry->timer == NULL) {
+			talloc_free(entry);
+			goto fail;
+		}
+
+		ret = db_hash_add(lock_log, key.dptr, key.dsize,
+				  (uint8_t *)&entry,
+				  sizeof(struct lock_log_entry *));
+		if (ret != 0) {
+			talloc_free(entry);
+			goto fail;
+		}
+
+		return false;
+
+	} else if (ret == EINVAL) {
+
+		/* Corrupt entry: drop it and log this occurrence */
+		ret = db_hash_delete(lock_log, key.dptr, key.dsize);
+		if (ret != 0) {
+			goto fail;
+		}
+
+		return false;
+
+	} else if (ret == 0) {
+
+		/* Already logged at this duration or longer: suppress */
+		if (elapsed_sec <= entry->log_sec) {
+			return true;
+		}
+
+		/* Wait grew longer: log again and restart the 30s expiry */
+		entry->log_sec = elapsed_sec;
+
+		TALLOC_FREE(entry->timer);
+		entry->timer = tevent_add_timer(ev, entry,
+						timeval_current_ofs(30, 0),
+						lock_log_cleanup, entry);
+		if (entry->timer == NULL) {
+			ret = db_hash_delete(lock_log, key.dptr, key.dsize);
+			if (ret != 0) {
+				goto fail;
+			}
+			talloc_free(entry);
+		}
+
+		return false;
+	}
+
+
+fail:
+	/* On any failure, err on the side of logging the warning */
+	return false;
+
+}
+
+/* Build the argv for the external debug_locks.sh helper:
+ * { "debug_locks", <helper-pid>, RECORD|DB, <tdb-path>, MUTEX|FCNTL, NULL }.
+ * Returns NULL on any allocation failure (checked in one pass at the
+ * end); caller frees via the returned talloc array.
+ */
+static const char **debug_locks_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ctx)
+{
+	const char **args = NULL;
+	int tdb_flags;
+	int nargs, i;
+
+	/* Program, lock helper PID, db|record, tdb path, fcntl|mutex, NULL */
+	nargs = 6;
+
+	args = talloc_array(mem_ctx, const char *, nargs);
+	if (args == NULL) {
+		return NULL;
+	}
+
+	args[0] = talloc_strdup(args, "debug_locks");
+	args[1] = talloc_asprintf(args, "%d", lock_ctx->child);
+
+	if (lock_ctx->type == LOCK_RECORD) {
+		args[2] = talloc_strdup(args, "RECORD");
+	} else {
+		args[2] = talloc_strdup(args, "DB");
+	}
+
+	args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+
+	tdb_flags = tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb);
+	if (tdb_flags & TDB_MUTEX_LOCKING) {
+		args[4] = talloc_strdup(args, "MUTEX");
+	} else {
+		args[4] = talloc_strdup(args, "FCNTL");
+	}
+
+	args[5] = NULL;
+
+	/* Single NULL-check pass for all the allocations above */
+	for (i=0; i<nargs-1; i++) {
+		if (args[i] == NULL) {
+			talloc_free(args);
+			return NULL;
+		}
+	}
+
+	return args;
+}
+
+/*
+ * Callback routine when required locks are not obtained within timeout
+ * Called from parent context
+ */
+/*
+ * Callback routine when required locks are not obtained within timeout
+ * Called from parent context
+ *
+ * Logs a warning (rate-limited per key for record locks), optionally
+ * spawns the debug_locks.sh script to capture lock-holder state, and
+ * re-arms itself for another 10 seconds.
+ */
+static void ctdb_lock_timeout_handler(struct tevent_context *ev,
+				    struct tevent_timer *ttimer,
+				    struct timeval current_time,
+				    void *private_data)
+{
+	static char debug_locks[PATH_MAX+1] = "";
+	struct lock_context *lock_ctx;
+	struct ctdb_context *ctdb;
+	pid_t pid;
+	double elapsed_time;
+	bool skip;
+	char *keystr;
+	const char **args;
+
+	lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+	ctdb = lock_ctx->ctdb;
+
+	elapsed_time = timeval_elapsed(&lock_ctx->start_time);
+
+	/* For database locks, always log */
+	if (lock_ctx->type == LOCK_DB) {
+		DEBUG(DEBUG_WARNING,
+		      ("Unable to get DB lock on database %s for "
+		       "%.0lf seconds\n",
+		       lock_ctx->ctdb_db->db_name, elapsed_time));
+		goto lock_debug;
+	}
+
+	/* For record locks, check if we have already logged */
+	skip = lock_log_skip(ev, lock_ctx->ctdb_db->lock_log,
+			     lock_ctx->key, (unsigned long)elapsed_time);
+	if (skip) {
+		goto skip_lock_debug;
+	}
+
+	keystr = hex_encode_talloc(lock_ctx, lock_ctx->key.dptr,
+				   lock_ctx->key.dsize);
+	DEBUG(DEBUG_WARNING,
+	      ("Unable to get RECORD lock on database %s for %.0lf seconds"
+	       " (key %s)\n",
+	       lock_ctx->ctdb_db->db_name, elapsed_time,
+	       keystr ? keystr : ""));
+	TALLOC_FREE(keystr);
+
+	/* If a node stopped/banned, don't spam the logs */
+	if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
+		goto skip_lock_debug;
+	}
+
+lock_debug:
+
+	/* Run debug_locks.sh (or $CTDB_DEBUG_LOCKS) to dump lock state */
+	if (ctdb_set_helper("lock debugging helper",
+			    debug_locks, sizeof(debug_locks),
+			    "CTDB_DEBUG_LOCKS",
+			    getenv("CTDB_BASE"), "debug_locks.sh")) {
+		args = debug_locks_args(lock_ctx, lock_ctx);
+		if (args != NULL) {
+			pid = vfork();
+			if (pid == 0) {
+				execvp(debug_locks, discard_const(args));
+				_exit(0);
+			}
+			talloc_free(args);
+			ctdb_track_child(ctdb, pid);
+		} else {
+			D_WARNING("No memory for debug locks args\n");
+		}
+	} else {
+		DEBUG(DEBUG_WARNING,
+		      (__location__
+		       " Unable to setup lock debugging\n"));
+	}
+
+skip_lock_debug:
+
+	/* reset the timeout timer */
+	// talloc_free(lock_ctx->ttimer);
+	lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+					    lock_ctx,
+					    timeval_current_ofs(10, 0),
+					    ctdb_lock_timeout_handler,
+					    (void *)lock_ctx);
+}
+
+/* Build argv for the ctdb_lock_helper child:
+ *   RECORD: <ctdbd-pid> <fd> RECORD <db-path> <tdb-flags-hex> <hex-key|NULL>
+ *   DB:     <ctdbd-pid> <fd> DB <db-path> <tdb-flags-hex>
+ * (argv[0] is added later by ctdb_vfork_exec).  Returns false on any
+ * allocation failure, freeing the partially built array.
+ */
+static bool lock_helper_args(TALLOC_CTX *mem_ctx,
+			     struct lock_context *lock_ctx, int fd,
+			     int *argc, const char ***argv)
+{
+	const char **args = NULL;
+	int nargs = 0, i;
+
+	switch (lock_ctx->type) {
+	case LOCK_RECORD:
+		nargs = 6;
+		break;
+
+	case LOCK_DB:
+		nargs = 5;
+		break;
+	}
+
+	/* Add extra argument for null termination */
+	nargs++;
+
+	args = talloc_array(mem_ctx, const char *, nargs);
+	if (args == NULL) {
+		return false;
+	}
+
+	args[0] = talloc_asprintf(args, "%d", getpid());
+	args[1] = talloc_asprintf(args, "%d", fd);
+
+	switch (lock_ctx->type) {
+	case LOCK_RECORD:
+		args[2] = talloc_strdup(args, "RECORD");
+		args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+		args[4] = talloc_asprintf(args, "0x%x",
+					  tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb));
+		if (lock_ctx->key.dsize == 0) {
+			args[5] = talloc_strdup(args, "NULL");
+		} else {
+			args[5] = hex_encode_talloc(args, lock_ctx->key.dptr, lock_ctx->key.dsize);
+		}
+		break;
+
+	case LOCK_DB:
+		args[2] = talloc_strdup(args, "DB");
+		args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+		args[4] = talloc_asprintf(args, "0x%x",
+					  tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb));
+		break;
+	}
+
+	/* Make sure last argument is NULL */
+	args[nargs-1] = NULL;
+
+	/* Single NULL-check pass for all the allocations above */
+	for (i=0; i<nargs-1; i++) {
+		if (args[i] == NULL) {
+			talloc_free(args);
+			return false;
+		}
+	}
+
+	*argc = nargs;
+	*argv = args;
+	return true;
+}
+
+/*
+ * Find a lock request that can be scheduled
+ */
+/*
+ * Find a lock request that can be scheduled
+ *
+ * DB-lock requests (ctdb->lock_pending) take priority over per-database
+ * record locks, and databases already running lock_processes_per_db
+ * children are passed over.  Contexts whose request was cancelled are
+ * garbage-collected along the way.
+ */
+static struct lock_context *ctdb_find_lock_context(struct ctdb_context *ctdb)
+{
+	struct lock_context *lock_ctx, *next_ctx;
+	struct ctdb_db_context *ctdb_db;
+
+	/* First check if there are database lock requests */
+
+	for (lock_ctx = ctdb->lock_pending; lock_ctx != NULL;
+	     lock_ctx = next_ctx) {
+
+		if (lock_ctx->request != NULL) {
+			/* Found a lock context with a request */
+			return lock_ctx;
+		}
+
+		next_ctx = lock_ctx->next;
+
+		DEBUG(DEBUG_INFO, ("Removing lock context without lock "
+				   "request\n"));
+		DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+		CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
+		CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+		talloc_free(lock_ctx);
+	}
+
+	/* Next check database queues */
+	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+		if (ctdb_db->lock_num_current ==
+		    ctdb->tunable.lock_processes_per_db) {
+			/* This database is at its concurrency limit */
+			continue;
+		}
+
+		for (lock_ctx = ctdb_db->lock_pending; lock_ctx != NULL;
+		     lock_ctx = next_ctx) {
+
+			next_ctx = lock_ctx->next;
+
+			if (lock_ctx->request != NULL) {
+				return lock_ctx;
+			}
+
+			DEBUG(DEBUG_INFO, ("Removing lock context without "
+					   "lock request\n"));
+			DLIST_REMOVE(ctdb_db->lock_pending, lock_ctx);
+			CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
+			CTDB_DECREMENT_DB_STAT(ctdb_db, locks.num_pending);
+			talloc_free(lock_ctx);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Schedule a new lock child process
+ * Set up callback handler and timeout handler
+ */
+/*
+ * Schedule a new lock child process
+ * Set up callback handler and timeout handler
+ *
+ * Picks one schedulable context, forks the ctdb_lock_helper with a
+ * result pipe, arms the 10s contention-debug timer and the pipe-read
+ * event, and moves the context from the pending to the current list.
+ * All failure paths clean up the pipe and leave the context pending.
+ */
+static void ctdb_lock_schedule(struct ctdb_context *ctdb)
+{
+	struct lock_context *lock_ctx;
+	int ret, argc;
+	TALLOC_CTX *tmp_ctx;
+	static char prog[PATH_MAX+1] = "";
+	const char **args;
+
+	if (!ctdb_set_helper("lock helper",
+			     prog, sizeof(prog),
+			     "CTDB_LOCK_HELPER",
+			     CTDB_HELPER_BINDIR, "ctdb_lock_helper")) {
+		ctdb_die(ctdb, __location__
+			 " Unable to set lock helper\n");
+	}
+
+	/* Find a lock context with requests */
+	lock_ctx = ctdb_find_lock_context(ctdb);
+	if (lock_ctx == NULL) {
+		return;
+	}
+
+	lock_ctx->child = -1;
+	ret = pipe(lock_ctx->fd);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to create pipe in ctdb_lock_schedule\n"));
+		return;
+	}
+
+	/* Only the write end (fd[1]) is inherited by the helper */
+	set_close_on_exec(lock_ctx->fd[0]);
+
+	/* Create data for child process */
+	tmp_ctx = talloc_new(lock_ctx);
+	if (tmp_ctx == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate memory for helper args\n"));
+		close(lock_ctx->fd[0]);
+		close(lock_ctx->fd[1]);
+		return;
+	}
+
+	/* Tell the helper not to switch to real-time scheduling */
+	if (! ctdb->do_setsched) {
+		ret = setenv("CTDB_NOSETSCHED", "1", 1);
+		if (ret != 0) {
+			DEBUG(DEBUG_WARNING,
+			      ("Failed to set CTDB_NOSETSCHED variable\n"));
+		}
+	}
+
+	/* Create arguments for lock helper */
+	if (!lock_helper_args(tmp_ctx, lock_ctx, lock_ctx->fd[1],
+			      &argc, &args)) {
+		DEBUG(DEBUG_ERR, ("Failed to create lock helper args\n"));
+		close(lock_ctx->fd[0]);
+		close(lock_ctx->fd[1]);
+		talloc_free(tmp_ctx);
+		return;
+	}
+
+	lock_ctx->child = ctdb_vfork_exec(lock_ctx, ctdb, prog, argc,
+					  (const char **)args);
+	if (lock_ctx->child == -1) {
+		DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n"));
+		close(lock_ctx->fd[0]);
+		close(lock_ctx->fd[1]);
+		talloc_free(tmp_ctx);
+		return;
+	}
+
+	/* Parent process */
+	close(lock_ctx->fd[1]);
+
+	talloc_free(tmp_ctx);
+
+	/* Set up timeout handler */
+	lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+					    lock_ctx,
+					    timeval_current_ofs(10, 0),
+					    ctdb_lock_timeout_handler,
+					    (void *)lock_ctx);
+	if (lock_ctx->ttimer == NULL) {
+		ctdb_kill(ctdb, lock_ctx->child, SIGTERM);
+		lock_ctx->child = -1;
+		close(lock_ctx->fd[0]);
+		return;
+	}
+
+	/* Set up callback */
+	lock_ctx->tfd = tevent_add_fd(ctdb->ev,
+				      lock_ctx,
+				      lock_ctx->fd[0],
+				      TEVENT_FD_READ,
+				      ctdb_lock_handler,
+				      (void *)lock_ctx);
+	if (lock_ctx->tfd == NULL) {
+		TALLOC_FREE(lock_ctx->ttimer);
+		ctdb_kill(ctdb, lock_ctx->child, SIGTERM);
+		lock_ctx->child = -1;
+		close(lock_ctx->fd[0]);
+		return;
+	}
+	tevent_fd_set_auto_close(lock_ctx->tfd);
+
+	/* Move the context from pending to current */
+	if (lock_ctx->type == LOCK_RECORD) {
+		DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
+		DLIST_ADD_END(lock_ctx->ctdb_db->lock_current, lock_ctx);
+	} else {
+		DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+		DLIST_ADD_END(ctdb->lock_current, lock_ctx);
+	}
+	CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+	CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+	lock_ctx->ctdb_db->lock_num_current++;
+	CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+	CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+}
+
+
+/*
+ * Lock record / db depending on type
+ */
+/*
+ * Lock record / db depending on type
+ *
+ * Allocates a lock_context (owned by ctdb) plus a lock_request (owned
+ * by the caller's mem_ctx), queues the context on the appropriate
+ * pending list, and kicks the scheduler.  Freeing the returned request
+ * cancels the lock via the destructors.  Returns NULL on error or when
+ * no callback is supplied.
+ */
+static struct lock_request *ctdb_lock_internal(TALLOC_CTX *mem_ctx,
+					       struct ctdb_context *ctdb,
+					       struct ctdb_db_context *ctdb_db,
+					       TDB_DATA key,
+					       uint32_t priority,
+					       void (*callback)(void *, bool),
+					       void *private_data,
+					       enum lock_type type,
+					       bool auto_mark)
+{
+	struct lock_context *lock_ctx = NULL;
+	struct lock_request *request;
+
+	if (callback == NULL) {
+		DEBUG(DEBUG_WARNING, ("No callback function specified, not locking\n"));
+		return NULL;
+	}
+
+	lock_ctx = talloc_zero(ctdb, struct lock_context);
+	if (lock_ctx == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+		return NULL;
+	}
+
+	if ((request = talloc_zero(mem_ctx, struct lock_request)) == NULL) {
+		talloc_free(lock_ctx);
+		return NULL;
+	}
+
+	lock_ctx->type = type;
+	lock_ctx->ctdb = ctdb;
+	lock_ctx->ctdb_db = ctdb_db;
+	lock_ctx->key.dsize = key.dsize;
+	if (key.dsize > 0) {
+		/* Copy the key; caller's buffer may not outlive us */
+		lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
+		if (lock_ctx->key.dptr == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
+			talloc_free(lock_ctx);
+			talloc_free(request);
+			return NULL;
+		}
+		lock_ctx->key_hash = ctdb_hash(&key);
+	} else {
+		lock_ctx->key.dptr = NULL;
+	}
+	lock_ctx->priority = priority;
+	lock_ctx->auto_mark = auto_mark;
+
+	lock_ctx->request = request;
+	lock_ctx->child = -1;
+
+	/* Non-record locks are required by recovery and should be scheduled
+	 * immediately, so keep them at the head of the pending queue.
+	 */
+	if (lock_ctx->type == LOCK_RECORD) {
+		DLIST_ADD_END(ctdb_db->lock_pending, lock_ctx);
+	} else {
+		DLIST_ADD_END(ctdb->lock_pending, lock_ctx);
+	}
+	CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
+	if (ctdb_db) {
+		CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
+	}
+
+	/* Start the timer when we activate the context */
+	lock_ctx->start_time = timeval_current();
+
+	request->lctx = lock_ctx;
+	request->callback = callback;
+	request->private_data = private_data;
+
+	talloc_set_destructor(request, ctdb_lock_request_destructor);
+	talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor);
+
+	ctdb_lock_schedule(ctdb);
+
+	return request;
+}
+
+
+/*
+ * obtain a lock on a record in a database
+ */
+/*
+ * obtain a lock on a record in a database
+ *
+ * Thin wrapper around ctdb_lock_internal() with type LOCK_RECORD and
+ * priority 0; callback is invoked with (private_data, locked).
+ */
+struct lock_request *ctdb_lock_record(TALLOC_CTX *mem_ctx,
+				      struct ctdb_db_context *ctdb_db,
+				      TDB_DATA key,
+				      bool auto_mark,
+				      void (*callback)(void *, bool),
+				      void *private_data)
+{
+	return ctdb_lock_internal(mem_ctx,
+				  ctdb_db->ctdb,
+				  ctdb_db,
+				  key,
+				  0,
+				  callback,
+				  private_data,
+				  LOCK_RECORD,
+				  auto_mark);
+}
+
+
+/*
+ * obtain a lock on a database
+ */
+/*
+ * obtain a lock on a database
+ *
+ * Thin wrapper around ctdb_lock_internal() with type LOCK_DB and an
+ * empty key (tdb_null).
+ */
+struct lock_request *ctdb_lock_db(TALLOC_CTX *mem_ctx,
+				  struct ctdb_db_context *ctdb_db,
+				  bool auto_mark,
+				  void (*callback)(void *, bool),
+				  void *private_data)
+{
+	return ctdb_lock_internal(mem_ctx,
+				  ctdb_db->ctdb,
+				  ctdb_db,
+				  tdb_null,
+				  0,
+				  callback,
+				  private_data,
+				  LOCK_DB,
+				  auto_mark);
+}
diff --git a/ctdb/server/ctdb_lock_helper.c b/ctdb/server/ctdb_lock_helper.c
new file mode 100644
index 0000000..51d2992
--- /dev/null
+++ b/ctdb/server/ctdb_lock_helper.c
@@ -0,0 +1,350 @@
+/*
+ ctdb lock helper
+
+ Copyright (C) Amitay Isaacs 2013
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/util/sys_rw.h"
+#include "lib/util/tevent_unix.h"
+
+#include "protocol/protocol.h"
+
+#include "common/system.h"
+
+static bool realtime = true;
+
+/* State shared between the lock routines, the SIGTERM handler and
+ * cleanup(): the open tdb and the chain key (dsize==0 for a DB lock).
+ */
+struct lock_state {
+	struct tdb_context *tdb;
+	TDB_DATA key;
+};
+
+/* Switch to real-time scheduling while blocking on a lock, unless
+ * CTDB_NOSETSCHED is set.  Records the outcome in the file-scope
+ * 'realtime' flag so reset_priority() knows whether to undo it.
+ */
+static void set_priority(void)
+{
+	const char *ptr;
+
+	ptr = getenv("CTDB_NOSETSCHED");
+	if (ptr != NULL) {
+		realtime = false;
+	}
+
+	if (! realtime) {
+		return;
+	}
+
+	realtime = set_scheduler();
+	if (! realtime) {
+		fprintf(stderr,
+			"locking: Unable to set real-time scheduler priority\n");
+	}
+}
+
+/* Drop back to normal scheduling if set_priority() succeeded */
+static void reset_priority(void)
+{
+	if (realtime) {
+		reset_scheduler();
+	}
+}
+
+/* Report the lock result (0 = locked) to the parent over the pipe.
+ * On failure (result == 1) the helper exits immediately; on success it
+ * stays alive to keep holding the lock.
+ */
+static void send_result(int fd, char result)
+{
+	sys_write(fd, &result, 1);
+	if (result == 1) {
+		exit(1);
+	}
+}
+
+
+/* Print command-line usage for both lock modes to stderr */
+static void usage(const char *progname)
+{
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Usage: %s <ctdbd-pid> <output-fd> RECORD <db-path> <db-flags> <db-key>\n", progname);
+	fprintf(stderr, "       %s <ctdbd-pid> <output-fd> DB <db-path> <db-flags>\n", progname);
+}
+
+/* Decode a hex string (as produced by hex_encode_talloc in ctdbd) into
+ * a freshly talloc'd byte buffer; *len receives the byte count.
+ * NOTE(review): the talloc_array result is not NULL-checked and the
+ * sscanf return is ignored -- input comes only from ctdbd, which always
+ * passes well-formed hex, but an OOM here would dereference NULL.
+ */
+static uint8_t *hex_decode_talloc(TALLOC_CTX *mem_ctx,
+				  const char *hex_in, size_t *len)
+{
+	unsigned int i;
+	int num;
+	uint8_t *buffer;
+
+	*len = strlen(hex_in) / 2;
+	buffer = talloc_array(mem_ctx, unsigned char, *len);
+
+	for (i=0; i<*len; i++) {
+		sscanf(&hex_in[i*2], "%02X", &num);
+		buffer[i] = (uint8_t)num;
+	}
+
+	return buffer;
+}
+
+/* Open the tdb and take a blocking chainlock on the given record.
+ * Returns 0 on success (lock is held until process exit / cleanup()),
+ * 1 on failure.  The dbkey argument is hex, or "NULL" for an empty key.
+ */
+static int lock_record(const char *dbpath, const char *dbflags,
+		       const char *dbkey, struct lock_state *state)
+{
+	int tdb_flags;
+
+	/* No error checking since CTDB always passes sane values */
+	tdb_flags = strtol(dbflags, NULL, 0);
+
+	/* Convert hex key to key */
+	if (strcmp(dbkey, "NULL") == 0) {
+		state->key.dptr = NULL;
+		state->key.dsize = 0;
+	} else {
+		state->key.dptr = hex_decode_talloc(NULL, dbkey,
+						    &state->key.dsize);
+	}
+
+	state->tdb = tdb_open(dbpath, 0, tdb_flags, O_RDWR, 0600);
+	if (state->tdb == NULL) {
+		fprintf(stderr, "locking: Error opening database %s\n", dbpath);
+		return 1;
+	}
+
+	/* Real-time priority only while blocked on the lock */
+	set_priority();
+
+	if (tdb_chainlock(state->tdb, state->key) < 0) {
+		fprintf(stderr, "locking: Error getting record lock (%s)\n",
+			tdb_errorstr(state->tdb));
+		return 1;
+	}
+
+	reset_priority();
+
+	return 0;
+
+}
+
+/* Open the tdb and take a blocking whole-database lock (tdb_lockall).
+ * Returns 0 on success (lock is held until process exit / cleanup()),
+ * 1 on failure.
+ */
+static int lock_db(const char *dbpath, const char *dbflags,
+		   struct lock_state *state)
+{
+	int tdb_flags;
+
+	/* No error checking since CTDB always passes sane values */
+	tdb_flags = strtol(dbflags, NULL, 0);
+
+	state->tdb = tdb_open(dbpath, 0, tdb_flags, O_RDWR, 0600);
+	if (state->tdb == NULL) {
+		fprintf(stderr, "locking: Error opening database %s\n", dbpath);
+		return 1;
+	}
+
+	/* Real-time priority only while blocked on the lock */
+	set_priority();
+
+	if (tdb_lockall(state->tdb) < 0) {
+		fprintf(stderr, "locking: Error getting db lock (%s)\n",
+			tdb_errorstr(state->tdb));
+		return 1;
+	}
+
+	reset_priority();
+
+	return 0;
+}
+
+struct wait_for_parent_state {
+ struct tevent_context *ev;
+ pid_t ppid;
+};
+
+static void wait_for_parent_check(struct tevent_req *subreq);
+
+/* Async request that completes when the parent (ppid) dies.  Polls with
+ * kill(ppid, 0) every 5 seconds; completes immediately if ppid is 1
+ * (the parent is already gone and we were reparented to init).
+ */
+static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx,
+					       struct tevent_context *ev,
+					       pid_t ppid)
+{
+	struct tevent_req *req, *subreq;
+	struct wait_for_parent_state *state;
+
+	req = tevent_req_create(mem_ctx, &state, struct wait_for_parent_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->ppid = ppid;
+
+	if (ppid == 1) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	subreq = tevent_wakeup_send(state, ev,
+				    tevent_timeval_current_ofs(5,0));
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, wait_for_parent_check, req);
+
+	return req;
+}
+
+/* 5-second tick: complete the request if the parent has exited
+ * (kill(ppid, 0) fails with ESRCH), otherwise re-arm the wakeup.
+ */
+static void wait_for_parent_check(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct wait_for_parent_state *state = tevent_req_data(
+		req, struct wait_for_parent_state);
+	bool status;
+
+	status = tevent_wakeup_recv(subreq);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		/* Ignore error */
+		fprintf(stderr, "locking: tevent_wakeup_recv() failed\n");
+	}
+
+	if (kill(state->ppid, 0) == -1 && errno == ESRCH) {
+		tevent_req_done(req);
+		return;
+	}
+
+	subreq = tevent_wakeup_send(state, state->ev,
+				    tevent_timeval_current_ofs(5,0));
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, wait_for_parent_check, req);
+}
+
+/* Standard tevent recv: true on clean completion, false with *perr set
+ * on unix error.
+ */
+static bool wait_for_parent_recv(struct tevent_req *req, int *perr)
+{
+	if (tevent_req_is_unix_error(req, perr)) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Release whichever lock is held and close the tdb.  A zero-size key
+ * means lock_db() took an allrecord lock; otherwise a chainlock.
+ */
+static void cleanup(struct lock_state *state)
+{
+	if (state->tdb != NULL) {
+		if (state->key.dsize == 0) {
+			tdb_unlockall(state->tdb);
+		} else {
+			tdb_chainunlock(state->tdb, state->key);
+		}
+		tdb_close(state->tdb);
+	}
+}
+
+/* SIGTERM handler (tevent-driven, so runs in normal context): release
+ * locks cleanly and exit.  ctdbd sends SIGTERM to cancel a lock child.
+ */
+static void signal_handler(struct tevent_context *ev,
+			   struct tevent_signal *se,
+			   int signum, int count, void *siginfo,
+			   void *private_data)
+{
+	struct lock_state *state = (struct lock_state *)private_data;
+
+	cleanup(state);
+	exit(0);
+}
+
+/* Lock-helper entry point.  Parses <ctdbd-pid> <output-fd> RECORD|DB
+ * plus mode-specific arguments, takes the blocking lock, writes a
+ * one-byte result to output-fd, then holds the lock until either the
+ * parent ctdbd exits (polled) or SIGTERM arrives.
+ */
+int main(int argc, char *argv[])
+{
+	struct tevent_context *ev;
+	struct tevent_signal *se;
+	struct tevent_req *req;
+	struct lock_state state = { 0 };
+	int write_fd;
+	char result = 0;
+	int ppid;
+	const char *lock_type;
+	bool status;
+	int err;
+
+	/* Start from normal scheduling regardless of inherited policy */
+	reset_scheduler();
+
+	if (argc < 4) {
+		usage(argv[0]);
+		exit(1);
+	}
+
+	ppid = atoi(argv[1]);
+	write_fd = atoi(argv[2]);
+	lock_type = argv[3];
+
+	ev = tevent_context_init(NULL);
+	if (ev == NULL) {
+		fprintf(stderr, "locking: tevent_context_init() failed\n");
+		exit(1);
+	}
+
+	/* Release locks cleanly when ctdbd cancels us */
+	se = tevent_add_signal(ev, ev, SIGTERM, 0,
+			       signal_handler, &state);
+	if (se == NULL) {
+		fprintf(stderr, "locking: tevent_add_signal() failed\n");
+		talloc_free(ev);
+		exit(1);
+	}
+
+	if (strcmp(lock_type, "RECORD") == 0) {
+		if (argc != 7) {
+			fprintf(stderr,
+				"locking: Invalid number of arguments (%d)\n",
+				argc);
+			usage(argv[0]);
+			exit(1);
+		}
+		result = lock_record(argv[4], argv[5], argv[6], &state);
+
+	} else if (strcmp(lock_type, "DB") == 0) {
+		if (argc != 6) {
+			fprintf(stderr,
+				"locking: Invalid number of arguments (%d)\n",
+				argc);
+			usage(argv[0]);
+			exit(1);
+		}
+		result = lock_db(argv[4], argv[5], &state);
+
+	} else {
+		fprintf(stderr, "locking: Invalid lock-type '%s'\n", lock_type);
+		usage(argv[0]);
+		exit(1);
+	}
+
+	/* Exits here if result != 0; otherwise we hold the lock */
+	send_result(write_fd, result);
+
+	req = wait_for_parent_send(ev, ev, ppid);
+	if (req == NULL) {
+		fprintf(stderr, "locking: wait_for_parent_send() failed\n");
+		cleanup(&state);
+		exit(1);
+	}
+
+	tevent_req_poll(req, ev);
+
+	status = wait_for_parent_recv(req, &err);
+	if (! status) {
+		fprintf(stderr,
+			"locking: wait_for_parent_recv() failed (%d)\n",
+			err);
+	}
+
+	talloc_free(ev);
+	cleanup(&state);
+	return 0;
+}
diff --git a/ctdb/server/ctdb_logging.c b/ctdb/server/ctdb_logging.c
new file mode 100644
index 0000000..1da26b5
--- /dev/null
+++ b/ctdb/server/ctdb_logging.c
@@ -0,0 +1,174 @@
+/*
+ ctdb logging code
+
+ Copyright (C) Andrew Tridgell 2008
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/blocking.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/* State for collecting log output written by child processes */
+struct ctdb_log_state {
+	int fd, pfd;		/* pfd: read end of the logging pipe */
+	char buf[1024];		/* accumulates partial lines between reads */
+	uint16_t buf_used;	/* number of valid bytes currently in buf */
+};
+
+/* Used by ctdb_set_child_logging() */
+static struct ctdb_log_state *log_state;
+
+/* Initialise logging */
+/*
+ * Initialise logging.
+ *
+ * Allocates the global log_state (used later by
+ * ctdb_set_child_logging()) and initialises the common logging
+ * subsystem with the "ctdbd" identity.
+ *
+ * Returns true on success, false on allocation or logging_init()
+ * failure.
+ */
+bool ctdb_logging_init(TALLOC_CTX *mem_ctx, const char *logging,
+		       const char *debug_level)
+{
+	int ret;
+
+	log_state = talloc_zero(mem_ctx, struct ctdb_log_state);
+	if (log_state == NULL) {
+		return false;
+	}
+
+	ret = logging_init(mem_ctx, logging, debug_level, "ctdbd");
+	if (ret != 0) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Emit one captured line (len bytes, not NUL-terminated) at script_log_level */
+static void write_to_log(const char *buf, unsigned int len)
+{
+	DEBUG(script_log_level, ("%*.*s\n", len, len, buf));
+}
+
+/*
+ called when log data comes in from a child process
+ */
+/*
+ called when log data comes in from a child process
+ */
+static void ctdb_child_log_handler(struct tevent_context *ev,
+				   struct tevent_fd *fde,
+				   uint16_t flags, void *private)
+{
+	struct ctdb_log_state *log = talloc_get_type(private, struct ctdb_log_state);
+	char *p;
+	int n;
+
+	if (!(flags & TEVENT_FD_READ)) {
+		return;
+	}
+
+	/* Append as much as fits into the line buffer */
+	n = sys_read(log->pfd, &log->buf[log->buf_used],
+		     sizeof(log->buf) - log->buf_used);
+	if (n > 0) {
+		log->buf_used += n;
+	} else if (n == 0) {
+		/* EOF: the writer closed the pipe; drop per-child state
+		   (the global log_state is kept for the daemon's lifetime) */
+		if (log != log_state) {
+			talloc_free(log);
+		}
+		return;
+	}
+	/* on read error (n < 0) fall through and flush what is buffered */
+
+	/* Emit each complete newline-terminated line in the buffer */
+	while (log->buf_used > 0 &&
+	       (p = memchr(log->buf, '\n', log->buf_used)) != NULL) {
+		int n1 = (p - log->buf)+1;	/* bytes consumed incl. '\n' */
+		int n2 = n1 - 1;		/* bytes to log, excl. '\n' */
+		/* swallow \r from child processes */
+		if (n2 > 0 && log->buf[n2-1] == '\r') {
+			n2--;
+		}
+		write_to_log(log->buf, n2);
+		memmove(log->buf, p+1, sizeof(log->buf) - n1);
+		log->buf_used -= n1;
+	}
+
+	/* the buffer could have completely filled - unfortunately we have
+	   no choice but to dump it out straight away */
+	if (log->buf_used == sizeof(log->buf)) {
+		write_to_log(log->buf, log->buf_used);
+		log->buf_used = 0;
+	}
+}
+
+/*
+ setup for logging of child process stdout
+*/
+/*
+ * Redirect the daemon's stdout/stderr into a pipe whose read end is
+ * monitored by ctdb_child_log_handler(), so that output from forked
+ * children (which inherit the fds) is captured into the CTDB log.
+ *
+ * Returns 0 on success, -1 on failure (original stdout/stderr are
+ * restored on the dup2 failure path).
+ */
+int ctdb_set_child_logging(struct ctdb_context *ctdb)
+{
+	int p[2];
+	int old_stdout, old_stderr;
+	struct tevent_fd *fde;
+
+	/* setup a pipe to catch IO from subprocesses */
+	if (pipe(p) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to setup for child logging pipe\n"));
+		return -1;
+	}
+
+	/* We'll fail if stderr/stdout not already open; it's simpler. */
+	old_stdout = dup(STDOUT_FILENO);
+	if (old_stdout < 0) {
+		DEBUG(DEBUG_ERR, ("Failed to dup stdout for child logging\n"));
+		return -1;
+	}
+	old_stderr = dup(STDERR_FILENO);
+	if (old_stderr < 0) {
+		DEBUG(DEBUG_ERR, ("Failed to dup stderr for child logging\n"));
+		close(old_stdout);
+		return -1;
+	}
+	if (dup2(p[1], STDOUT_FILENO) < 0 || dup2(p[1], STDERR_FILENO) < 0) {
+		int saved_errno = errno;
+		/* restore the original fds before reporting the failure */
+		dup2(old_stdout, STDOUT_FILENO);
+		dup2(old_stderr, STDERR_FILENO);
+		close(old_stdout);
+		close(old_stderr);
+		close(p[0]);
+		close(p[1]);
+		errno = saved_errno;
+
+		/* printf is safe here: stdout has just been restored */
+		printf(__location__ " dup2 failed: %s\n",
+		       strerror(errno));
+		return -1;
+	}
+	/* write end now lives on as stdout/stderr; the originals are done */
+	close(p[1]);
+	close(old_stdout);
+	close(old_stderr);
+
+	fde = tevent_add_fd(ctdb->ev, log_state, p[0], TEVENT_FD_READ,
+			    ctdb_child_log_handler, log_state);
+	tevent_fd_set_auto_close(fde);
+
+	log_state->pfd = p[0];
+
+	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for logging\n", p[0]));
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
new file mode 100644
index 0000000..e2cb916
--- /dev/null
+++ b/ctdb/server/ctdb_ltdb_server.c
@@ -0,0 +1,1663 @@
+/*
+ ctdb ltdb code - server side
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/dir.h"
+#include "system/time.h"
+#include "system/locale.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#define PERSISTENT_HEALTH_TDB "persistent_health.tdb"
+
+/**
+ * write a record to a normal database
+ *
+ * This is the server-variant of the ctdb_ltdb_store function.
+ * It contains logic to determine whether a record should be
+ * stored or deleted. It also sends SCHEDULE_FOR_DELETION
+ * controls to the local ctdb daemon if appropriate.
+ */
+static int ctdb_ltdb_store_server(struct ctdb_db_context *ctdb_db,
+				  TDB_DATA key,
+				  struct ctdb_ltdb_header *header,
+				  TDB_DATA data)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	TDB_DATA rec[2];
+	uint32_t hsize = sizeof(struct ctdb_ltdb_header);
+	int ret;
+	bool keep = false;			/* store (true) vs delete (false) */
+	bool schedule_for_deletion = false;	/* hand record to the vacuum delete queue */
+	bool remove_from_delete_queue = false;	/* record became live again */
+	uint32_t lmaster;
+
+	if (ctdb->flags & CTDB_FLAG_TORTURE) {
+		/* torture mode: cross-check that the RSN never goes backwards */
+		TDB_DATA old;
+		struct ctdb_ltdb_header *h2;
+
+		old = tdb_fetch(ctdb_db->ltdb->tdb, key);
+		h2 = (struct ctdb_ltdb_header *)old.dptr;
+		if (old.dptr != NULL &&
+		    old.dsize >= hsize &&
+		    h2->rsn > header->rsn) {
+			DEBUG(DEBUG_ERR,
+			      ("RSN regression! %"PRIu64" %"PRIu64"\n",
+			       h2->rsn, header->rsn));
+		}
+		if (old.dptr) {
+			free(old.dptr);
+		}
+	}
+
+	if (ctdb->vnn_map == NULL) {
+		/*
+		 * Called from a client: always store the record
+		 * Also don't call ctdb_lmaster since it uses the vnn_map!
+		 */
+		keep = true;
+		goto store;
+	}
+
+	lmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
+
+	/*
+	 * If we migrate an empty record off to another node
+	 * and the record has not been migrated with data,
+	 * delete the record instead of storing the empty record.
+	 */
+	if (data.dsize != 0) {
+		keep = true;
+	} else if (header->flags & CTDB_REC_RO_FLAGS) {
+		/* read-only delegations exist; must keep the header */
+		keep = true;
+	} else if (header->flags & CTDB_REC_FLAG_AUTOMATIC) {
+		/*
+		 * The record is not created by the client but
+		 * automatically by the ctdb_ltdb_fetch logic that
+		 * creates a record with an initial header in the
+		 * ltdb before trying to migrate the record from
+		 * the current lmaster. Keep it instead of trying
+		 * to delete the non-existing record...
+		 */
+		keep = true;
+		schedule_for_deletion = true;
+	} else if (header->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+		keep = true;
+	} else if (ctdb_db->ctdb->pnn == lmaster) {
+		/*
+		 * If we are lmaster, then we usually keep the record.
+		 * But if we retrieve the dmaster role by a VACUUM_MIGRATE
+		 * and the record is empty and has never been migrated
+		 * with data, then we should delete it instead of storing it.
+		 * This is part of the vacuuming process.
+		 *
+		 * The reason that we usually need to store even empty records
+		 * on the lmaster is that a client operating directly on the
+		 * lmaster (== dmaster) expects the local copy of the record to
+		 * exist after successful ctdb migrate call. If the record does
+		 * not exist, the client goes into a migrate loop and eventually
+		 * fails. So storing the empty record makes sure that we do not
+		 * need to change the client code.
+		 */
+		if (!(header->flags & CTDB_REC_FLAG_VACUUM_MIGRATED)) {
+			keep = true;
+		} else if (ctdb_db->ctdb->pnn != header->dmaster) {
+			keep = true;
+		}
+	} else if (ctdb_db->ctdb->pnn == header->dmaster) {
+		keep = true;
+	}
+
+	if (keep) {
+		if (ctdb_db_volatile(ctdb_db) &&
+		    (ctdb_db->ctdb->pnn == header->dmaster) &&
+		    !(header->flags & CTDB_REC_RO_FLAGS))
+		{
+			/* we own the record: bump the sequence number */
+			header->rsn++;
+
+			if (data.dsize == 0) {
+				/* empty record owned by us: vacuum candidate */
+				schedule_for_deletion = true;
+			}
+		}
+		remove_from_delete_queue = !schedule_for_deletion;
+	}
+
+store:
+	/*
+	 * The VACUUM_MIGRATED flag is only set temporarily for
+	 * the above logic when the record was retrieved by a
+	 * VACUUM_MIGRATE call and should not be stored in the
+	 * database.
+	 *
+	 * The VACUUM_MIGRATE call is triggered by a vacuum fetch,
+	 * and there are two cases in which the corresponding record
+	 * is stored in the local database:
+	 * 1. The record has been migrated with data in the past
+	 *    (the MIGRATED_WITH_DATA record flag is set).
+	 * 2. The record has been filled with data again since it
+	 *    had been submitted in the VACUUM_FETCH message to the
+	 *    lmaster.
+	 * For such records it is important to not store the
+	 * VACUUM_MIGRATED flag in the database.
+	 */
+	header->flags &= ~CTDB_REC_FLAG_VACUUM_MIGRATED;
+
+	/*
+	 * Similarly, clear the AUTOMATIC flag which should not enter
+	 * the local database copy since this would require client
+	 * modifications to clear the flag when the client stores
+	 * the record.
+	 */
+	header->flags &= ~CTDB_REC_FLAG_AUTOMATIC;
+
+	/* on-disk layout: header immediately followed by the data blob */
+	rec[0].dsize = hsize;
+	rec[0].dptr = (uint8_t *)header;
+
+	rec[1].dsize = data.dsize;
+	rec[1].dptr = data.dptr;
+
+	DEBUG(DEBUG_DEBUG, (__location__ " db[%s]: %s record: hash[0x%08x]\n",
+			    ctdb_db->db_name,
+			    keep?"storing":"deleting",
+			    ctdb_hash(&key)));
+
+	if (keep) {
+		ret = tdb_storev(ctdb_db->ltdb->tdb, key, rec, 2, TDB_REPLACE);
+	} else {
+		ret = tdb_delete(ctdb_db->ltdb->tdb, key);
+	}
+
+	if (ret != 0) {
+		int lvl = DEBUG_ERR;
+
+		/* deleting an already-absent record is not noteworthy */
+		if (keep == false &&
+		    tdb_error(ctdb_db->ltdb->tdb) == TDB_ERR_NOEXIST)
+		{
+			lvl = DEBUG_DEBUG;
+		}
+
+		DEBUG(lvl, (__location__ " db[%s]: Failed to %s record: "
+			    "%d - %s\n",
+			    ctdb_db->db_name,
+			    keep?"store":"delete", ret,
+			    tdb_errorstr(ctdb_db->ltdb->tdb)));
+
+		/* don't touch the delete queue for a record we failed to write */
+		schedule_for_deletion = false;
+		remove_from_delete_queue = false;
+	}
+
+	if (schedule_for_deletion) {
+		int ret2;
+		ret2 = ctdb_local_schedule_for_deletion(ctdb_db, header, key);
+		if (ret2 != 0) {
+			DEBUG(DEBUG_ERR, (__location__ " ctdb_local_schedule_for_deletion failed.\n"));
+		}
+	}
+
+	if (remove_from_delete_queue) {
+		ctdb_local_remove_from_delete_queue(ctdb_db, header, key);
+	}
+
+	return ret;
+}
+
+/* State kept while a ctdb request is deferred waiting for a chainlock */
+struct lock_fetch_state {
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	/* callback used to re-inject the deferred packet */
+	void (*recv_pkt)(void *, struct ctdb_req_header *);
+	void *recv_context;
+	struct ctdb_req_header *hdr;	/* the deferred request packet */
+	uint32_t generation;		/* db generation when deferred */
+	bool ignore_generation;		/* requeue even across recoveries */
+};
+
+/*
+ called when we should retry the operation
+ */
+/*
+ called when we should retry the operation
+ (the 'locked' argument is not inspected here)
+ */
+static void lock_fetch_callback(void *p, bool locked)
+{
+	struct lock_fetch_state *state = talloc_get_type(p, struct lock_fetch_state);
+	/* drop packets from before a recovery unless told otherwise */
+	if (!state->ignore_generation &&
+	    state->generation != state->ctdb_db->generation) {
+		DEBUG(DEBUG_NOTICE,("Discarding previous generation lockwait packet\n"));
+		talloc_free(state->hdr);
+		return;
+	}
+	/* re-inject the deferred packet into normal processing */
+	state->recv_pkt(state->recv_context, state->hdr);
+	DEBUG(DEBUG_INFO,(__location__ " PACKET REQUEUED\n"));
+}
+
+
+/*
+ do a non-blocking ltdb_lock, deferring this ctdb request until we
+ have the chainlock
+
+ It does the following:
+
+ 1) tries to get the chainlock. If it succeeds, then it returns 0
+
+ 2) if it fails to get a chainlock immediately then it sets up a
+ non-blocking chainlock via ctdb_lock_record, and when it gets the
+ chainlock it re-submits this ctdb request to the main packet
+ receive function.
+
+ This effectively queues all ctdb requests that cannot be
+ immediately satisfied until it can get the lock. This means that
+ the main ctdb daemon will not block waiting for a chainlock held by
+ a client
+
+ There are 3 possible return values:
+
+ 0: means that it got the lock immediately.
+ -1: means that it failed to get the lock, and won't retry
+ -2: means that it failed to get the lock immediately, but will retry
+ */
+int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db,
+			   TDB_DATA key, struct ctdb_req_header *hdr,
+			   void (*recv_pkt)(void *, struct ctdb_req_header *),
+			   void *recv_context, bool ignore_generation)
+{
+	int ret;
+	struct tdb_context *tdb = ctdb_db->ltdb->tdb;
+	struct lock_request *lreq;
+	struct lock_fetch_state *state;
+
+	ret = tdb_chainlock_nonblock(tdb, key);
+
+	/* EACCES/EAGAIN/EDEADLK indicate contention; everything else is fatal */
+	if (ret != 0 &&
+	    !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) {
+		/* a hard failure - don't try again */
+		return -1;
+	}
+
+	/* when torturing, ensure we test the contended path */
+	if ((ctdb_db->ctdb->flags & CTDB_FLAG_TORTURE) &&
+	    random() % 5 == 0) {
+		ret = -1;
+		tdb_chainunlock(tdb, key);
+	}
+
+	/* first the non-contended path */
+	if (ret == 0) {
+		return 0;
+	}
+
+	/* NOTE(review): talloc() result is not checked before use */
+	state = talloc(hdr, struct lock_fetch_state);
+	state->ctdb = ctdb_db->ctdb;
+	state->ctdb_db = ctdb_db;
+	state->hdr = hdr;
+	state->recv_pkt = recv_pkt;
+	state->recv_context = recv_context;
+	state->generation = ctdb_db->generation;
+	state->ignore_generation = ignore_generation;
+
+	/* now the contended path */
+	lreq = ctdb_lock_record(state, ctdb_db, key, true, lock_fetch_callback, state);
+	if (lreq == NULL) {
+		return -1;
+	}
+
+	/* we need to move the packet off the temporary context in ctdb_input_pkt(),
+	   so it won't be freed yet */
+	talloc_steal(state, hdr);
+
+	/* now tell the caller than we will retry asynchronously */
+	return -2;
+}
+
+/*
+ a variant of ctdb_ltdb_lock_requeue that also fetches the record
+ */
+/*
+  a variant of ctdb_ltdb_lock_requeue that also fetches the record
+  (same 0 / -1 / -2 return convention; the record is only fetched on
+  the immediate-success path, and the chainlock is released again if
+  the fetch fails)
+ */
+int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db,
+				 TDB_DATA key, struct ctdb_ltdb_header *header,
+				 struct ctdb_req_header *hdr, TDB_DATA *data,
+				 void (*recv_pkt)(void *, struct ctdb_req_header *),
+				 void *recv_context, bool ignore_generation)
+{
+	int ret;
+
+	ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, recv_pkt,
+				     recv_context, ignore_generation);
+	if (ret != 0) {
+		return ret;
+	}
+
+	ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data);
+	if (ret != 0) {
+		/* fetch failed: don't leave the chainlock held */
+		int uret;
+		uret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (uret != 0) {
+			DBG_ERR("ctdb_ltdb_unlock() failed with error %d\n",
+				uret);
+		}
+	}
+	return ret;
+}
+
+
+/*
+ paranoid check to see if the db is empty
+ */
+/*
+  paranoid check to see if the db is empty
+  (aborts the daemon via ctdb_fatal() if any record is found)
+ */
+static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
+{
+	struct tdb_context *tdb = ctdb_db->ltdb->tdb;
+	/* a NULL traverse function just counts the records */
+	int count = tdb_traverse_read(tdb, NULL, NULL);
+	if (count != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb '%s' not empty on attach! aborting\n",
+				   ctdb_db->db_path));
+		ctdb_fatal(ctdb_db->ctdb, "database not empty on attach");
+	}
+}
+
+/*
+ * Load the health state of a database from the persistent-health tdb.
+ *
+ * On success ctdb_db->unhealthy_reason is replaced: NULL means the
+ * database is healthy, otherwise it holds the stored reason string.
+ * On failure the previous unhealthy_reason is preserved and -1 is
+ * returned.
+ */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db)
+{
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	char *old;
+	char *reason = NULL;
+	TDB_DATA key;
+	TDB_DATA val;
+
+	/* keyed by database name (without trailing NUL) */
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	old = ctdb_db->unhealthy_reason;
+	ctdb_db->unhealthy_reason = NULL;
+
+	val = tdb_fetch(tdb, key);
+	if (val.dsize > 0) {
+		reason = talloc_strndup(ctdb_db,
+					(const char *)val.dptr,
+					val.dsize);
+		if (reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+					   (int)val.dsize));
+			/* restore previous state on allocation failure */
+			ctdb_db->unhealthy_reason = old;
+			free(val.dptr);
+			return -1;
+		}
+	}
+
+	/* tdb_fetch() returns malloc'd memory */
+	if (val.dptr) {
+		free(val.dptr);
+	}
+
+	talloc_free(old);
+	ctdb_db->unhealthy_reason = reason;
+	return 0;
+}
+
+/*
+ * Update the stored health state of a database inside a tdb
+ * transaction.
+ *
+ * given_reason == NULL means "mark healthy"; in that case, if no
+ * healthy nodes were available during recovery, any existing reason is
+ * kept and prefixed with "NO-HEALTHY-NODES - " instead of being
+ * cleared.
+ *
+ * NOTE(review): the early error returns after tdb_transaction_start()
+ * (load failure, talloc_strdup failure) return without calling
+ * tdb_transaction_cancel(), leaving the transaction open — confirm
+ * whether this is intentional.
+ */
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+				  struct ctdb_db_context *ctdb_db,
+				  const char *given_reason,/* NULL means healthy */
+				  unsigned int num_healthy_nodes)
+{
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	int ret;
+	TDB_DATA key;
+	TDB_DATA val;
+	char *new_reason = NULL;
+	char *old_reason = NULL;
+
+	ret = tdb_transaction_start(tdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		return -1;
+	}
+
+	/* re-read the current state under the transaction lock */
+	ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+		return -1;
+	}
+	old_reason = ctdb_db->unhealthy_reason;
+
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	if (given_reason) {
+		new_reason = talloc_strdup(ctdb_db, given_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+					  given_reason));
+			return -1;
+		}
+	} else if (old_reason && num_healthy_nodes == 0) {
+		/*
+		 * If the reason indicates ok, but there were no healthy nodes
+		 * available, it means that we have not recovered valid content
+		 * of the db. So if there's an old reason, prefix it with
+		 * "NO-HEALTHY-NODES - "
+		 */
+		const char *prefix;
+
+#define _TMP_PREFIX "NO-HEALTHY-NODES - "
+		/* avoid stacking the prefix if it is already present */
+		ret = strncmp(_TMP_PREFIX, old_reason, strlen(_TMP_PREFIX));
+		if (ret != 0) {
+			prefix = _TMP_PREFIX;
+		} else {
+			prefix = "";
+		}
+		new_reason = talloc_asprintf(ctdb_db, "%s%s",
+					 prefix, old_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+					  prefix, old_reason));
+			return -1;
+		}
+#undef _TMP_PREFIX
+	}
+
+	if (new_reason) {
+		/* store/overwrite the unhealthy reason */
+		val.dptr = discard_const_p(uint8_t, new_reason);
+		val.dsize = strlen(new_reason);
+
+		ret = tdb_store(tdb, key, val, TDB_REPLACE);
+		if (ret != 0) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name, new_reason,
+					   ret, tdb_errorstr(tdb)));
+			talloc_free(new_reason);
+			return -1;
+		}
+		DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+				   ctdb_db->db_name, new_reason));
+	} else if (old_reason) {
+		/* now healthy: remove the stored reason */
+		ret = tdb_delete(tdb, key);
+		if (ret != 0) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name,
+					   ret, tdb_errorstr(tdb)));
+			talloc_free(new_reason);
+			return -1;
+		}
+		DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+				   ctdb_db->db_name));
+	}
+
+	ret = tdb_transaction_commit(tdb);
+	if (ret != TDB_SUCCESS) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		talloc_free(new_reason);
+		return -1;
+	}
+
+	/* committed: switch the in-memory state over */
+	talloc_free(old_reason);
+	ctdb_db->unhealthy_reason = new_reason;
+
+	return 0;
+}
+
+/*
+ * Move a corrupted persistent tdb aside (renamed with a timestamp
+ * suffix) and record the backup location as the database's unhealthy
+ * reason, so a fresh tdb can be created on the next open attempt.
+ */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+				     struct ctdb_db_context *ctdb_db)
+{
+	time_t now = time(NULL);
+	char *new_path;
+	char *new_reason;
+	int ret;
+	struct tm *tm;
+
+	tm = gmtime(&now);
+
+	/* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+	new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+				   "%04u%02u%02u%02u%02u%02u.0Z",
+				   ctdb_db->db_path,
+				   tm->tm_year+1900, tm->tm_mon+1,
+				   tm->tm_mday, tm->tm_hour, tm->tm_min,
+				   tm->tm_sec);
+	if (new_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+	new_reason = talloc_asprintf(ctdb_db,
+				     "ERROR - Backup of corrupted TDB in '%s'",
+				     new_path);
+	if (new_reason == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+	/* record the unhealthy reason before touching the file */
+	ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+	talloc_free(new_reason);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				 ": ctdb_backup_corrupted_tdb(%s) not implemented yet\n",
+				 ctdb_db->db_path));
+		return -1;
+	}
+
+	ret = rename(ctdb_db->db_path, new_path);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				 ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+				 ctdb_db->db_path, new_path,
+				 errno, strerror(errno)));
+		talloc_free(new_path);
+		return -1;
+	}
+
+	DEBUG(DEBUG_CRIT,(__location__
+			 ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+			 ctdb_db->db_path, new_path));
+	talloc_free(new_path);
+	return 0;
+}
+
+/*
+ * Reload the health state of every attached persistent database.
+ *
+ * Returns 0 when all persistent databases are healthy, -1 when any
+ * is unhealthy or its health state could not be loaded.
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+	int ok = 0;	/* healthy database count, for the summary log */
+	int fail = 0;	/* unhealthy database count */
+
+	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+		/* only persistent databases carry health state */
+		if (!ctdb_db_persistent(ctdb_db)) {
+			continue;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__
+					   " load persistent health for '%s' failed\n",
+					   ctdb_db->db_path));
+			return -1;
+		}
+
+		if (ctdb_db->unhealthy_reason == NULL) {
+			ok++;
+			DEBUG(DEBUG_INFO,(__location__
+					  " persistent db '%s' healthy\n",
+					  ctdb_db->db_path));
+			continue;
+		}
+
+		fail++;
+		DEBUG(DEBUG_ALERT,(__location__
+				   " persistent db '%s' unhealthy: %s\n",
+				   ctdb_db->db_path,
+				   ctdb_db->unhealthy_reason));
+	}
+	DEBUG(DEBUG_NOTICE,
+	      ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n",
+	       ok, fail));
+
+	if (fail != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ mark a database - as healthy
+ */
+/*
+  mark a database - as healthy
+  (indata carries the uint32_t db id; during startup a transition from
+  unhealthy to healthy forces a recovery)
+ */
+int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+	bool may_recover = false;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+		return -1;
+	}
+
+	/* remember whether this is actually a state change */
+	if (ctdb_db->unhealthy_reason) {
+		may_recover = true;
+	}
+
+	/* NULL reason + num_healthy_nodes=1 clears the stored reason */
+	ret = ctdb_update_persistent_health(ctdb, ctdb_db, NULL, 1);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_update_persistent_health(%s) failed\n",
+				 ctdb_db->db_name));
+		return -1;
+	}
+
+	if (may_recover && ctdb->runstate == CTDB_RUNSTATE_STARTUP) {
+		DEBUG(DEBUG_ERR, (__location__ " db %s become healthy - force recovery for startup\n",
+				  ctdb_db->db_name));
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the current health state of a database (db id in indata).
+ * outdata is tdb_null when healthy, otherwise it points directly at
+ * the NUL-terminated unhealthy reason string (no copy is made).
+ */
+int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
+				   TDB_DATA indata,
+				   TDB_DATA *outdata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+		return -1;
+	}
+
+	/* refresh from the persistent-health tdb before answering */
+	ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_load_persistent_health(%s) failed\n",
+				 ctdb_db->db_name));
+		return -1;
+	}
+
+	*outdata = tdb_null;
+	if (ctdb_db->unhealthy_reason) {
+		outdata->dptr = (uint8_t *)ctdb_db->unhealthy_reason;
+		outdata->dsize = strlen(ctdb_db->unhealthy_reason)+1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Enable read-only delegation support on a volatile database by
+ * opening its revision-tracking tdb ("<db-path>.RO") and setting the
+ * database's readonly property. Idempotent: returns 0 immediately if
+ * already enabled; fails for non-volatile databases.
+ */
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+	char *ropath;
+
+	if (ctdb_db_readonly(ctdb_db)) {
+		return 0;
+	}
+
+	if (! ctdb_db_volatile(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Non-volatile databases do not support readonly flag\n"));
+		return -1;
+	}
+
+	ropath = talloc_asprintf(ctdb_db, "%s.RO", ctdb_db->db_path);
+	if (ropath == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to asprintf the tracking database\n"));
+		return -1;
+	}
+	/* tracking tdb is local-only scratch state: no locking, no sync,
+	   cleared whenever it is first opened */
+	ctdb_db->rottdb = tdb_open(ropath,
+				   ctdb->tunable.database_hash_size,
+				   TDB_NOLOCK|TDB_CLEAR_IF_FIRST|TDB_NOSYNC,
+				   O_CREAT|O_RDWR, 0600);
+	if (ctdb_db->rottdb == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to open/create the tracking database '%s'\n", ropath));
+		talloc_free(ropath);
+		return -1;
+	}
+
+	DEBUG(DEBUG_NOTICE,("OPENED tracking database : '%s'\n", ropath));
+
+	ctdb_db_set_readonly(ctdb_db);
+
+	DEBUG(DEBUG_NOTICE, ("Readonly property set on DB %s\n", ctdb_db->db_name));
+
+	talloc_free(ropath);
+	return 0;
+}
+
+/*
+ attach to a database, handling both persistent and non-persistent databases
+ return 0 on success, -1 on failure
+ */
+static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
+			     uint8_t db_flags, const char *unhealthy_reason)
+{
+	struct ctdb_db_context *ctdb_db, *tmp_db;
+	int ret;
+	struct TDB_DATA key;
+	int tdb_flags;
+	int mode = 0600;
+	int remaining_tries = 0;	/* corrupted-tdb backup+retry budget */
+
+	ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
+	CTDB_NO_MEMORY(ctdb, ctdb_db);
+
+	ctdb_db->ctdb = ctdb;
+	ctdb_db->db_name = talloc_strdup(ctdb_db, db_name);
+	CTDB_NO_MEMORY(ctdb, ctdb_db->db_name);
+
+	/* the db id is a hash of the name (incl. trailing NUL) */
+	key.dsize = strlen(db_name)+1;
+	key.dptr = discard_const(db_name);
+	ctdb_db->db_id = ctdb_hash(&key);
+	ctdb_db->db_flags = db_flags;
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		/* volatile dbs get vacuuming support and the server-side
+		   store function with its keep/delete logic */
+		ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->delete_queue == NULL) {
+			CTDB_NO_MEMORY(ctdb, ctdb_db->delete_queue);
+		}
+
+		ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->fetch_queue == NULL) {
+			CTDB_NO_MEMORY(ctdb, ctdb_db->fetch_queue);
+		}
+
+		ctdb_db->ctdb_ltdb_store_fn = ctdb_ltdb_store_server;
+	}
+
+	/* check for hash collisions */
+	for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
+		if (tmp_db->db_id == ctdb_db->db_id) {
+			DEBUG(DEBUG_CRIT,("db_id 0x%x hash collision. name1='%s' name2='%s'\n",
+				 tmp_db->db_id, db_name, tmp_db->db_name));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db_persistent(ctdb_db)) {
+		if (unhealthy_reason) {
+			/* caller already knows this db is unhealthy */
+			ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+							    unhealthy_reason, 0);
+			if (ret != 0) {
+				DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+						   ctdb_db->db_name, unhealthy_reason, ret));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+		}
+
+		/* allow one backup-and-retry only before the daemon is
+		   fully running */
+		if (ctdb->max_persistent_check_errors > 0) {
+			remaining_tries = 1;
+		}
+		if (ctdb->runstate == CTDB_RUNSTATE_RUNNING) {
+			remaining_tries = 0;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+		DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, but we want that in the log file! */
+		DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	/* open the database */
+	ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u",
+					   ctdb_db_persistent(ctdb_db) ?
+						ctdb->db_directory_persistent :
+						ctdb->db_directory,
+					   db_name, ctdb->pnn);
+
+	tdb_flags = ctdb_db_tdb_flags(db_flags,
+				      ctdb->valgrinding,
+				      ctdb_config.tdb_mutexes);
+
+again:
+	ctdb_db->ltdb = tdb_wrap_open(ctdb_db, ctdb_db->db_path,
+				      ctdb->tunable.database_hash_size,
+				      tdb_flags,
+				      O_CREAT|O_RDWR, mode);
+	if (ctdb_db->ltdb == NULL) {
+		struct stat st;
+		int saved_errno = errno;
+
+		if (! ctdb_db_persistent(ctdb_db)) {
+			/* no recovery path for volatile dbs */
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		if (remaining_tries == 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		/* keep the original file mode for the fresh tdb */
+		ret = stat(ctdb_db->db_path, &st);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		/* move the broken file aside, then retry the open */
+		ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		remaining_tries--;
+		mode = st.st_mode;
+		goto again;
+	}
+
+	if (!ctdb_db_persistent(ctdb_db)) {
+		/* volatile dbs must start empty after a restart */
+		ctdb_check_db_empty(ctdb_db);
+	} else {
+		/* persistent dbs get an integrity check on attach */
+		ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+		if (ret != 0) {
+			int fd;
+			struct stat st;
+
+			DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+					  ctdb_db->db_path, ret,
+					  tdb_errorstr(ctdb_db->ltdb->tdb)));
+			if (remaining_tries == 0) {
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			fd = tdb_fd(ctdb_db->ltdb->tdb);
+			ret = fstat(fd, &st);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,(__location__
+						  "Failed to fstat() persistent tdb '%s': %d - %s\n",
+						  ctdb_db->db_path,
+						  errno,
+						  strerror(errno)));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			/* close the TDB */
+			talloc_free(ctdb_db->ltdb);
+			ctdb_db->ltdb = NULL;
+
+			ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+						  ctdb_db->db_path));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			remaining_tries--;
+			mode = st.st_mode;
+			goto again;
+		}
+	}
+
+	/* remember the flags the client has specified */
+	tdb_add_flags(ctdb_db->ltdb->tdb, tdb_flags);
+
+
+	/* set up a rb tree we can use to track which records we have a
+	   fetch-lock in-flight for so we can defer any additional calls
+	   for the same record.
+	 */
+	ctdb_db->deferred_fetch = trbt_create(ctdb_db, 0);
+	if (ctdb_db->deferred_fetch == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to create deferred fetch rb tree for ctdb database\n"));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ctdb_db->defer_dmaster = trbt_create(ctdb_db, 0);
+	if (ctdb_db->defer_dmaster == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to create defer dmaster rb tree for %s\n",
+				  ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	DLIST_ADD(ctdb->db_list, ctdb_db);
+
+	/* setting this can help some high churn databases */
+	tdb_set_max_dead(ctdb_db->ltdb->tdb, ctdb->tunable.database_max_dead);
+
+	/*
+	  all databases support the "null" function. we need this in
+	  order to do forced migration of records
+	 */
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_null_func, CTDB_NULL_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup null function for '%s'\n", ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	/*
+	  all databases support the "fetch" function. we need this
+	  for efficient Samba3 ctdb fetch
+	 */
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_func, CTDB_FETCH_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	/*
+	  all databases support the "fetch_with_header" function. we need this
+	  for efficient readonly record fetches
+	 */
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = ctdb_vacuum_init(ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for "
+				  "database '%s'\n", ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = ctdb_migration_init(ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to setup migration tracking for db '%s'\n",
+		       ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = db_hash_init(ctdb_db, "lock_log", 2048, DB_HASH_COMPLEX,
+			   &ctdb_db->lock_log);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to setup lock logging for db '%s'\n",
+		       ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ctdb_db->generation = ctdb->vnn_map->generation;
+
+	DEBUG(DEBUG_NOTICE,("Attached to database '%s' with flags 0x%x\n",
+			    ctdb_db->db_path, tdb_flags));
+
+	/* success */
+	return 0;
+}
+
+
+/*
+ * A client DB attach request that was deferred because the node is in
+ * recovery (or not yet past startup).  Entries are linked into
+ * ctdb->deferred_attach and re-processed or timed out later.
+ */
+struct ctdb_deferred_attach_context {
+	/* DLIST linkage for ctdb->deferred_attach */
+	struct ctdb_deferred_attach_context *next, *prev;
+	struct ctdb_context *ctdb;
+	/* the original attach control packet, talloc-stolen onto this ctx */
+	struct ctdb_req_control_old *c;
+};
+
+
+/* Talloc destructor: unlink the deferred-attach entry from the global list */
+static int ctdb_deferred_attach_destructor(struct ctdb_deferred_attach_context *da_ctx)
+{
+	DLIST_REMOVE(da_ctx->ctdb->deferred_attach, da_ctx);
+
+	return 0;
+}
+
+/*
+ * Timer handler: a deferred attach request waited too long for recovery
+ * to finish - fail the control back to the client and drop the entry.
+ */
+static void ctdb_deferred_attach_timeout(struct tevent_context *ev,
+					 struct tevent_timer *te,
+					 struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_attach_context *dactx;
+
+	dactx = talloc_get_type(private_data,
+				struct ctdb_deferred_attach_context);
+
+	/* Reply with failure status, then free (destructor unlinks it) */
+	ctdb_request_control_reply(dactx->ctdb, dactx->c, NULL, -1, NULL);
+	talloc_free(dactx);
+}
+
+/*
+ * Timer handler: recovery has ended - re-inject the deferred attach
+ * packet into the daemon's normal input path.
+ */
+static void ctdb_deferred_attach_callback(struct tevent_context *ev,
+					  struct tevent_timer *te,
+					  struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_attach_context *dactx;
+	struct ctdb_req_header *hdr;
+
+	dactx = talloc_get_type(private_data,
+				struct ctdb_deferred_attach_context);
+	hdr = (struct ctdb_req_header *)dactx->c;
+
+	/* This talloc-steals the packet ->c */
+	ctdb_input_pkt(dactx->ctdb, hdr);
+	talloc_free(dactx);
+}
+
+/*
+ * Requeue all deferred attach requests.  Each entry is popped off the
+ * global list and rescheduled to be re-processed from the main event
+ * loop shortly after the current event finishes.
+ */
+int ctdb_process_deferred_attach(struct ctdb_context *ctdb)
+{
+	struct ctdb_deferred_attach_context *entry;
+
+	while (ctdb->deferred_attach != NULL) {
+		entry = ctdb->deferred_attach;
+		DLIST_REMOVE(ctdb->deferred_attach, entry);
+		tevent_add_timer(ctdb->ev, entry,
+				 timeval_current_ofs(1,0),
+				 ctdb_deferred_attach_callback, entry);
+	}
+
+	return 0;
+}
+
+/*
+ * Handle a DB attach control: attach this node to the named database,
+ * creating it locally if necessary, and broadcast the attach to all
+ * connected nodes.
+ *
+ * indata carries the NUL-terminated database name; on success outdata
+ * is pointed at the 32-bit database id.  A local, non-recovery client
+ * attach during recovery is deferred (*async_reply set true) and
+ * replied to later.  Returns 0 on success, -1 on error.
+ */
+int32_t ctdb_control_db_attach(struct ctdb_context *ctdb,
+			       TDB_DATA indata,
+			       TDB_DATA *outdata,
+			       uint8_t db_flags,
+			       uint32_t srcnode,
+			       uint32_t client_id,
+			       struct ctdb_req_control_old *c,
+			       bool *async_reply)
+{
+	const char *db_name = (const char *)indata.dptr;
+	struct ctdb_db_context *db;
+	struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+	struct ctdb_client *client = NULL;
+	uint32_t opcode;
+
+	if (ctdb->tunable.allow_client_db_attach == 0) {
+		/* Fixed message: the tunable is AllowClientDBAttach, not
+		 * AllowClientDBAccess */
+		DEBUG(DEBUG_ERR, ("DB Attach to database %s denied by tunable "
+				  "AllowClientDBAttach == 0\n", db_name));
+		return -1;
+	}
+
+	/* don't allow any local clients to attach while we are in recovery mode
+	 * except for the recovery daemon.
+	 * allow all attach from the network since these are always from remote
+	 * recovery daemons.
+	 */
+	if (srcnode == ctdb->pnn && client_id != 0) {
+		client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	}
+	if (client != NULL) {
+		/* If the node is inactive it is not part of the cluster
+		   and we should not allow clients to attach to any
+		   databases
+		*/
+		if (node->flags & NODE_FLAGS_INACTIVE) {
+			DEBUG(DEBUG_ERR,("DB Attach to database %s refused since node is inactive (flags=0x%x)\n", db_name, node->flags));
+			return -1;
+		}
+
+		/* recovery-daemon attaches are only valid during recovery */
+		if ((c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) &&
+		    ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+			DBG_ERR("Attach from recovery refused because "
+				"recovery is not active\n");
+			return -1;
+		}
+
+		/* ordinary client attach during recovery/startup: defer it */
+		if (!(c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) &&
+		    (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ||
+		     ctdb->runstate < CTDB_RUNSTATE_STARTUP)) {
+			struct ctdb_deferred_attach_context *da_ctx = talloc(client, struct ctdb_deferred_attach_context);
+
+			if (da_ctx == NULL) {
+				DEBUG(DEBUG_ERR,("DB Attach to database %s deferral for client with pid:%d failed due to OOM.\n", db_name, client->pid));
+				return -1;
+			}
+
+			da_ctx->ctdb = ctdb;
+			da_ctx->c = talloc_steal(da_ctx, c);
+			talloc_set_destructor(da_ctx, ctdb_deferred_attach_destructor);
+			DLIST_ADD(ctdb->deferred_attach, da_ctx);
+
+			/* fail the request if recovery does not end in time */
+			tevent_add_timer(ctdb->ev, da_ctx,
+					 timeval_current_ofs(ctdb->tunable.deferred_attach_timeout, 0),
+					 ctdb_deferred_attach_timeout, da_ctx);
+
+			DEBUG(DEBUG_ERR,("DB Attach to database %s deferred for client with pid:%d since node is in recovery mode.\n", db_name, client->pid));
+			*async_reply = true;
+			return 0;
+		}
+	}
+
+	/* see if we already have this name */
+	db = ctdb_db_handle(ctdb, db_name);
+	if (db) {
+		/* re-attach must not downgrade the existing flags */
+		if ((db->db_flags & db_flags) != db_flags) {
+			DEBUG(DEBUG_ERR,
+			      ("Error: Failed to re-attach with 0x%x flags,"
+			       " database has 0x%x flags\n", db_flags,
+			       db->db_flags));
+			return -1;
+		}
+		outdata->dptr  = (uint8_t *)&db->db_id;
+		outdata->dsize = sizeof(db->db_id);
+		return 0;
+	}
+
+	if (ctdb_local_attach(ctdb, db_name, db_flags, NULL) != 0) {
+		return -1;
+	}
+
+	db = ctdb_db_handle(ctdb, db_name);
+	if (!db) {
+		DEBUG(DEBUG_ERR,("Failed to find db handle for name '%s'\n", db_name));
+		return -1;
+	}
+
+	outdata->dptr  = (uint8_t *)&db->db_id;
+	outdata->dsize = sizeof(db->db_id);
+
+	/* Try to ensure it's locked in mem */
+	lockdown_memory(ctdb->valgrinding);
+
+	/* pick the matching control opcode for the broadcast below */
+	if (ctdb_db_persistent(db)) {
+		opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
+	} else if (ctdb_db_replicated(db)) {
+		opcode = CTDB_CONTROL_DB_ATTACH_REPLICATED;
+	} else {
+		opcode = CTDB_CONTROL_DB_ATTACH;
+	}
+
+	/* tell all the other nodes about this database */
+	ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, opcode,
+				 0, CTDB_CTRL_FLAG_NOREPLY,
+				 indata, NULL, NULL);
+
+	/* success */
+	return 0;
+}
+
+/*
+ * Handle a DB detach control: detach this node from a volatile database.
+ *
+ * indata carries the 32-bit database id.  A request originating from a
+ * local client is broadcast to all nodes; the actual detach is only
+ * performed when the control arrives from another daemon (client_id 0).
+ * Returns 0 on success, -1 on error.
+ */
+int32_t ctdb_control_db_detach(struct ctdb_context *ctdb, TDB_DATA indata,
+			       uint32_t client_id)
+{
+	uint32_t db_id;
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_client *client = NULL;
+
+	db_id = *(uint32_t *)indata.dptr;
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, ("Invalid dbid 0x%08x in DB detach\n",
+				  db_id));
+		return -1;
+	}
+
+	if (ctdb->tunable.allow_client_db_attach == 1) {
+		/* Fixed message: the tunable is AllowClientDBAttach, not
+		 * AllowClientDBAccess */
+		DEBUG(DEBUG_ERR, ("DB detach from database %s denied. "
+				  "Clients are allowed access to databases "
+				  "(AllowClientDBAttach == 1)\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	if (! ctdb_db_volatile(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Detaching non-volatile database %s denied\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	/* Cannot detach from database when in recovery */
+	if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
+		DEBUG(DEBUG_ERR, ("DB detach denied while in recovery\n"));
+		return -1;
+	}
+
+	/* If a control comes from a client, then broadcast it to all nodes.
+	 * Do the actual detach only if the control comes from other daemons.
+	 */
+	if (client_id != 0) {
+		client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+		if (client != NULL) {
+			/* forward the control to all the nodes */
+			ctdb_daemon_send_control(ctdb,
+						 CTDB_BROADCAST_CONNECTED, 0,
+						 CTDB_CONTROL_DB_DETACH, 0,
+						 CTDB_CTRL_FLAG_NOREPLY,
+						 indata, NULL, NULL);
+			return 0;
+		}
+		DEBUG(DEBUG_ERR, ("Client has gone away. Failing DB detach "
+				  "for database '%s'\n", ctdb_db->db_name));
+		return -1;
+	}
+
+	/* Disable vacuuming and drop all vacuuming data */
+	talloc_free(ctdb_db->vacuum_handle);
+	talloc_free(ctdb_db->delete_queue);
+	talloc_free(ctdb_db->fetch_queue);
+
+	/* Terminate any deferred fetch */
+	talloc_free(ctdb_db->deferred_fetch);
+
+	/* Terminate any traverses (destructors unlink each entry) */
+	while (ctdb_db->traverse) {
+		talloc_free(ctdb_db->traverse);
+	}
+
+	/* Terminate any revokes */
+	while (ctdb_db->revokechild_active) {
+		talloc_free(ctdb_db->revokechild_active);
+	}
+
+	/* Free readonly tracking database */
+	if (ctdb_db_readonly(ctdb_db)) {
+		talloc_free(ctdb_db->rottdb);
+	}
+
+	DLIST_REMOVE(ctdb->db_list, ctdb_db);
+
+	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
+			     ctdb_db->db_name));
+	talloc_free(ctdb_db);
+
+	return 0;
+}
+
+/*
+  attach to all existing persistent databases
+
+  Scans db_directory_persistent for files named "<name>.tdb.<pnn>" where
+  <pnn> is this node's pnn, and attaches each one.  unhealthy_reason, if
+  non-NULL, marks the databases as needing manual verification.
+  Returns 0 on success (including "directory missing"), -1 on error.
+ */
+static int ctdb_attach_persistent(struct ctdb_context *ctdb,
+				  const char *unhealthy_reason)
+{
+	DIR *d;
+	struct dirent *de;
+
+	/* open the persistent db directory and scan it for files */
+	d = opendir(ctdb->db_directory_persistent);
+	if (d == NULL) {
+		/* no directory yet - nothing to attach */
+		return 0;
+	}
+
+	while ((de=readdir(d))) {
+		char *p, *s, *q;
+		size_t len = strlen(de->d_name);
+		uint32_t node;
+		int invalid_name = 0;
+
+		/* work on a mutable copy of the directory entry name */
+		s = talloc_strdup(ctdb, de->d_name);
+		if (s == NULL) {
+			closedir(d);
+			CTDB_NO_MEMORY(ctdb, s);
+		}
+
+		/* only accept names ending in .tdb */
+		p = strstr(s, ".tdb.");
+		if (len < 7 || p == NULL) {
+			talloc_free(s);
+			continue;
+		}
+
+		/* only accept names ending with .tdb. and any number of digits */
+		q = p+5;
+		while (*q != 0 && invalid_name == 0) {
+			if (!isdigit(*q++)) {
+				invalid_name = 1;
+			}
+		}
+		/* the numeric suffix must be this node's own pnn */
+		if (invalid_name == 1 || sscanf(p+5, "%u", &node) != 1 || node != ctdb->pnn) {
+			DEBUG(DEBUG_ERR,("Ignoring persistent database '%s'\n", de->d_name));
+			talloc_free(s);
+			continue;
+		}
+		/* truncate "<name>.tdb.<pnn>" down to "<name>.tdb" */
+		p[4] = 0;
+
+		if (ctdb_local_attach(ctdb, s, CTDB_DB_FLAGS_PERSISTENT, unhealthy_reason) != 0) {
+			DEBUG(DEBUG_ERR,("Failed to attach to persistent database '%s'\n", de->d_name));
+			closedir(d);
+			talloc_free(s);
+			return -1;
+		}
+
+		DEBUG(DEBUG_INFO,("Attached to persistent database %s\n", s));
+
+		talloc_free(s);
+	}
+	closedir(d);
+	return 0;
+}
+
+/*
+ * Open (and verify) the per-node persistent-health tdb, then attach to
+ * all persistent databases.  If the health tdb cannot be opened or
+ * fails tdb_check(), it is recreated once with TDB_CLEAR_IF_FIRST and
+ * all persistent databases are flagged for manual verification.
+ * Returns 0 on success, -1 on error.
+ */
+int ctdb_attach_databases(struct ctdb_context *ctdb)
+{
+	int ret;
+	char *persistent_health_path = NULL;
+	char *unhealthy_reason = NULL;
+	bool first_try = true;
+
+	persistent_health_path = talloc_asprintf(ctdb, "%s/%s.%u",
+						 ctdb->db_directory_state,
+						 PERSISTENT_HEALTH_TDB,
+						 ctdb->pnn);
+	if (persistent_health_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+again:
+
+	ctdb->db_persistent_health = tdb_wrap_open(ctdb, persistent_health_path,
+						   0, TDB_DISALLOW_NESTING,
+						   O_CREAT | O_RDWR, 0600);
+	if (ctdb->db_persistent_health == NULL) {
+		struct tdb_wrap *tdb;
+
+		/* only one recovery attempt with CLEAR_IF_FIRST */
+		if (!first_try) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+		first_try = false;
+
+		unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+						   persistent_health_path,
+						   "was cleared after a failure",
+						   "manual verification needed");
+		if (unhealthy_reason == NULL) {
+			DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+			talloc_free(persistent_health_path);
+			return -1;
+		}
+
+		DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - retrying after CLEAR_IF_FIRST\n",
+				  persistent_health_path));
+		tdb = tdb_wrap_open(ctdb, persistent_health_path,
+				    0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+				    O_CREAT | O_RDWR, 0600);
+		/* BUGFIX: the error path must trigger when the open FAILED
+		 * (tdb == NULL); the original "if (tdb)" returned failure on
+		 * success and looped on failure */
+		if (tdb == NULL) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+
+		/* close the cleared tdb and reopen it the normal way */
+		talloc_free(tdb);
+		goto again;
+	}
+	ret = tdb_check(ctdb->db_persistent_health->tdb, NULL, NULL);
+	if (ret != 0) {
+		struct tdb_wrap *tdb;
+
+		talloc_free(ctdb->db_persistent_health);
+		ctdb->db_persistent_health = NULL;
+
+		if (!first_try) {
+			DEBUG(DEBUG_CRIT,("tdb_check('%s') failed\n",
+					  persistent_health_path));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+		first_try = false;
+
+		unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+						   persistent_health_path,
+						   "was cleared after a failure",
+						   "manual verification needed");
+		if (unhealthy_reason == NULL) {
+			DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+			talloc_free(persistent_health_path);
+			return -1;
+		}
+
+		DEBUG(DEBUG_CRIT,("tdb_check('%s') failed - retrying after CLEAR_IF_FIRST\n",
+				  persistent_health_path));
+		tdb = tdb_wrap_open(ctdb, persistent_health_path,
+				    0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+				    O_CREAT | O_RDWR, 0600);
+		/* BUGFIX: same inverted check as above - fail only when the
+		 * reopen with CLEAR_IF_FIRST did not produce a handle */
+		if (tdb == NULL) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+
+		talloc_free(tdb);
+		goto again;
+	}
+	talloc_free(persistent_health_path);
+
+	ret = ctdb_attach_persistent(ctdb, unhealthy_reason);
+	talloc_free(unhealthy_reason);
+	if (ret != 0) {
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+  called when a broadcast seqnum update comes in
+
+  Bumps the local tdb sequence number for the given database so that it
+  stays in step with the sending node.  Updates from this node itself
+  are ignored.  Returns 0 on success, -1 on error.
+ */
+int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode)
+{
+	struct ctdb_db_context *ctdb_db;
+	if (srcnode == ctdb->pnn) {
+		/* don't update ourselves! */
+		return 0;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id));
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* fixed typo in the log message: "unhealty" -> "unhealthy" */
+		DEBUG(DEBUG_ERR,("db(%s) unhealthy in ctdb_ltdb_update_seqnum: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
+	tdb_increment_seqnum_nonblock(ctdb_db->ltdb->tdb);
+	ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
+	return 0;
+}
+
+/*
+  timer to check for seqnum changes in a ltdb and propagate them
+
+  Runs periodically (every SeqnumInterval milliseconds); if the tdb
+  sequence number moved since the last check, the database id is
+  broadcast to all active nodes so they bump their copies too.
+ */
+static void ctdb_ltdb_seqnum_check(struct tevent_context *ev,
+				   struct tevent_timer *te,
+				   struct timeval t, void *p)
+{
+	struct ctdb_db_context *ctdb_db = talloc_get_type(p, struct ctdb_db_context);
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	uint32_t new_seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
+	if (new_seqnum != ctdb_db->seqnum) {
+		/* something has changed - propagate it */
+		TDB_DATA data;
+		data.dptr = (uint8_t *)&ctdb_db->db_id;
+		data.dsize = sizeof(uint32_t);
+		/* fire-and-forget broadcast; no reply expected */
+		ctdb_daemon_send_control(ctdb,
+					 CTDB_BROADCAST_ACTIVE,
+					 0,
+					 CTDB_CONTROL_UPDATE_SEQNUM,
+					 0,
+					 CTDB_CTRL_FLAG_NOREPLY,
+					 data,
+					 NULL,
+					 NULL);
+	}
+	ctdb_db->seqnum = new_seqnum;
+
+	/* setup a new timer (one-shot timers must be re-armed each run) */
+	ctdb_db->seqnum_update =
+		tevent_add_timer(ctdb->ev, ctdb_db,
+				 timeval_current_ofs(ctdb->tunable.seqnum_interval/1000,
+						     (ctdb->tunable.seqnum_interval%1000)*1000),
+				 ctdb_ltdb_seqnum_check, ctdb_db);
+}
+
+/*
+ * Enable tdb sequence-number tracking for a database and make sure the
+ * periodic checker timer that propagates seqnum changes is running.
+ * Returns 0 on success, -1 if the database id is unknown.
+ */
+int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id)
+{
+	struct ctdb_db_context *db = find_ctdb_db(ctdb, db_id);
+
+	if (db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id));
+		return -1;
+	}
+
+	/* only ever arm one checker timer per database */
+	if (db->seqnum_update == NULL) {
+		uint32_t interval_ms = ctdb->tunable.seqnum_interval;
+
+		db->seqnum_update = tevent_add_timer(
+			ctdb->ev, db,
+			timeval_current_ofs(interval_ms / 1000,
+					    (interval_ms % 1000) * 1000),
+			ctdb_ltdb_seqnum_check, db);
+	}
+
+	tdb_enable_seqnum(db->ltdb->tdb);
+	db->seqnum = tdb_get_seqnum(db->ltdb->tdb);
+	return 0;
+}
+
+/*
+ * Enable sticky-record handling for a volatile database.  Idempotent:
+ * returns 0 immediately if the database is already sticky.  Returns -1
+ * for non-volatile databases or on allocation failure.
+ */
+int ctdb_set_db_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db_sticky(ctdb_db)) {
+		/* already sticky - nothing to do */
+		return 0;
+	}
+
+	if (! ctdb_db_volatile(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Non-volatile databases do not support sticky flag\n"));
+		return -1;
+	}
+
+	ctdb_db->sticky_records = trbt_create(ctdb_db, 0);
+	/* BUGFIX: check the allocation instead of silently enabling sticky
+	 * handling with a NULL tree */
+	if (ctdb_db->sticky_records == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to create sticky records tree\n"));
+		return -1;
+	}
+
+	ctdb_db_set_sticky(ctdb_db);
+
+	DEBUG(DEBUG_NOTICE,("set db sticky %s\n", ctdb_db->db_name));
+
+	return 0;
+}
+
+/*
+ * Reset all per-database statistics, releasing any recorded hot keys.
+ */
+void ctdb_db_statistics_reset(struct ctdb_db_context *ctdb_db)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < MAX_HOT_KEYS; idx++) {
+		ctdb_db->hot_keys[idx].count = 0;
+		ctdb_db->hot_keys[idx].last_logged_count = 0;
+		/* free the recorded key data, if any */
+		if (ctdb_db->hot_keys[idx].key.dsize > 0) {
+			TALLOC_FREE(ctdb_db->hot_keys[idx].key.dptr);
+			ctdb_db->hot_keys[idx].key.dsize = 0;
+		}
+	}
+
+	ZERO_STRUCT(ctdb_db->statistics);
+}
+
+/*
+ * Marshal the statistics of one database into wire format for a
+ * GET_DB_STATISTICS control reply.  The hot-key blobs are appended
+ * after the fixed-size header in hot_keys_wire.  Returns 0 on success,
+ * -1 if the database id is unknown or allocation fails.
+ */
+int32_t ctdb_control_get_db_statistics(struct ctdb_context *ctdb,
+				       uint32_t db_id,
+				       TDB_DATA *outdata)
+{
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_db_statistics_old *stats;
+	unsigned int i;
+	size_t len;
+	char *ptr;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in get_db_statistics\n", db_id));
+		return -1;
+	}
+
+	/* fixed header size plus the total size of all hot-key blobs */
+	len = offsetof(struct ctdb_db_statistics_old, hot_keys_wire);
+	for (i = 0; i < MAX_HOT_KEYS; i++) {
+		struct ctdb_db_statistics_old *s = &ctdb_db->statistics;
+
+		/* mirror the live hot-key table into the statistics struct
+		 * (dptr values are local pointers; the actual key bytes are
+		 * serialised into hot_keys_wire below) */
+		s->hot_keys[i].key.dsize = ctdb_db->hot_keys[i].key.dsize;
+		s->hot_keys[i].key.dptr  = ctdb_db->hot_keys[i].key.dptr;
+		s->hot_keys[i].count     = ctdb_db->hot_keys[i].count;
+
+		len += s->hot_keys[i].key.dsize;
+	}
+
+	stats = talloc_size(outdata, len);
+	if (stats == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate db statistics structure\n"));
+		return -1;
+	}
+
+	memcpy(stats, &ctdb_db->statistics,
+	       offsetof(struct ctdb_db_statistics_old, hot_keys_wire));
+
+	stats->num_hot_keys = MAX_HOT_KEYS;
+
+	/* append the raw key bytes back-to-back after the header */
+	ptr = &stats->hot_keys_wire[0];
+	for (i = 0; i < MAX_HOT_KEYS; i++) {
+		memcpy(ptr, ctdb_db->statistics.hot_keys[i].key.dptr,
+		       ctdb_db->statistics.hot_keys[i].key.dsize);
+		ptr += ctdb_db->statistics.hot_keys[i].key.dsize;
+	}
+
+	outdata->dptr  = (uint8_t *)stats;
+	outdata->dsize = len;
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
new file mode 100644
index 0000000..ab58ec4
--- /dev/null
+++ b/ctdb/server/ctdb_monitor.c
@@ -0,0 +1,509 @@
+/*
+ monitoring links to all other nodes to detect dead nodes
+
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/* State of the health-monitoring subsystem */
+struct ctdb_monitor_state {
+	/* talloc parent of all monitor timers; freeing it stops monitoring */
+	TALLOC_CTX *monitor_context;
+	/* seconds until the next monitor event (doubles up to MonitorInterval) */
+	uint32_t next_interval;
+	/* consecutive "monitor" event-script timeouts seen so far */
+	uint32_t event_script_timeouts;
+};
+
+static void ctdb_check_health(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+
+/*
+ * Run the configured notification script with EVENT as its argument.
+ * Executed in a forked child (see ctdb_run_notification_script), so
+ * blocking in system() is acceptable here.  Returns the script's exit
+ * status, or -1 if the script is missing/not executable.
+ */
+static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event)
+{
+	struct stat st;
+	int ret;
+	char *cmd;
+
+	if (stat(ctdb->notification_script, &st) != 0) {
+		DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script));
+		return -1;
+	}
+	if (!(st.st_mode & S_IXUSR)) {
+		DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script));
+		return -1;
+	}
+
+	/* cmd is talloc'd off ctdb; the child exits right after, so it is
+	 * never explicitly freed */
+	cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event);
+	CTDB_NO_MEMORY(ctdb, cmd);
+
+	ret = system(cmd);
+	/* if the system() call was successful, translate ret into the
+	   return code from the command
+	*/
+	if (ret != -1) {
+		ret = WEXITSTATUS(ret);
+	}
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, ret));
+	}
+
+	return ret;
+}
+
+/*
+ * Fire the configured notification script for EVENT in a forked child.
+ * Fork failures and script failures are logged and otherwise ignored.
+ */
+void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event)
+{
+	pid_t pid;
+
+	if (ctdb->notification_script == NULL) {
+		/* notifications are optional - nothing configured */
+		return;
+	}
+
+	pid = ctdb_fork(ctdb);
+	if (pid == (pid_t)-1) {
+		DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n"));
+		return;
+	}
+
+	if (pid == 0) {
+		/* child: run the script, then exit without returning */
+		int status;
+
+		prctl_set_comment("ctdb_notification");
+		status = ctdb_run_notification_script_child(ctdb, event);
+		if (status != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Notification script failed\n"));
+		}
+		_exit(0);
+	}
+}
+
+/*
+  called when a health monitoring event script finishes
+
+  Translates the script status into the node's UNHEALTHY flag, applies
+  the monitor-timeout escalation policy, reschedules the next health
+  check with exponential backoff, and - if the flags changed - asks the
+  recovery daemon to push the new flags and rerun IP takeover.
+ */
+static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+	TDB_DATA data;
+	struct ctdb_node_flag_change c;
+	uint32_t next_interval;
+	int ret;
+	TDB_DATA rddata;
+	struct ctdb_srvid_message rd;
+	const char *state_str = NULL;
+
+	c.pnn = ctdb->pnn;
+	c.old_flags = node->flags;
+
+	ZERO_STRUCT(rd);
+	rd.pnn   = ctdb->pnn;
+	rd.srvid = 0;
+
+	rddata.dptr = (uint8_t *)&rd;
+	rddata.dsize = sizeof(rd);
+
+	if (status == ECANCELED) {
+		/* event was aborted (e.g. monitoring stopped) - just
+		 * reschedule, don't change health */
+		DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n"));
+		goto after_change_status;
+	}
+
+	if (status == ETIMEDOUT) {
+		ctdb->monitor->event_script_timeouts++;
+
+		if (ctdb->monitor->event_script_timeouts >=
+		    ctdb->tunable.monitor_timeout_count) {
+			/* too many consecutive timeouts: treat as failure
+			 * (falls through to the status != 0 handling below) */
+			DEBUG(DEBUG_ERR,
+			      ("Maximum monitor timeout count %u reached."
+			       " Making node unhealthy\n",
+			       ctdb->tunable.monitor_timeout_count));
+		} else {
+			/* We pretend this is OK. */
+			goto after_change_status;
+		}
+	} else {
+		/* any non-timeout result resets the timeout counter */
+		ctdb->monitor->event_script_timeouts = 0;
+	}
+
+	if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
+		DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
+		node->flags |= NODE_FLAGS_UNHEALTHY;
+		/* re-check soon after a state change */
+		ctdb->monitor->next_interval = 5;
+
+		ctdb_run_notification_script(ctdb, "unhealthy");
+	} else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
+		DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
+		node->flags &= ~NODE_FLAGS_UNHEALTHY;
+		ctdb->monitor->next_interval = 5;
+
+		ctdb_run_notification_script(ctdb, "healthy");
+	}
+
+after_change_status:
+	next_interval = ctdb->monitor->next_interval;
+
+	/* exponential backoff, capped at MonitorInterval */
+	ctdb->monitor->next_interval *= 2;
+	if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
+		ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
+	}
+
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(next_interval, 0),
+			 ctdb_check_health, ctdb);
+
+	if (c.old_flags == node->flags) {
+		/* no flag change - nothing to broadcast */
+		return;
+	}
+
+	c.new_flags = node->flags;
+
+	data.dptr = (uint8_t *)&c;
+	data.dsize = sizeof(c);
+
+	/* ask the recovery daemon to push these changes out to all nodes */
+	ctdb_daemon_send_message(ctdb, ctdb->pnn,
+				 CTDB_SRVID_PUSH_NODE_FLAGS, data);
+
+	if (c.new_flags & NODE_FLAGS_UNHEALTHY) {
+		state_str = "UNHEALTHY";
+	} else {
+		state_str = "HEALTHY";
+	}
+
+	/* ask the recmaster to reallocate all addresses */
+	DEBUG(DEBUG_ERR,
+	      ("Node became %s. Ask recovery master to reallocate IPs\n",
+	       state_str));
+	ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " Failed to send IP takeover run request\n"));
+	}
+}
+
+
+static void ctdb_run_startup(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+/*
+  called when the startup event script finishes
+
+  On failure the "startup" event is retried in 5 seconds.  On success
+  the daemon enters RUNNING state, other nodes are notified, and
+  periodic health monitoring begins.
+ */
+static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,("startup event failed\n"));
+		/* retry the whole startup event later */
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(5, 0),
+				 ctdb_run_startup, ctdb);
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
+	ctdb->monitor->next_interval = 2;
+	ctdb_run_notification_script(ctdb, "startup");
+
+	/* tell all other nodes we've just started up */
+	ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED,
+				 0, CTDB_CONTROL_STARTUP, 0,
+				 CTDB_CTRL_FLAG_NOREPLY,
+				 tdb_null, NULL, NULL);
+
+	/* kick off the first health check */
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(ctdb->monitor->next_interval, 0),
+			 ctdb_check_health, ctdb);
+}
+
+/*
+ * Launch the "startup" event script once the daemon has reached the
+ * STARTUP runstate; before that, or on launch failure, re-arm a timer
+ * and try again.  Completion is handled by ctdb_startup_callback().
+ */
+static void ctdb_run_startup(struct tevent_context *ev,
+			     struct tevent_timer *te,
+			     struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data,
+						    struct ctdb_context);
+	int ret;
+
+	/* This is necessary to avoid the "startup" event colliding
+	 * with the "ipreallocated" event from the takeover run
+	 * following the first recovery.  We might as well serialise
+	 * these things if we can.
+	 */
+	if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
+		DEBUG(DEBUG_NOTICE,
+		      ("Not yet in startup runstate. Wait one more second\n"));
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_run_startup, ctdb);
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
+	ret = ctdb_event_script_callback(ctdb,
+					 ctdb->monitor->monitor_context,
+					 ctdb_startup_callback,
+					 ctdb, CTDB_EVENT_STARTUP, "%s", "");
+
+	if (ret != 0) {
+		/* could not even launch the script - retry in 5 seconds */
+		DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(5, 0),
+				 ctdb_run_startup, ctdb);
+	}
+}
+
+/*
+  wait until we have finished initial recoveries before we start the
+  monitoring events
+
+  Re-arms itself every second until: a valid generation exists, recovery
+  mode is NORMAL, the re-recovery grace period has elapsed, and the
+  persistent databases pass a health re-check.  Then schedules
+  ctdb_run_startup().  Repeated persistent-health failures eventually
+  shut the daemon down.
+ */
+static void ctdb_wait_until_recovered(struct tevent_context *ev,
+				      struct tevent_timer *te,
+				      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	int ret;
+	static int count = 0;
+
+	count++;
+
+	/* log every second for the first minute, then once per 10 minutes */
+	if (count < 60 || count%600 == 0) {
+		DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
+		if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) {
+			DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n"));
+		}
+	}
+
+	/* no recovery has completed yet - keep waiting */
+	if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_wait_until_recovered, ctdb);
+		return;
+	}
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+		DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_wait_until_recovered, ctdb);
+		return;
+	}
+
+
+	/* let the re-recovery grace period expire before continuing */
+	if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+		DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
+
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_wait_until_recovered, ctdb);
+		return;
+	}
+
+	/* health re-check already done for this generation - skip it */
+	if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
+		DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
+				  "until the next recovery\n"));
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_wait_until_recovered, ctdb);
+		return;
+	}
+
+	ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
+	ret = ctdb_recheck_persistent_health(ctdb);
+	if (ret != 0) {
+		ctdb->db_persistent_check_errors++;
+		if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
+			/* transient failure - retry in a second */
+			DEBUG(DEBUG_ERR,
+			      (__location__ "ctdb_recheck_persistent_health() "
+			       "failed (%llu of %llu times) - retry later\n",
+			       (unsigned long long)ctdb->db_persistent_check_errors,
+			       (unsigned long long)ctdb->max_persistent_check_errors));
+			tevent_add_timer(ctdb->ev,
+					 ctdb->monitor->monitor_context,
+					 timeval_current_ofs(1, 0),
+					 ctdb_wait_until_recovered, ctdb);
+			return;
+		}
+		/* persistent failure - give up and shut the daemon down */
+		DEBUG(DEBUG_ALERT,(__location__
+				   "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
+				   (unsigned long long)ctdb->db_persistent_check_errors));
+		ctdb_shutdown_sequence(ctdb, 11);
+		/* In case above returns due to duplicate shutdown */
+		return;
+	}
+	ctdb->db_persistent_check_errors = 0;
+
+	/* all preconditions met - run the "startup" event immediately */
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current(), ctdb_run_startup, ctdb);
+}
+
+
+/*
+ * Periodic health check: run the "monitor" event script, unless the
+ * node is in a state where monitoring must be skipped, in which case
+ * the check is simply rescheduled.
+ */
+static void ctdb_check_health(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	bool skip_monitoring;
+	int ret;
+
+	/* no monitoring during recovery, while inactive, or before RUNNING */
+	skip_monitoring = (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
+			   ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE ||
+			   ctdb->runstate != CTDB_RUNSTATE_RUNNING);
+
+	if (!skip_monitoring && ctdb_db_all_frozen(ctdb)) {
+		DEBUG(DEBUG_ERR,
+		      ("Skip monitoring since databases are frozen\n"));
+		skip_monitoring = true;
+	}
+
+	if (skip_monitoring) {
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(ctdb->monitor->next_interval, 0),
+				 ctdb_check_health, ctdb);
+		return;
+	}
+
+	ret = ctdb_event_script_callback(ctdb,
+					 ctdb->monitor->monitor_context,
+					 ctdb_health_callback,
+					 ctdb, CTDB_EVENT_MONITOR, "%s", "");
+	if (ret != 0) {
+		/* could not launch the script - retry in 5 seconds */
+		DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
+		ctdb->monitor->next_interval = 5;
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(5, 0),
+				 ctdb_check_health, ctdb);
+	}
+}
+
+/* stop any monitoring
+   this should only be done when shutting down the daemon
+*/
+void ctdb_stop_monitoring(struct ctdb_context *ctdb)
+{
+	if (ctdb->monitor == NULL) {
+		D_NOTICE("Monitoring not yet initialised\n");
+		return;
+	}
+
+	/* freeing the context cancels all pending monitor timers */
+	TALLOC_FREE(ctdb->monitor->monitor_context);
+
+	ctdb->monitor->next_interval = 5;
+	DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n"));
+}
+
+/*
+  start watching for nodes that might be dead
+
+  Initialises the monitor state and arms the timer that waits for the
+  first recovery to complete before the "startup" event and regular
+  health monitoring begin.
+ */
+void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
+{
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
+
+	ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
+
+	/* sub-context so ctdb_stop_monitoring() can cancel all timers */
+	ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
+
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(1, 0),
+			 ctdb_wait_until_recovered, ctdb);
+}
+
+
+/*
+  modify flags on a node
+
+  Handles a MODFLAGS control carrying a ctdb_node_flag_change.  Changes
+  for this node itself, or for disconnected nodes, are ignored; the
+  DISCONNECTED bit is always kept under local control.  Returns 0 on
+  success, -1 for an unknown node.
+ */
+int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr;
+	struct ctdb_node *node;
+	uint32_t old_flags;
+
+	/*
+	 * Don't let other nodes override the current node's flags.
+	 * The recovery master fetches flags from this node so there's
+	 * no need to push them back.  Doing so is racy.
+	 */
+	if (c->pnn == ctdb->pnn) {
+		DBG_DEBUG("Ignoring flag changes for current node\n");
+		return 0;
+	}
+
+	node = ctdb_find_node(ctdb, c->pnn);
+	if (node == NULL) {
+		DBG_ERR("Node %u is invalid\n", c->pnn);
+		return -1;
+	}
+
+	if (node->flags & NODE_FLAGS_DISCONNECTED) {
+		DBG_DEBUG("Ignoring flag changes for disconnected node\n");
+		return 0;
+	}
+
+	/*
+	 * Remember the old flags.  We don't care what some other node
+	 * thought the old flags were - that's irrelevant.
+	 */
+	old_flags = node->flags;
+
+	/*
+	 * This node tracks nodes it is connected to, so don't let
+	 * another node override this
+	 */
+	node->flags =
+		(old_flags & NODE_FLAGS_DISCONNECTED) |
+		(c->new_flags & ~NODE_FLAGS_DISCONNECTED);
+
+	if (node->flags == old_flags) {
+		/* nothing actually changed */
+		return 0;
+	}
+
+	D_NOTICE("Node %u has changed flags - 0x%x -> 0x%x\n",
+		 c->pnn,
+		 old_flags,
+		 node->flags);
+
+	if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) {
+		/* a node going fully healthy during startup forces a
+		 * recovery so it gets integrated properly */
+		DBG_ERR("Node %u became healthy - force recovery for startup\n",
+			c->pnn);
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_mutex_fcntl_helper.c b/ctdb/server/ctdb_mutex_fcntl_helper.c
new file mode 100644
index 0000000..aac98ea
--- /dev/null
+++ b/ctdb/server/ctdb_mutex_fcntl_helper.c
@@ -0,0 +1,795 @@
+/*
+ CTDB mutex fcntl lock file helper
+
+ Copyright (C) Martin Schwenke 2015
+
+ wait_for_parent() code from ctdb_lock_helper.c:
+
+ Copyright (C) Amitay Isaacs 2013
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "system/dir.h"
+
+#include <tevent.h>
+
+#include "lib/util/sys_rw.h"
+#include "lib/util/tevent_unix.h"
+#include "lib/util/util.h"
+#include "lib/util/smb_strtox.h"
+
+/* protocol.h is just needed for ctdb_sock_addr, which is used in system.h */
+#include "protocol/protocol.h"
+#include "common/system.h"
+#include "common/tmon.h"
+
+static char progpath[PATH_MAX];
+static char *progname = NULL;
+
/*
 * Take an exclusive fcntl() write lock covering 1 byte at offset
 * start of fd.  If block is true the call waits for the lock
 * (F_SETLKW), otherwise it fails immediately on contention (F_SETLK).
 *
 * Returns 0 on success, otherwise the errno from fcntl() - EACCES or
 * EAGAIN indicate lock contention.
 */
static int fcntl_lock_fd(int fd, bool block, off_t start)
{
	/*
	 * Fully initialised on every call.  The previous version used
	 * a function-level static and patched only l_start, which made
	 * the helper needlessly non-reentrant and carried hidden state
	 * between calls.
	 */
	struct flock lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = start,
		.l_len = 1,
		.l_pid = 0,
	};
	int cmd = block ? F_SETLKW : F_SETLK;

	if (fcntl(fd, cmd, &lock) != 0) {
		return errno;
	}

	return 0;
}
+
+static char fcntl_lock(const char *file, int *outfd)
+{
+ int fd;
+ int ret;
+
+ fd = open(file, O_RDWR|O_CREAT, 0600);
+ if (fd == -1) {
+ fprintf(stderr, "%s: Unable to open %s - (%s)\n",
+ progname, file, strerror(errno));
+ return '3';
+ }
+
+ ret = fcntl_lock_fd(fd, false, 0);
+ if (ret != 0) {
+ close(fd);
+ if (ret == EACCES || ret == EAGAIN) {
+ /* Lock contention, fail silently */
+ return '1';
+ }
+
+ /* Log an error for any other failure */
+ fprintf(stderr,
+ "%s: Failed to get lock on '%s' - (%s)\n",
+ progname,
+ file,
+ strerror(ret));
+ return '3';
+ }
+
+ *outfd = fd;
+
+ return '0';
+}
+
+/*
+ * Wait and see if the parent exits
+ */
+
+struct wait_for_parent_state {
+ struct tevent_context *ev;
+ pid_t ppid;
+};
+
+static void wait_for_parent_check(struct tevent_req *subreq);
+
+static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ pid_t ppid)
+{
+ struct tevent_req *req, *subreq;
+ struct wait_for_parent_state *state;
+
+ req = tevent_req_create(mem_ctx, &state, struct wait_for_parent_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->ev = ev;
+ state->ppid = ppid;
+
+ if (ppid == 1) {
+ fprintf(stderr, "parent == 1\n");
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ subreq = tevent_wakeup_send(state, ev,
+ tevent_timeval_current_ofs(5,0));
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, wait_for_parent_check, req);
+
+ return req;
+}
+
+static void wait_for_parent_check(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct wait_for_parent_state *state = tevent_req_data(
+ req, struct wait_for_parent_state);
+ bool status;
+
+ status = tevent_wakeup_recv(subreq);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ /* Ignore error */
+ fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname);
+ }
+
+ if (kill(state->ppid, 0) == -1 && errno == ESRCH) {
+ fprintf(stderr, "parent gone\n");
+ tevent_req_done(req);
+ return;
+ }
+
+ subreq = tevent_wakeup_send(state, state->ev,
+ tevent_timeval_current_ofs(5,0));
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, wait_for_parent_check, req);
+}
+
/* Report completion status; false with *perr set on tevent error */
static bool wait_for_parent_recv(struct tevent_req *req, int *perr)
{
	return !tevent_req_is_unix_error(req, perr);
}
+
+/*
+ * Perform I/O on lock in a loop - complete when file removed or replaced
+ */
+
+struct lock_io_check_state {
+ struct tevent_context *ev;
+ const char *lock_file;
+ ino_t inode;
+ unsigned long recheck_interval;
+};
+
+static void lock_io_check_loop(struct tevent_req *subreq);
+
+static struct tevent_req *lock_io_check_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ const char *lock_file,
+ ino_t inode,
+ unsigned long recheck_interval)
+{
+ struct tevent_req *req, *subreq;
+ struct lock_io_check_state *state;
+
+ req = tevent_req_create(mem_ctx, &state, struct lock_io_check_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->ev = ev;
+ state->lock_file = lock_file;
+ state->inode = inode;
+ state->recheck_interval = recheck_interval;
+
+ subreq = tevent_wakeup_send(
+ state,
+ ev,
+ tevent_timeval_current_ofs(state->recheck_interval, 0));
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, lock_io_check_loop, req);
+
+ return req;
+}
+
/*
 * One iteration of the lock sanity check.  The request completes
 * (via the "done" label) when any check fails, signalling that the
 * lock must be treated as lost; otherwise the next iteration is
 * scheduled after recheck_interval seconds.
 */
static void lock_io_check_loop(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct lock_io_check_state *state = tevent_req_data(
		req, struct lock_io_check_state);
	bool status;
	struct stat sb;
	int fd = -1;
	int ret;

	status = tevent_wakeup_recv(subreq);
	TALLOC_FREE(subreq);
	if (! status) {
		/* Ignore error */
		fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname);
	}

	/* Reopen by name: detects the file being removed or renamed away */
	fd = open(state->lock_file, O_RDWR);
	if (fd == -1) {
		fprintf(stderr,
			"%s: "
			"lock lost - lock file \"%s\" open failed (ret=%d)\n",
			progname,
			state->lock_file,
			errno);
		goto done;
	}

	ret = fstat(fd, &sb);
	if (ret != 0) {
		fprintf(stderr,
			"%s: "
			"lock lost - lock file \"%s\" check failed (ret=%d)\n",
			progname,
			state->lock_file,
			errno);
		goto done;
	}

	/* A different inode means the file was replaced under us */
	if (sb.st_ino != state->inode) {
		fprintf(stderr,
			"%s: lock lost - lock file \"%s\" inode changed\n",
			progname,
			state->lock_file);
		goto done;
	}

	/*
	 * Attempt to lock a 2nd byte range.  Using a blocking lock
	 * encourages ping timeouts if the cluster filesystem is in a
	 * bad state.  It also makes testing easier.
	 */
	ret = fcntl_lock_fd(fd, true, 1);
	if (ret != 0) {
		fprintf(stderr,
			"%s: "
			"lock fail - lock file \"%s\" test lock error (%d)\n",
			progname,
			state->lock_file,
			ret);
		goto done;
	}

	/* Unlock occurs on close */
	close(fd);

	/* All checks passed - schedule the next round */
	subreq = tevent_wakeup_send(
		state,
		state->ev,
		tevent_timeval_current_ofs(state->recheck_interval, 0));
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, lock_io_check_loop, req);

	return;

done:
	if (fd != -1) {
		close(fd);
	}
	tevent_req_done(req);
}
+
/* Report completion status; false with *perr set on tevent error */
static bool lock_io_check_recv(struct tevent_req *req, int *perr)
{
	return !tevent_req_is_unix_error(req, perr);
}
+
+struct lock_test_child_state {
+};
+
+static void lock_test_child_ping_done(struct tevent_req *subreq);
+static void lock_test_child_io_check_done(struct tevent_req *subreq);
+
+static struct tevent_req *lock_test_child_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ const char *lock_file,
+ int fd,
+ ino_t inode,
+ unsigned long recheck_interval,
+ bool send_pings)
+{
+ struct tevent_req *req, *subreq;
+ struct lock_test_child_state *state;
+ unsigned int interval = send_pings ? 1 : 0;
+
+ req = tevent_req_create(mem_ctx, &state, struct lock_test_child_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ subreq = tmon_ping_send(state, ev, fd, TMON_FD_BOTH, 0, interval);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, lock_test_child_ping_done, req);
+
+ subreq = lock_io_check_send(state,
+ ev,
+ lock_file,
+ inode,
+ recheck_interval);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, lock_test_child_io_check_done, req);
+
+ return req;
+}
+
+static void lock_test_child_ping_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ bool status;
+ int err;
+
+ status = tmon_ping_recv(subreq, &err);
+ TALLOC_FREE(subreq);
+ if (!status) {
+ tevent_req_error(req, err);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+
+static void lock_test_child_io_check_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ bool status;
+ int err;
+
+ status = lock_io_check_recv(subreq, &err);
+ TALLOC_FREE(subreq);
+ if (!status) {
+ tevent_req_error(req, err);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+
/*
 * Report completion status.  EPIPE is treated as success because it
 * simply means the parent end of the pipe went away, which is the
 * expected shutdown path.
 */
static bool lock_test_child_recv(struct tevent_req *req, int *perr)
{
	if (!tevent_req_is_unix_error(req, perr)) {
		return true;
	}

	/* Parent exit is expected */
	return *perr == EPIPE;
}
+
/*
 * Entry point for the forked lock-test child.  Records the inode of
 * the inherited lock fd, closes that fd (the parent keeps the lock),
 * then runs ping/IO monitoring until a stop condition occurs.  Never
 * returns: exits 0 on clean completion, 1 on error.
 */
static void lock_test_child(const char *lock_file,
			    int lock_fd,
			    int pipe_fd,
			    unsigned long recheck_interval,
			    bool send_pings)
{
	struct tevent_context *ev;
	struct tevent_req *req;
	struct stat sb;
	ino_t inode;
	bool status;
	int ret;

	/* Capture the inode so later rechecks can detect replacement */
	ret = fstat(lock_fd, &sb);
	if (ret != 0) {
		fprintf(stderr,
			"%s: lock lost - "
			"lock file \"%s\" stat failed (ret=%d)\n",
			progname,
			lock_file,
			errno);
		_exit(1);
	}
	inode = sb.st_ino;
	close(lock_fd);

	ev = tevent_context_init(NULL);
	if (ev == NULL) {
		fprintf(stderr, "%s: tevent_context_init() failed\n", progname);
		_exit(1);
	}

	req = lock_test_child_send(ev,
				   ev,
				   lock_file,
				   pipe_fd,
				   inode,
				   recheck_interval,
				   send_pings);
	if (req == NULL) {
		fprintf(stderr,
			"%s: lock_test_child_send() failed\n",
			progname);
		_exit(1);
	}

	/* Blocks until the request completes or errors */
	tevent_req_poll(req, ev);

	status = lock_test_child_recv(req, &ret);
	if (! status) {
		fprintf(stderr,
			"%s: lock_test_child_recv() failed (%d)\n",
			progname,
			ret);
		_exit(1);
	}

	_exit(0);
}
+
struct lock_test_state {
	int *lock_fdp;		/* points at caller's lock fd; cleared on loss */
	int pipe_fd;		/* parent end of the socketpair to the child */
	pid_t child_pid;	/* forked lock-test child */
};

static void lock_test_ping_done(struct tevent_req *subreq);

/*
 * Fork a child that repeatedly sanity-checks the lock file and
 * (optionally) sends keepalive pings back over a socketpair.  The
 * request completes when the child stops pinging or exits, at which
 * point the lock is released on the parent side.
 */
static struct tevent_req *lock_test_send(TALLOC_CTX *mem_ctx,
					 struct tevent_context *ev,
					 const char *lock_file,
					 int *fdp,
					 unsigned long recheck_interval,
					 unsigned long ping_timeout)
{
	struct tevent_req *req, *subreq;
	struct lock_test_state *state;
	pid_t pid;
	int sv[2];
	int ret;

	req = tevent_req_create(mem_ctx, &state, struct lock_test_state);
	if (req == NULL) {
		return NULL;
	}

	ret = socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
	if (ret != 0) {
		fprintf(stderr,
			"%s: socketpair() failed (errno=%d)\n",
			progname,
			errno);
		tevent_req_error(req, errno);
		return tevent_req_post(req, ev);
	}

	pid = fork();
	if (pid == -1) {

		int err = errno;
		fprintf(stderr, "%s: fork() failed (errno=%d)\n", progname, err);
		close(sv[0]);
		close(sv[1]);
		tevent_req_error(req, err);
		return tevent_req_post(req, ev);
	}
	if (pid == 0) {
		/* Child: keep sv[1], drop the parent's end and event loop */
		close(sv[0]);
		TALLOC_FREE(ev);

		lock_test_child(lock_file,
				*fdp,
				sv[1],
				recheck_interval,
				ping_timeout != 0);
		/* Above does not return */
	}

	/* Parent: keep sv[0], drop the child's end */
	close(sv[1]);

	state->lock_fdp = fdp;
	state->pipe_fd = sv[0];
	state->child_pid = pid;

	/* ping interval 0 here: the parent only listens, never sends */
	subreq = tmon_ping_send(state, ev, sv[0], TMON_FD_BOTH, ping_timeout, 0);
	if (tevent_req_nomem(subreq, req)) {
		close(sv[0]);
		return tevent_req_post(req, ev);
	}
	tevent_req_set_callback(subreq, lock_test_ping_done, req);

	return req;
}
+
/*
 * The ping monitor on the child socketpair has finished, meaning the
 * lock-test child died, stopped pinging or reported lock loss.
 * Whatever the cause, the safe response is the same: drop the lock
 * and reap the child, then complete the request.
 */
static void lock_test_ping_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct lock_test_state *state = tevent_req_data(
		req, struct lock_test_state);
	int wstatus;
	bool status;
	int err;

	status = tmon_ping_recv(subreq, &err);
	TALLOC_FREE(subreq);
	if (! status) {
		switch (err) {
		case EPIPE:
			/* Child exit, child already printed message */
			break;
		case ETIMEDOUT:
			fprintf(stderr,
				"%s: ping timeout from lock test child\n",
				progname);
			break;
		default:
			fprintf(stderr,
				"%s: tmon_ping_recv() failed (%d)\n",
				progname,
				err);
		}
		/* Ignore error */
	}

	/*
	 * Lock checking child is gone or not sending pings. Release
	 * the lock, close this end of pipe, send SIGKILL to the child
	 * process and wait for the child to exit.
	 */
	close(*state->lock_fdp);
	*state->lock_fdp = -1;
	close(state->pipe_fd);
	kill(state->child_pid, SIGKILL);
	waitpid(state->child_pid, &wstatus, 0);

	tevent_req_done(req);
}
+
/* Report completion status; false with *perr set on tevent error */
static bool lock_test_recv(struct tevent_req *req, int *perr)
{
	return !tevent_req_is_unix_error(req, perr);
}
+
+/*
+ * Wait for a reason to exit, indicating that parent has exited or I/O
+ * on lock failed
+ */
+
+struct wait_for_exit_state {
+};
+
+static void wait_for_exit_parent_done(struct tevent_req *subreq);
+static void wait_for_exit_lock_test_done(struct tevent_req *subreq);
+
+static struct tevent_req *wait_for_exit_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ pid_t ppid,
+ const char *lock_file,
+ int *fdp,
+ unsigned long recheck_interval,
+ unsigned long ping_timeout)
+{
+ struct tevent_req *req, *subreq;
+ struct wait_for_exit_state *state;
+
+ req = tevent_req_create(mem_ctx, &state, struct wait_for_exit_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ subreq = wait_for_parent_send(state, ev, ppid);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, wait_for_exit_parent_done, req);
+
+ if (recheck_interval > 0) {
+ subreq = lock_test_send(state,
+ ev,
+ lock_file,
+ fdp,
+ recheck_interval,
+ ping_timeout);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq,
+ wait_for_exit_lock_test_done,
+ req);
+ }
+
+ return req;
+}
+
+static void wait_for_exit_parent_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ bool status;
+ int err;
+
+ status = wait_for_parent_recv(subreq, &err);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ /* Ignore error */
+ fprintf(stderr,
+ "%s: "
+ "wait_for_parent_recv() failed (%d)\n",
+ progname,
+ err);
+ }
+
+ tevent_req_done(req);
+}
+
+static void wait_for_exit_lock_test_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ bool status;
+ int err;
+
+ status = lock_test_recv(subreq, &err);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ fprintf(stderr,
+ "%s: "
+ "lock_test_recv() failed (%d)\n",
+ progname,
+ err);
+ /* Ignore error, fall through to done */
+ }
+
+ tevent_req_done(req);
+}
+
/* Report completion status; false with *perr set on tevent error */
static bool wait_for_exit_recv(struct tevent_req *req, int *perr)
{
	return !tevent_req_is_unix_error(req, perr);
}
+
/* Print command-line usage to stderr */
static void usage(void)
{
	fprintf(stderr,
		"Usage: %s <file> [recheck_interval [ping_timeout]]\n",
		progname);
}
+
/*
 * Helper protocol: take a non-blocking fcntl() lock on argv[1], write
 * a single status character ('0' success, '1' contention, '2'/'3'
 * errors) to stdout, then hold the lock until the parent exits or the
 * lock is detected as lost.
 *
 * argv[2] (optional) - recheck interval in seconds, default 5, 0 disables
 * argv[3] (optional) - ping timeout in seconds, default 0 (disabled)
 */
int main(int argc, char *argv[])
{
	struct tevent_context *ev;
	char result;
	int ppid;
	const char *file = NULL;
	unsigned long recheck_interval;
	unsigned long ping_timeout;
	int ret;
	int fd = -1;
	struct tevent_req *req;
	bool status;

	/* basename() may modify its argument, so work on a copy */
	strlcpy(progpath, argv[0], sizeof(progpath));
	progname = basename(progpath);

	if (argc < 2 || argc > 4) {
		usage();
		exit(1);
	}

	ev = tevent_context_init(NULL);
	if (ev == NULL) {
		fprintf(stderr, "locking: tevent_context_init() failed\n");
		exit(1);
	}

	/* Remember the parent (ctdbd) so its exit can be detected */
	ppid = getppid();

	file = argv[1];

	recheck_interval = 5;
	ping_timeout = 0;
	if (argc >= 3) {
		recheck_interval = smb_strtoul(argv[2],
					       NULL,
					       10,
					       &ret,
					       SMB_STR_STANDARD);
		if (ret != 0) {
			usage();
			exit(1);
		}
	}
	if (argc >= 4) {
		ping_timeout = smb_strtoul(argv[3],
					   NULL,
					   10,
					   &ret,
					   SMB_STR_STANDARD);
		if (ret != 0) {
			usage();
			exit(1);
		}
	}

	/* Report the lock attempt's result to the parent via stdout */
	result = fcntl_lock(file, &fd);
	sys_write(STDOUT_FILENO, &result, 1);

	if (result != '0') {
		return 0;
	}

	/* Lock taken - hold it until there is a reason to exit */
	req = wait_for_exit_send(ev,
				 ev,
				 ppid,
				 file,
				 &fd,
				 recheck_interval,
				 ping_timeout);
	if (req == NULL) {
		fprintf(stderr,
			"%s: wait_for_exit_send() failed\n",
			progname);
		exit(1);
	}

	tevent_req_poll(req, ev);

	status = wait_for_exit_recv(req, &ret);
	if (! status) {
		fprintf(stderr,
			"%s: wait_for_exit_recv() failed (%d)\n",
			progname,
			ret);
	}

	/* fd may already be -1 if the lock test released the lock */
	if (fd != -1) {
		close(fd);
	}

	return 0;
}
diff --git a/ctdb/server/ctdb_persistent.c b/ctdb/server/ctdb_persistent.c
new file mode 100644
index 0000000..2671744
--- /dev/null
+++ b/ctdb/server/ctdb_persistent.c
@@ -0,0 +1,397 @@
+/*
+ persistent store logic
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/reqid.h"
+#include "common/common.h"
+#include "common/logging.h"
+
/*
 * Tracks one in-flight persistent store / trans3_commit operation:
 * the originating control, the client, and per-node reply counters.
 */
struct ctdb_persistent_state {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db; /* used by trans3_commit */
	struct ctdb_client *client; /* used by trans3_commit */
	struct ctdb_req_control_old *c;	/* control to reply to when finished */
	const char *errormsg;		/* last error reported by a node */
	uint32_t num_pending;		/* replies still outstanding */
	int32_t status;			/* last non-zero node status */
	uint32_t num_failed, num_sent;	/* failure / dispatch counters */
};
+
+/*
+ 1) all nodes fail, and all nodes reply
+ 2) some nodes fail, all nodes reply
+ 3) some nodes timeout
+ 4) all nodes succeed
+ */
+
/*
 * Called when a node has acknowledged a ctdb_control_update_record
 * call.  Replies to the original client control once all pending
 * acknowledgements have arrived; a failure from any node forces a
 * recovery, which takes over completing the transaction.
 */
static void ctdb_persistent_callback(struct ctdb_context *ctdb,
				     int32_t status, TDB_DATA data,
				     const char *errormsg,
				     void *private_data)
{
	struct ctdb_persistent_state *state = talloc_get_type(private_data,
							      struct ctdb_persistent_state);

	/* During recovery the recovery path owns the reply - do nothing */
	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_INFO, ("ctdb_persistent_callback: ignoring reply "
				   "during recovery\n"));
		return;
	}

	if (status != 0) {
		DEBUG(DEBUG_ERR,("ctdb_persistent_callback failed with status %d (%s)\n",
				 status, errormsg?errormsg:"no error message given"));
		state->status = status;
		state->errormsg = errormsg;
		state->num_failed++;

		/*
		 * If a node failed to complete the update_record control,
		 * then either a recovery is already running or something
		 * bad is going on. So trigger a recovery and let the
		 * recovery finish the transaction, sending back the reply
		 * for the trans3_commit control to the client.
		 */
		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
		return;
	}

	state->num_pending--;

	if (state->num_pending != 0) {
		return;
	}

	/* All nodes have acknowledged - reply to the client and clean up */
	ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, state->errormsg);
	talloc_free(state);
}
+
/*
 * Called if persistent store times out.  Fails the original control
 * with status 1, unless a recovery is in progress - in that case the
 * recovery path is responsible for completing the transaction.
 */
static void ctdb_persistent_store_timeout(struct tevent_context *ev,
					  struct tevent_timer *te,
					  struct timeval t, void *private_data)
{
	struct ctdb_persistent_state *state = talloc_get_type(private_data, struct ctdb_persistent_state);

	if (state->ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_INFO, ("ctdb_persistent_store_timeout: ignoring "
				   "timeout during recovery\n"));
		return;
	}

	ctdb_request_control_reply(state->ctdb, state->c, NULL, 1,
				   "timeout in ctdb_persistent_state");

	/* Destructor clears client->db_id and ctdb_db->persistent_state */
	talloc_free(state);
}
+
/**
 * Finish pending trans3 commit controls, i.e. send
 * reply to the client. This is called by the end-recovery
 * control to fix the situation when a recovery interrupts
 * the usual progress of a transaction.
 */
void ctdb_persistent_finish_trans3_commits(struct ctdb_context *ctdb)
{
	struct ctdb_db_context *ctdb_db;

	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_INFO, ("ctdb_persistent_finish_trans3_commits: "
				   "skipping execution when recovery is "
				   "active\n"));
		return;
	}

	/* Walk every attached database looking for interrupted commits */
	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
		struct ctdb_persistent_state *state;

		if (ctdb_db->persistent_state == NULL) {
			continue;
		}

		state = ctdb_db->persistent_state;

		/* Status 2 tells the client the commit was ended by recovery */
		ctdb_request_control_reply(ctdb, state->c, NULL, 2,
					   "trans3 commit ended by recovery");

		/* The destructor sets ctdb_db->persistent_state to NULL. */
		talloc_free(state);
	}
}
+
+static int ctdb_persistent_state_destructor(struct ctdb_persistent_state *state)
+{
+ if (state->client != NULL) {
+ state->client->db_id = 0;
+ }
+
+ if (state->ctdb_db != NULL) {
+ state->ctdb_db->persistent_state = NULL;
+ }
+
+ return 0;
+}
+
/*
 * Store a set of persistent records.
 * This is used to roll out a transaction to all nodes.
 *
 * Sends CTDB_CONTROL_UPDATE_RECORD with the marshalled record buffer
 * to every active node; the reply to the client is deferred (via
 * *async_reply) until all nodes acknowledge or the control times out.
 * Returns -1 on any validation or send failure.
 */
int32_t ctdb_control_trans3_commit(struct ctdb_context *ctdb,
				   struct ctdb_req_control_old *c,
				   TDB_DATA recdata, bool *async_reply)
{
	struct ctdb_client *client;
	struct ctdb_persistent_state *state;
	unsigned int i;
	struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
	struct ctdb_db_context *ctdb_db;

	/* Recovery replays transactions itself - refuse new ones now */
	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_INFO,("rejecting ctdb_control_trans3_commit when recovery active\n"));
		return -1;
	}

	client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
	if (client == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " can not match persistent_store "
				 "to a client. Returning error\n"));
		return -1;
	}

	/* A non-zero db_id marks a client with a commit already running */
	if (client->db_id != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ERROR: trans3_commit: "
				 "client-db_id[0x%08x] != 0 "
				 "(client_id[0x%08x]): trans3_commit active?\n",
				 client->db_id, client->client_id));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, m->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_control_trans3_commit: "
				 "Unknown database db_id[0x%08x]\n", m->db_id));
		return -1;
	}

	/* Only one commit may be in flight per database */
	if (ctdb_db->persistent_state != NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Error: "
				  "ctdb_control_trans3_commit "
				  "called while a transaction commit is "
				  "active. db_id[0x%08x]\n", m->db_id));
		return -1;
	}

	ctdb_db->persistent_state = talloc_zero(ctdb_db,
						struct ctdb_persistent_state);
	CTDB_NO_MEMORY(ctdb, ctdb_db->persistent_state);

	client->db_id = m->db_id;

	state = ctdb_db->persistent_state;
	state->ctdb = ctdb;
	state->ctdb_db = ctdb_db;
	state->c = c;
	state->client = client;

	/* Destructor undoes the client/db bookkeeping above on free */
	talloc_set_destructor(state, ctdb_persistent_state_destructor);

	for (i = 0; i < ctdb->vnn_map->size; i++) {
		struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
		int ret;

		/* only send to active nodes */
		if (node->flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_daemon_send_control(ctdb, node->pnn, 0,
					       CTDB_CONTROL_UPDATE_RECORD,
					       c->client_id, 0, recdata,
					       ctdb_persistent_callback,
					       state);
		if (ret == -1) {
			DEBUG(DEBUG_ERR,("Unable to send "
					 "CTDB_CONTROL_UPDATE_RECORD "
					 "to pnn %u\n", node->pnn));
			talloc_free(state);
			return -1;
		}

		state->num_pending++;
		state->num_sent++;
	}

	/* No active nodes to wait for - nothing more to do */
	if (state->num_pending == 0) {
		talloc_free(state);
		return 0;
	}

	/* we need to wait for the replies */
	*async_reply = true;

	/* need to keep the control structure around */
	talloc_steal(state, c);

	/* but we won't wait forever */
	tevent_add_timer(ctdb->ev, state,
			 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
			 ctdb_persistent_store_timeout, state);

	return 0;
}
+
+
+/*
+ backwards compatibility:
+
+ start a persistent store operation. passing both the key, header and
+ data to the daemon. If the client disconnects before it has issued
+ a persistent_update call to the daemon we trigger a full recovery
+ to ensure the databases are brought back in sync.
+ for now we ignore the recdata that the client has passed to us.
+ */
+int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA recdata)
+{
+ struct ctdb_client *client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " can not match start_persistent_update to a client. Returning error\n"));
+ return -1;
+ }
+
+ client->num_persistent_updates++;
+
+ return 0;
+}
+
+/*
+ backwards compatibility:
+
+ called to tell ctdbd that it is no longer doing a persistent update
+*/
+int32_t ctdb_control_cancel_persistent_update(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA recdata)
+{
+ struct ctdb_client *client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " can not match cancel_persistent_update to a client. Returning error\n"));
+ return -1;
+ }
+
+ if (client->num_persistent_updates > 0) {
+ client->num_persistent_updates--;
+ }
+
+ return 0;
+}
+
/*
 * Fetch the sequence number stored under CTDB_DB_SEQNUM_KEY in the
 * given database.  A missing or wrongly-sized record yields *seqnum
 * of 0 with success.  Returns 0 on success, -1 / fetch error code on
 * failure.
 */
static int32_t ctdb_get_db_seqnum(struct ctdb_context *ctdb,
				  uint32_t db_id,
				  uint64_t *seqnum)
{
	int32_t ret;
	struct ctdb_db_context *ctdb_db;
	const char *keyname = CTDB_DB_SEQNUM_KEY;
	TDB_DATA key;
	TDB_DATA data;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_ltdb_header header;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (!ctdb_db) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
		ret = -1;
		goto done;
	}

	if (! ctdb_db_allow_access(ctdb_db)) {
		ret = -1;
		goto done;
	}

	/* Key includes the terminating NUL byte */
	key.dptr = (uint8_t *)discard_const(keyname);
	key.dsize = strlen(keyname) + 1;

	ret = (int32_t)ctdb_ltdb_fetch(ctdb_db, key, &header, mem_ctx, &data);
	if (ret != 0) {
		goto done;
	}

	/* Treat an absent/malformed record as sequence number 0 */
	if (data.dsize != sizeof(uint64_t)) {
		*seqnum = 0;
		goto done;
	}

	*seqnum = *(uint64_t *)data.dptr;

done:
	/* Frees any record data fetched above as well */
	talloc_free(mem_ctx);
	return ret;
}
+
+/**
+ * Get the sequence number of a persistent database.
+ */
+int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb,
+ TDB_DATA indata,
+ TDB_DATA *outdata)
+{
+ uint32_t db_id;
+ int32_t ret;
+ uint64_t seqnum;
+
+ db_id = *(uint32_t *)indata.dptr;
+ ret = ctdb_get_db_seqnum(ctdb, db_id, &seqnum);
+ if (ret != 0) {
+ goto done;
+ }
+
+ outdata->dsize = sizeof(uint64_t);
+ outdata->dptr = talloc_memdup(outdata, &seqnum, sizeof(uint64_t));
+ if (outdata->dptr == NULL) {
+ ret = -1;
+ }
+
+done:
+ return ret;
+}
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
new file mode 100644
index 0000000..7b30d11
--- /dev/null
+++ b/ctdb/server/ctdb_recover.c
@@ -0,0 +1,1243 @@
+/*
+ ctdb recovery code
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "ctdb_cluster_mutex.h"
+
+int
+ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+ struct ctdb_vnn_map_wire *map;
+ size_t len;
+
+ CHECK_CONTROL_DATA_SIZE(0);
+
+ len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
+ map = talloc_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, map);
+
+ map->generation = ctdb->vnn_map->generation;
+ map->size = ctdb->vnn_map->size;
+ memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)map;
+
+ return 0;
+}
+
+int
+ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+ struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
+
+ if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+ DEBUG(DEBUG_ERR, ("Attempt to set vnnmap when not in recovery\n"));
+ return -1;
+ }
+
+ talloc_free(ctdb->vnn_map);
+
+ ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+ CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
+
+ ctdb->vnn_map->generation = map->generation;
+ ctdb->vnn_map->size = map->size;
+ ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
+ CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
+
+ memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
+
+ return 0;
+}
+
+int
+ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+ uint32_t i, len;
+ struct ctdb_db_context *ctdb_db;
+ struct ctdb_dbid_map_old *dbid_map;
+
+ CHECK_CONTROL_DATA_SIZE(0);
+
+ len = 0;
+ for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
+ len++;
+ }
+
+
+ outdata->dsize = offsetof(struct ctdb_dbid_map_old, dbs) + sizeof(dbid_map->dbs[0])*len;
+ outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+ if (!outdata->dptr) {
+ DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
+ exit(1);
+ }
+
+ dbid_map = (struct ctdb_dbid_map_old *)outdata->dptr;
+ dbid_map->num = len;
+ for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
+ dbid_map->dbs[i].db_id = ctdb_db->db_id;
+ dbid_map->dbs[i].flags = ctdb_db->db_flags;
+ }
+
+ return 0;
+}
+
/*
 * CTDB_CONTROL_GET_NODEMAP handler: convert the in-memory node list
 * to wire format, allocated on outdata.  Returns 0 on success, -1 on
 * allocation failure.
 */
int
ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
{
	CHECK_CONTROL_DATA_SIZE(0);

	outdata->dptr = (unsigned char *)ctdb_node_list_to_map(ctdb->nodes,
							       ctdb->num_nodes,
							       outdata);
	if (outdata->dptr == NULL) {
		return -1;
	}

	/* the wire size is simply the talloc size of the converted map */
	outdata->dsize = talloc_get_size(outdata->dptr);

	return 0;
}
+
/*
 * CTDB_CONTROL_RELOAD_NODES_FILE handler: re-read the nodes file.
 *
 * Nodes whose address is unchanged keep their existing node object
 * and transport connection; new or changed entries get fresh
 * transport connections.  The recovery daemon is asked to reload its
 * copy too.  Transport failures here are fatal.
 */
int
ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
{
	unsigned int i, num_nodes;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node **nodes;

	tmp_ctx = talloc_new(ctdb);

	/* steal the old nodes file for a while */
	talloc_steal(tmp_ctx, ctdb->nodes);
	nodes = ctdb->nodes;
	ctdb->nodes = NULL;
	num_nodes = ctdb->num_nodes;
	ctdb->num_nodes = 0;

	/* load the new nodes file */
	ctdb_load_nodes_file(ctdb);

	for (i=0; i<ctdb->num_nodes; i++) {
		/* keep any identical pre-existing nodes and connections */
		if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
			talloc_free(ctdb->nodes[i]);
			ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
			continue;
		}

		/* deleted slots need no transport setup */
		if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
			continue;
		}

		/* any new or different nodes must be added */
		if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
			DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
			ctdb_fatal(ctdb, "failed to add node. shutting down\n");
		}
		if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
			DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
			ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
		}
	}

	/* tell the recovery daemon to reload the nodes file too */
	ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);

	/* frees the old nodes array (minus the entries stolen back) */
	talloc_free(tmp_ctx);

	return 0;
}
+
/* State carried through the tdb traversal that streams a database to
 * the node that requested a DB_PULL. */
struct db_pull_state {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *recs; /* batch currently being filled */
	uint32_t pnn;        /* destination node */
	uint64_t srvid;      /* srvid the destination listens on */
	uint32_t num_records; /* total records sent so far */
};
+
/*
 * tdb_traverse_read() callback for DB_PULL: append each record to the
 * current marshall buffer; once the buffer reaches the tunable size
 * limit, ship it to the pulling node as a message and start a fresh
 * batch.  Returns -1 to abort the traversal on any failure.
 */
static int traverse_db_pull(struct tdb_context *tdb, TDB_DATA key,
			    TDB_DATA data, void *private_data)
{
	struct db_pull_state *state = (struct db_pull_state *)private_data;
	struct ctdb_marshall_buffer *recs;

	recs = ctdb_marshall_add(state->ctdb, state->recs,
				 state->ctdb_db->db_id, 0, key, NULL, data);
	if (recs == NULL) {
		TALLOC_FREE(state->recs);
		return -1;
	}
	state->recs = recs;

	if (talloc_get_size(state->recs) >=
			state->ctdb->tunable.rec_buffer_size_limit) {
		TDB_DATA buffer;
		int ret;

		buffer = ctdb_marshall_finish(state->recs);
		ret = ctdb_daemon_send_message(state->ctdb, state->pnn,
					       state->srvid, buffer);
		if (ret != 0) {
			TALLOC_FREE(state->recs);
			return -1;
		}

		/* count before dropping the batch */
		state->num_records += state->recs->count;
		TALLOC_FREE(state->recs);
	}

	return 0;
}
+
/*
 * CTDB_CONTROL_DB_PULL handler: stream all records of a (frozen)
 * database to the requesting node via messages on the srvid supplied
 * in the request.  outdata returns the number of records sent.
 *
 * Returns 0 on success, -1 on error.  Requires the db to be frozen;
 * the whole-db lock is marked for the duration of the traversal.
 */
int32_t ctdb_control_db_pull(struct ctdb_context *ctdb,
			     struct ctdb_req_control_old *c,
			     TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_pulldb_ext *pulldb_ext;
	struct ctdb_db_context *ctdb_db;
	struct db_pull_state state;
	int ret;

	pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n",
				 pulldb_ext->db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_pull_db when not frozen\n"));
		return -1;
	}

	if (ctdb_db->unhealthy_reason) {
		/* this is just a warning, as the tdb should be empty anyway */
		DEBUG(DEBUG_WARNING,
		      ("db(%s) unhealty in ctdb_control_db_pull: %s\n",
		       ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	state.ctdb = ctdb;
	state.ctdb_db = ctdb_db;
	state.recs = NULL;
	state.pnn = c->hdr.srcnode;
	state.srvid = pulldb_ext->srvid;
	state.num_records = 0;

	/* If the records are invalid, we are done */
	if (ctdb_db->invalid_records) {
		goto done;
	}

	if (ctdb_lockdb_mark(ctdb_db) != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get lock on entire db - failing\n"));
		return -1;
	}

	ret = tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_db_pull, &state);
	if (ret == -1) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get traverse db '%s'\n",
		       ctdb_db->db_name));
		ctdb_lockdb_unmark(ctdb_db);
		return -1;
	}

	/* Last few records: flush the final partial batch */
	if (state.recs != NULL) {
		TDB_DATA buffer;

		buffer = ctdb_marshall_finish(state.recs);
		ret = ctdb_daemon_send_message(state.ctdb, state.pnn,
					       state.srvid, buffer);
		if (ret != 0) {
			TALLOC_FREE(state.recs);
			ctdb_lockdb_unmark(ctdb_db);
			return -1;
		}

		state.num_records += state.recs->count;
		TALLOC_FREE(state.recs);
	}

	ctdb_lockdb_unmark(ctdb_db);

done:
	/* report the number of records streamed */
	outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
	if (outdata->dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		return -1;
	}

	memcpy(outdata->dptr, (uint8_t *)&state.num_records, sizeof(uint32_t));
	outdata->dsize = sizeof(uint32_t);

	return 0;
}
+
/* State for an in-progress DB_PUSH: records arrive as messages on
 * srvid and are stored into ctdb_db until push_confirm. */
struct db_push_state {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	uint64_t srvid;       /* srvid the record batches arrive on */
	uint32_t num_records; /* records stored so far */
	bool failed;          /* set once any batch fails; later batches ignored */
};
+
/*
 * Message handler for DB_PUSH record batches: unmarshall each record
 * (ltdb header + data), strip read-only delegation flags, and store
 * it locally.  On any error the whole push is marked failed and
 * subsequent batches are ignored.
 */
static void db_push_msg_handler(uint64_t srvid, TDB_DATA indata,
				void *private_data)
{
	struct db_push_state *state = talloc_get_type(
		private_data, struct db_push_state);
	struct ctdb_marshall_buffer *recs;
	struct ctdb_rec_data_old *rec;
	unsigned int i;
	int ret;

	if (state->failed) {
		return;
	}

	recs = (struct ctdb_marshall_buffer *)indata.dptr;
	rec = (struct ctdb_rec_data_old *)&recs->data[0];

	DEBUG(DEBUG_INFO, ("starting push of %u records for dbid 0x%x\n",
			   recs->count, recs->db_id));

	for (i=0; i<recs->count; i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;

		/* key then data are packed back-to-back in rec->data */
		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			goto failed;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		/* Strip off any read only record flags.
		 * All readonly records are revoked implicitly by a recovery.
		 */
		hdr->flags &= ~CTDB_REC_RO_FLAGS;

		data.dptr += sizeof(*hdr);
		data.dsize -= sizeof(*hdr);

		ret = ctdb_ltdb_store(state->ctdb_db, key, hdr, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Unable to store record\n"));
			goto failed;
		}

		/* advance to the next packed record */
		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
	}

	DEBUG(DEBUG_DEBUG, ("finished push of %u records for dbid 0x%x\n",
			    recs->count, recs->db_id));

	state->num_records += recs->count;
	return;

failed:
	state->failed = true;
}
+
/*
 * CTDB_CONTROL_DB_PUSH_START handler: prepare a (frozen) database to
 * receive record batches on the given srvid.  Registers the message
 * handler and marks the whole-db lock; any previously unfinished push
 * for this db is discarded first.  Returns 0 on success, -1 on error.
 */
int32_t ctdb_control_db_push_start(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_pulldb_ext *pulldb_ext;
	struct ctdb_db_context *ctdb_db;
	struct db_push_state *state;
	int ret;

	pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unknown db 0x%08x\n", pulldb_ext->db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_db_push_start when not frozen\n"));
		return -1;
	}

	if (ctdb_db->push_started) {
		DEBUG(DEBUG_WARNING,
		      (__location__ " DB push already started for %s\n",
		       ctdb_db->db_name));

		/* De-register old state */
		state = (struct db_push_state *)ctdb_db->push_state;
		if (state != NULL) {
			srvid_deregister(ctdb->srv, state->srvid, state);
			talloc_free(state);
			ctdb_db->push_state = NULL;
		}
	}

	state = talloc_zero(ctdb_db, struct db_push_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		return -1;
	}

	state->ctdb = ctdb;
	state->ctdb_db = ctdb_db;
	state->srvid = pulldb_ext->srvid;
	state->failed = false;

	ret = srvid_register(ctdb->srv, state, state->srvid,
			     db_push_msg_handler, state);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to register srvid for db push\n"));
		talloc_free(state);
		return -1;
	}

	/* hold the whole-db lock until push_confirm unmarks it */
	if (ctdb_lockdb_mark(ctdb_db) != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get lock on entire db - failing\n"));
		srvid_deregister(ctdb->srv, state->srvid, state);
		talloc_free(state);
		return -1;
	}

	ctdb_db->push_started = true;
	ctdb_db->push_state = state;

	return 0;
}
+
/*
 * CTDB_CONTROL_DB_PUSH_CONFIRM handler: finish a DB_PUSH started with
 * push_start.  Resets read-only delegation tracking, releases the
 * whole-db lock mark, deregisters the message handler and returns the
 * number of records stored in outdata.  Returns 0 on success, -1 on
 * error.
 */
int32_t ctdb_control_db_push_confirm(struct ctdb_context *ctdb,
				     TDB_DATA indata, TDB_DATA *outdata)
{
	uint32_t db_id;
	struct ctdb_db_context *ctdb_db;
	struct db_push_state *state;

	db_id = *(uint32_t *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_db_push_confirm when not frozen\n"));
		return -1;
	}

	if (!ctdb_db->push_started) {
		DEBUG(DEBUG_ERR, (__location__ " DB push not started\n"));
		return -1;
	}

	if (ctdb_db_readonly(ctdb_db)) {
		/* the pushed records invalidate all delegations */
		DEBUG(DEBUG_ERR,
		      ("Clearing the tracking database for dbid 0x%x\n",
		       ctdb_db->db_id));
		if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to wipe tracking database for 0x%x."
			       " Dropping read-only delegation support\n",
			       ctdb_db->db_id));
			tdb_close(ctdb_db->rottdb);
			ctdb_db->rottdb = NULL;
			ctdb_db_reset_readonly(ctdb_db);
		}

		/* freeing an entry unlinks it from the list */
		while (ctdb_db->revokechild_active != NULL) {
			talloc_free(ctdb_db->revokechild_active);
		}
	}

	ctdb_lockdb_unmark(ctdb_db);

	state = (struct db_push_state *)ctdb_db->push_state;
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Missing push db state\n"));
		return -1;
	}

	srvid_deregister(ctdb->srv, state->srvid, state);

	outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
	if (outdata->dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		talloc_free(state);
		ctdb_db->push_state = NULL;
		return -1;
	}

	memcpy(outdata->dptr, (uint8_t *)&state->num_records, sizeof(uint32_t));
	outdata->dsize = sizeof(uint32_t);

	talloc_free(state);
	ctdb_db->push_started = false;
	ctdb_db->push_state = NULL;

	return 0;
}
+
/* Async state for SET_RECMODE: c is the deferred control to reply to
 * once the recovery-lock test completes. */
struct set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control_old *c;
};
+
/*
 * Completion handler for the recovery-lock sanity test run when
 * leaving recovery.  status is a single character from the cluster
 * mutex helper: '0' = lock taken (bad: someone else should hold it),
 * '1' = contention (expected/OK), '2' = timeout (treated as OK).
 * Replies to the deferred SET_RECMODE control accordingly.
 */
static void set_recmode_handler(char status,
				double latency,
				void *private_data)
{
	struct set_recmode_state *state = talloc_get_type_abort(
		private_data, struct set_recmode_state);
	int s = 0;
	const char *err = NULL;

	switch (status) {
	case '0':
		/* Mutex taken */
		DEBUG(DEBUG_ERR,
		      ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
		       state->ctdb->recovery_lock));
		s = -1;
		err = "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem";
		break;

	case '1':
		/* Contention */
		DEBUG(DEBUG_DEBUG, (__location__ " Recovery lock check OK\n"));
		state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(state->ctdb);

		s = 0;

		CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock",
					    reclock.ctdbd, latency);
		break;

	case '2':
		/* Timeout.  Consider this a success, not a failure,
		 * as we failed to set the recovery lock which is what
		 * we wanted.  This can be caused by the cluster
		 * filesystem being very slow to arbitrate locks
		 * immediately after a node failure. */
		DEBUG(DEBUG_WARNING,
		      (__location__
		       "Time out getting recovery lock, allowing recmode set anyway\n"));
		state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(state->ctdb);

		s = 0;
		break;

	default:
		DEBUG(DEBUG_ERR,
		      ("Unexpected error when testing recovery lock\n"));
		s = -1;
		err = "Unexpected error when testing recovery lock";
	}

	ctdb_request_control_reply(state->ctdb, state->c, NULL, s, err);
	talloc_free(state);
}
+
+static void
+ctdb_drop_all_ips_event(struct tevent_context *ev, struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+ DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
+ talloc_free(ctdb->release_ips_ctx);
+ ctdb->release_ips_ctx = NULL;
+
+ ctdb_release_all_ips(ctdb);
+}
+
+/*
+ * Set up an event to drop all public ips if we remain in recovery for too
+ * long
+ */
+int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
+{
+ if (ctdb->release_ips_ctx != NULL) {
+ talloc_free(ctdb->release_ips_ctx);
+ }
+ ctdb->release_ips_ctx = talloc_new(ctdb);
+ CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
+
+ tevent_add_timer(ctdb->ev, ctdb->release_ips_ctx,
+ timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0),
+ ctdb_drop_all_ips_event, ctdb);
+ return 0;
+}
+
/*
 * CTDB_CONTROL_SET_RECMODE handler.
 *
 * Entering recovery is immediate (plus arming the deferred
 * drop-all-IPs timer).  Leaving recovery checks db generations, thaws
 * databases and, when a recovery lock is configured, defers the reply
 * (*async_reply) until set_recmode_handler has verified the lock
 * cannot be taken by this daemon.
 */
int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
				 struct ctdb_req_control_old *c,
				 TDB_DATA indata, bool *async_reply,
				 const char **errormsg)
{
	uint32_t recmode = *(uint32_t *)indata.dptr;
	struct ctdb_db_context *ctdb_db;
	struct set_recmode_state *state;
	struct ctdb_cluster_mutex_handle *h;

	if (recmode == ctdb->recovery_mode) {
		D_INFO("Recovery mode already set to %s\n",
		       recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");
		return 0;
	}

	D_NOTICE("Recovery mode set to %s\n",
		 recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");

	/* if we enter recovery but stay in recovery for too long
	   we will eventually drop all our ip addresses
	*/
	if (recmode == CTDB_RECOVERY_ACTIVE) {
		if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
			D_ERR("Failed to set up deferred drop all ips\n");
		}

		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
		return 0;
	}

	/* From this point: recmode == CTDB_RECOVERY_NORMAL
	 *
	 * Therefore, what follows is special handling when setting
	 * recovery mode back to normal */

	TALLOC_FREE(ctdb->release_ips_ctx);

	for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
		/* every db must have been rebuilt for this generation */
		if (ctdb_db->generation != ctdb->vnn_map->generation) {
			DEBUG(DEBUG_ERR,
			      ("Inconsistent DB generation %u for %s\n",
			       ctdb_db->generation, ctdb_db->db_name));
			DEBUG(DEBUG_ERR, ("Recovery mode set to ACTIVE\n"));
			return -1;
		}
	}

	/* force the databases to thaw */
	if (ctdb_db_all_frozen(ctdb)) {
		ctdb_control_thaw(ctdb, false);
	}

	if (ctdb->recovery_lock == NULL) {
		/* Not using recovery lock file */
		ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(ctdb);
		return 0;
	}

	state = talloc_zero(ctdb, struct set_recmode_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
		return -1;
	}
	state->ctdb = ctdb;
	state->c = NULL;

	/* 5-second attempt to take the lock; the handler interprets
	 * the result (contention is the expected outcome) */
	h = ctdb_cluster_mutex(state, ctdb, ctdb->recovery_lock, 5,
			       set_recmode_handler, state, NULL, NULL);
	if (h == NULL) {
		talloc_free(state);
		return -1;
	}

	/* steal the control only after setup cannot fail any more */
	state->c = talloc_steal(state, c);
	*async_reply = true;

	return 0;
}
+
+
/*
  delete a record as part of the vacuum process
  only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
  use non-blocking locks

  return 0 if the record was successfully deleted (i.e. it does not exist
  when the function returns)
  or !0 is the record still exists in the tdb after returning.
 */
static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data_old *rec)
{
	TDB_DATA key, data, data2;
	struct ctdb_ltdb_header *hdr, *hdr2;

	/* these are really internal tdb functions - but we need them here for
	   non-blocking lock of the freelist */
	int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
	int tdb_unlock(struct tdb_context *tdb, int list, int ltype);


	/* rec packs key then data (data here is just the ltdb header) */
	key.dsize = rec->keylen;
	key.dptr = &rec->data[0];
	data.dsize = rec->datalen;
	data.dptr = &rec->data[rec->keylen];

	if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
		DBG_INFO("Called delete on record where we are lmaster\n");
		return -1;
	}

	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		DBG_ERR("Bad record size\n");
		return -1;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;

	/* use a non-blocking lock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
		DBG_INFO("Failed to get non-blocking chain lock\n");
		return -1;
	}

	data2 = tdb_fetch(ctdb_db->ltdb->tdb, key);
	if (data2.dptr == NULL) {
		/* record already gone: treat as deleted */
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		return 0;
	}

	if (data2.dsize < sizeof(struct ctdb_ltdb_header)) {
		/* corrupt (headerless) record: delete it outright */
		if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
			if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
				DBG_ERR("Failed to delete corrupt record\n");
			}
			tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
			DBG_ERR("Deleted corrupt record\n");
		}
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		free(data2.dptr);
		return 0;
	}

	hdr2 = (struct ctdb_ltdb_header *)data2.dptr;

	/* local copy is newer than the rsn we were asked to delete up to */
	if (hdr2->rsn > hdr->rsn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with rsn=%llu - called with rsn=%llu\n",
			 (unsigned long long)hdr2->rsn,
			 (unsigned long long)hdr->rsn);
		free(data2.dptr);
		return -1;
	}

	/* do not allow deleting record that have readonly flags set. */
	if (hdr->flags & CTDB_REC_RO_FLAGS) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with readonly flags set\n");
		free(data2.dptr);
		return -1;
	}
	if (hdr2->flags & CTDB_REC_RO_FLAGS) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with readonly flags set locally\n");
		free(data2.dptr);
		return -1;
	}

	if (hdr2->dmaster == ctdb->pnn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Attempted delete record where we are the dmaster\n");
		free(data2.dptr);
		return -1;
	}

	/* freelist lock must also be non-blocking to avoid deadlock */
	if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Failed to get non-blocking freelist lock\n");
		free(data2.dptr);
		return -1;
	}

	if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
		tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Failed to delete record\n");
		free(data2.dptr);
		return -1;
	}

	tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
	tdb_chainunlock(ctdb_db->ltdb->tdb, key);
	free(data2.dptr);
	return 0;
}
+
+
+
/* Holds the deferred control to reply to once a recovery event
 * script has finished. */
struct recovery_callback_state {
	struct ctdb_req_control_old *c;
};
+
+
/*
  called when the 'recovered' event script has finished

  Replies to the deferred END_RECOVERY control, bans this node if the
  script timed out, and advances the runstate out of first recovery.
 */
static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
{
	struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);

	CTDB_INCREMENT_STAT(ctdb, num_recoveries);

	if (status != 0) {
		DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
		/* a timeout suggests this node is unhealthy: ban it */
		if (status == -ETIMEDOUT) {
			ctdb_ban_self(ctdb);
		}
	}

	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
	talloc_free(state);

	gettimeofday(&ctdb->last_recovery_finished, NULL);

	if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
		ctdb_set_runstate(ctdb, CTDB_RUNSTATE_STARTUP);
	}
}
+
/*
  recovery has finished

  CTDB_CONTROL_END_RECOVERY handler: flush pending trans3 commits and
  run the 'recovered' event script; the control is answered
  asynchronously from ctdb_end_recovery_callback.
 */
int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
				  struct ctdb_req_control_old *c,
				  bool *async_reply)
{
	int ret;
	struct recovery_callback_state *state;

	DEBUG(DEBUG_ERR,("Recovery has finished\n"));

	ctdb_persistent_finish_trans3_commits(ctdb);

	state = talloc(ctdb, struct recovery_callback_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->c = c;

	ret = ctdb_event_script_callback(ctdb, state,
					 ctdb_end_recovery_callback,
					 state,
					 CTDB_EVENT_RECOVERED, "%s", "");

	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
		talloc_free(state);
		return -1;
	}

	/* tell the control that we will be reply asynchronously */
	state->c = talloc_steal(state, c);
	*async_reply = true;
	return 0;
}
+
+/*
+ called when the 'startrecovery' event script has finished
+ */
+static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+ struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
+
+ if (status != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
+ }
+
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+}
+
+static void run_start_recovery_event(struct ctdb_context *ctdb,
+ struct recovery_callback_state *state)
+{
+ int ret;
+
+ ret = ctdb_event_script_callback(ctdb, state,
+ ctdb_start_recovery_callback,
+ state,
+ CTDB_EVENT_START_RECOVERY,
+ "%s", "");
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Unable to run startrecovery event\n"));
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ return;
+}
+
/*
 * Compare two recovery-lock configuration strings: equal when both
 * are unset, or both are set and textually identical.
 */
static bool reclock_strings_equal(const char *a, const char *b)
{
	if (a == NULL || b == NULL) {
		return a == b;
	}
	return strcmp(a, b) == 0;
}
+
+static void start_recovery_reclock_callback(struct ctdb_context *ctdb,
+ int32_t status,
+ TDB_DATA data,
+ const char *errormsg,
+ void *private_data)
+{
+ struct recovery_callback_state *state = talloc_get_type_abort(
+ private_data, struct recovery_callback_state);
+ const char *local = ctdb->recovery_lock;
+ const char *remote = NULL;
+
+ if (status != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
+ ctdb_request_control_reply(ctdb, state->c, NULL,
+ status, errormsg);
+ talloc_free(state);
+ return;
+ }
+
+ /* Check reclock consistency */
+ if (data.dsize > 0) {
+ /* Ensure NUL-termination */
+ data.dptr[data.dsize-1] = '\0';
+ remote = (const char *)data.dptr;
+ }
+ if (! reclock_strings_equal(local, remote)) {
+ /* Inconsistent */
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ DEBUG(DEBUG_ERR,
+ ("Recovery lock configuration inconsistent: "
+ "recmaster has %s, this node has %s, shutting down\n",
+ remote == NULL ? "NULL" : remote,
+ local == NULL ? "NULL" : local));
+ talloc_free(state);
+ ctdb_shutdown_sequence(ctdb, 1);
+ }
+ DEBUG(DEBUG_INFO,
+ ("Recovery lock consistency check successful\n"));
+
+ run_start_recovery_event(ctdb, state);
+}
+
/* Check recovery lock consistency and run eventscripts for the
 * "startrecovery" event.
 *
 * CTDB_CONTROL_START_RECOVERY handler; always replies asynchronously.
 * The reclock check is skipped when this node still considers the
 * recmaster disconnected. */
int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
				    struct ctdb_req_control_old *c,
				    bool *async_reply)
{
	int ret;
	struct recovery_callback_state *state;
	uint32_t recmaster = c->hdr.srcnode;

	DEBUG(DEBUG_ERR, ("Recovery has started\n"));
	gettimeofday(&ctdb->last_recovery_started, NULL);

	state = talloc(ctdb, struct recovery_callback_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->c = c;

	/* Although the recovery master sent this node a start
	 * recovery control, this node might still think the recovery
	 * master is disconnected.  In this case defer the recovery
	 * lock consistency check. */
	if (ctdb->nodes[recmaster]->flags & NODE_FLAGS_DISCONNECTED) {
		run_start_recovery_event(ctdb, state);
	} else {
		/* Ask the recovery master about its reclock setting */
		ret = ctdb_daemon_send_control(ctdb,
					       recmaster,
					       0,
					       CTDB_CONTROL_GET_RECLOCK_FILE,
					       0, 0,
					       tdb_null,
					       start_recovery_reclock_callback,
					       state);

		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
			talloc_free(state);
			return -1;
		}
	}

	/* tell the control that we will be reply asynchronously */
	state->c = talloc_steal(state, c);
	*async_reply = true;

	return 0;
}
+
/*
  try to delete all these records as part of the vacuuming process
  and return the records we failed to delete

  indata is a marshall buffer of candidate records; outdata is a
  marshall buffer containing only the records that could not be
  deleted (so the lmaster keeps them).
*/
int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
	struct ctdb_db_context *ctdb_db;
	unsigned int i;
	struct ctdb_rec_data_old *rec;
	struct ctdb_marshall_buffer *records;

	if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, reply->db_id);
	if (!ctdb_db) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
		return -1;
	}


	DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
			   reply->count, reply->db_id));


	/* create a blob to send back the records we couldn't delete */
	records = (struct ctdb_marshall_buffer *)
			talloc_zero_size(outdata,
				offsetof(struct ctdb_marshall_buffer, data));
	if (records == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
		return -1;
	}
	records->db_id = ctdb_db->db_id;


	rec = (struct ctdb_rec_data_old *)&reply->data[0];
	for (i=0;i<reply->count;i++) {
		TDB_DATA key, data;

		/* key then data are packed back-to-back in rec->data */
		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
			talloc_free(records);
			return -1;
		}

		/* If we can't delete the record we must add it to the reply
		   so the lmaster knows it may not purge this record
		*/
		if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
			size_t old_size;
			struct ctdb_ltdb_header *hdr;

			hdr = (struct ctdb_ltdb_header *)data.dptr;
			data.dptr += sizeof(*hdr);
			data.dsize -= sizeof(*hdr);

			DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));

			/* grow the reply blob and append the raw record */
			old_size = talloc_get_size(records);
			records = talloc_realloc_size(outdata, records, old_size + rec->length);
			if (records == NULL) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
				return -1;
			}
			records->count++;
			memcpy(old_size+(uint8_t *)records, rec, rec->length);
		}

		/* advance to the next packed record */
		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
	}


	*outdata = ctdb_marshall_finish(records);

	return 0;
}
+
+/*
+ report capabilities
+ */
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+ uint32_t *capabilities = NULL;
+
+ capabilities = talloc(outdata, uint32_t);
+ CTDB_NO_MEMORY(ctdb, capabilities);
+ *capabilities = ctdb->capabilities;
+
+ outdata->dsize = sizeof(uint32_t);
+ outdata->dptr = (uint8_t *)capabilities;
+
+ return 0;
+}
+
/* The recovery daemon will ping us at regular intervals.
   If we haven't been pinged for a while we assume the recovery
   daemon is inoperable and we restart.

   Timer callback: re-arms itself up to recd_ping_failcount times;
   once the final count is reached the recovery daemon is restarted.
*/
static void ctdb_recd_ping_timeout(struct tevent_context *ev,
				   struct tevent_timer *te,
				   struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
	uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);

	DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));

	if (*count < ctdb->tunable.recd_ping_failcount) {
		/* not yet at the limit: re-arm the timer */
		(*count)++;
		tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
				 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
				 ctdb_recd_ping_timeout, ctdb);
		return;
	}

	DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));

	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}
+
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
+{
+ talloc_free(ctdb->recd_ping_count);
+
+ ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
+ CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
+
+ if (ctdb->tunable.recd_ping_timeout != 0) {
+ tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
+ timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+ ctdb_recd_ping_timeout, ctdb);
+ }
+
+ return 0;
+}
+
/*
 * Take this node out of active service: invalidate db generations so
 * CALL/DMASTER traffic is ignored, force recovery mode ACTIVE,
 * schedule a freeze, and drop all public IP addresses.
 */
void ctdb_node_become_inactive(struct ctdb_context *ctdb)
{
	struct ctdb_db_context *ctdb_db;

	D_WARNING("Making node INACTIVE\n");

	/*
	 * Do not service database calls - reset generation to invalid
	 * so this node ignores any REQ/REPLY CALL/DMASTER
	 */
	ctdb->vnn_map->generation = INVALID_GENERATION;
	for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
		ctdb_db->generation = INVALID_GENERATION;
	}

	/*
	 * Although this bypasses the control, the only thing missing
	 * is the deferred drop of all public IPs, which isn't
	 * necessary because they are dropped below
	 */
	if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
		D_NOTICE("Recovery mode set to ACTIVE\n");
		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
	}

	/*
	 * Initiate database freeze - this will be scheduled for
	 * immediate execution and will be in progress long before the
	 * calling control returns
	 */
	ctdb_daemon_send_control(ctdb,
				 ctdb->pnn,
				 0,
				 CTDB_CONTROL_FREEZE,
				 0,
				 CTDB_CTRL_FLAG_NOREPLY,
				 tdb_null,
				 NULL,
				 NULL);

	D_NOTICE("Dropping all public IP addresses\n");
	ctdb_release_all_ips(ctdb);
}
+
+int32_t ctdb_control_stop_node(struct ctdb_context *ctdb)
+{
+ DEBUG(DEBUG_ERR, ("Stopping node\n"));
+ ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
+
+ ctdb_node_become_inactive(ctdb);
+
+ return 0;
+}
+
+int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
+{
+ DEBUG(DEBUG_ERR, ("Continue node\n"));
+ ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
+
+ return 0;
+}
+
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
new file mode 100644
index 0000000..84e2081
--- /dev/null
+++ b/ctdb/server/ctdb_recoverd.c
@@ -0,0 +1,3286 @@
+/*
+ ctdb recovery daemon
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <popt.h>
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_basic.h"
+
+#include "common/system_socket.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#include "ctdb_cluster_mutex.h"
+
+/* List of SRVID requests that need to be processed */
+struct srvid_list {
+ struct srvid_list *next, *prev;
+ struct ctdb_srvid_message *request;
+};
+
+struct srvid_requests {
+ struct srvid_list *requests;
+};
+
+/*
+ * Send a reply message to the node/srvid that issued a request and
+ * free the request.  A request with srvid == 0 expects no reply.
+ */
+static void srvid_request_reply(struct ctdb_context *ctdb,
+				struct ctdb_srvid_message *request,
+				TDB_DATA result)
+{
+	/* Someone that sent srvid==0 does not want a reply */
+	if (request->srvid == 0) {
+		talloc_free(request);
+		return;
+	}
+
+	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
+				     result) == 0) {
+		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
+				  (unsigned)request->pnn,
+				  (unsigned long long)request->srvid));
+	} else {
+		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
+				 (unsigned)request->pnn,
+				 (unsigned long long)request->srvid));
+	}
+
+	talloc_free(request);
+}
+
+/*
+ * Reply to every queued request with the same result, then free the
+ * whole queue and reset the caller's pointer to NULL.
+ */
+static void srvid_requests_reply(struct ctdb_context *ctdb,
+				 struct srvid_requests **requests,
+				 TDB_DATA result)
+{
+	struct srvid_list *r;
+
+	if (*requests == NULL) {
+		return;
+	}
+
+	for (r = (*requests)->requests; r != NULL; r = r->next) {
+		srvid_request_reply(ctdb, r->request, result);
+	}
+
+	/* Free the list structure... */
+	TALLOC_FREE(*requests);
+}
+
+/*
+ * Queue a request for a later collective reply, allocating the queue
+ * on first use.  Ownership of @request is taken via talloc_steal().
+ * On allocation failure an -ENOMEM reply is sent immediately.
+ */
+static void srvid_request_add(struct ctdb_context *ctdb,
+			      struct srvid_requests **requests,
+			      struct ctdb_srvid_message *request)
+{
+	struct srvid_list *t;
+	int32_t ret;
+	TDB_DATA result;
+
+	if (*requests == NULL) {
+		*requests = talloc_zero(ctdb, struct srvid_requests);
+		if (*requests == NULL) {
+			goto nomem;
+		}
+	}
+
+	t = talloc_zero(*requests, struct srvid_list);
+	if (t == NULL) {
+		/* If *requests was just allocated above then free it */
+		if ((*requests)->requests == NULL) {
+			TALLOC_FREE(*requests);
+		}
+		goto nomem;
+	}
+
+	t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
+	DLIST_ADD((*requests)->requests, t);
+
+	return;
+
+nomem:
+	/* Failed to add the request to the list. Send a fail. */
+	DEBUG(DEBUG_ERR, (__location__
+			  " Out of memory, failed to queue SRVID request\n"));
+	ret = -ENOMEM;
+	result.dsize = sizeof(ret);
+	result.dptr = (uint8_t *)&ret;
+	srvid_request_reply(ctdb, request, result);
+}
+
+/* An abstraction to allow an operation (takeover runs, recoveries,
+ * ...) to be disabled for a given timeout */
+struct ctdb_op_state {
+ struct tevent_timer *timer;
+ bool in_progress;
+ const char *name;
+};
+
+/*
+ * Allocate a named operation state (used for takeover runs and
+ * recoveries).  Returns NULL on allocation failure.  @name must
+ * outlive the returned state (callers pass string literals).
+ */
+static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
+{
+	struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
+
+	if (state != NULL) {
+		state->in_progress = false;
+		state->name = name;
+	}
+
+	return state;
+}
+
+/* An operation is disabled iff its re-enable timer is pending */
+static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
+{
+	return state->timer != NULL;
+}
+
+/* Mark the operation as started; refused while it is disabled */
+static bool ctdb_op_begin(struct ctdb_op_state *state)
+{
+	if (ctdb_op_is_disabled(state)) {
+		DEBUG(DEBUG_NOTICE,
+		      ("Unable to begin - %s are disabled\n", state->name));
+		return false;
+	}
+
+	state->in_progress = true;
+	return true;
+}
+
+/*
+ * Mark the operation as finished.  NOTE(review): this returns the
+ * assigned value, i.e. always false; callers ignore the result.
+ */
+static bool ctdb_op_end(struct ctdb_op_state *state)
+{
+	return state->in_progress = false;
+}
+
+static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
+{
+	return state->in_progress;
+}
+
+/* Re-enable the operation by cancelling any pending disable timer */
+static void ctdb_op_enable(struct ctdb_op_state *state)
+{
+	TALLOC_FREE(state->timer);
+}
+
+static void ctdb_op_timeout_handler(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval yt, void *p)
+{
+ struct ctdb_op_state *state =
+ talloc_get_type(p, struct ctdb_op_state);
+
+ DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
+ ctdb_op_enable(state);
+}
+
+/*
+ * Disable the operation for @timeout seconds (0 re-enables it
+ * immediately).  Refused with -EAGAIN while the operation is running.
+ * Returns 0 on success, negative errno-style value on failure.
+ */
+static int ctdb_op_disable(struct ctdb_op_state *state,
+			   struct tevent_context *ev,
+			   uint32_t timeout)
+{
+	if (timeout == 0) {
+		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
+		ctdb_op_enable(state);
+		return 0;
+	}
+
+	if (state->in_progress) {
+		DEBUG(DEBUG_ERR,
+		      ("Unable to disable %s - in progress\n", state->name));
+		return -EAGAIN;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
+			    state->name, timeout));
+
+	/* Clear any old timers */
+	talloc_free(state->timer);
+
+	/* Arrange for the timeout to occur */
+	state->timer = tevent_add_timer(ev, state,
+					timeval_current_ofs(timeout, 0),
+					ctdb_op_timeout_handler, state);
+	if (state->timer == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+struct ctdb_banning_state {
+ uint32_t pnn;
+ uint32_t count;
+ struct timeval last_reported_time;
+};
+
+struct ctdb_cluster_lock_handle;
+
+/*
+ private state of recovery daemon
+ */
+struct ctdb_recoverd {
+ struct ctdb_context *ctdb;
+ uint32_t leader;
+ struct tevent_timer *leader_broadcast_te;
+ struct tevent_timer *leader_broadcast_timeout_te;
+ uint32_t pnn;
+ uint32_t last_culprit_node;
+ struct ctdb_banning_state *banning_state;
+ struct ctdb_node_map_old *nodemap;
+ struct timeval priority_time;
+ bool need_takeover_run;
+ bool need_recovery;
+ uint32_t node_flags;
+ struct tevent_timer *send_election_te;
+ bool election_in_progress;
+ struct tevent_timer *election_timeout;
+ struct srvid_requests *reallocate_requests;
+ struct ctdb_op_state *takeover_run;
+ struct ctdb_op_state *recovery;
+ struct ctdb_iface_list_old *ifaces;
+ uint32_t *force_rebalance_nodes;
+ struct ctdb_node_capabilities *caps;
+ bool frozen_on_inactive;
+ struct ctdb_cluster_lock_handle *cluster_lock_handle;
+ pid_t helper_pid;
+};
+
+#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
+#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
+
+static void ctdb_restart_recd(struct tevent_context *ev,
+ struct tevent_timer *te, struct timeval t,
+ void *private_data);
+
+/* True if this node currently believes itself to be the leader */
+static bool this_node_is_leader(struct ctdb_recoverd *rec)
+{
+	return rec->leader == rec->pnn;
+}
+
+/* A node can lead only if active and advertising the RECMASTER capability */
+static bool this_node_can_be_leader(struct ctdb_recoverd *rec)
+{
+	return (rec->node_flags & NODE_FLAGS_INACTIVE) == 0 &&
+		(rec->ctdb->capabilities & CTDB_CAP_RECMASTER) != 0;
+}
+
+/*
+ * Look up @pnn in the cached nodemap.  Returns true if found,
+ * optionally copying the node's flags into *@flags (which may be
+ * NULL when the caller only wants an existence check).
+ */
+static bool node_flags(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t *flags)
+{
+	size_t i;
+
+	for (i = 0; i < rec->nodemap->num; i++) {
+		struct ctdb_node_and_flags *node = &rec->nodemap->nodes[i];
+		if (node->pnn == pnn) {
+			if (flags != NULL) {
+				*flags = node->flags;
+			}
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Ban a node for the tunable recovery ban period by sending a
+ * SET_BAN control to the node itself.  Failures are logged and
+ * otherwise ignored - banning is best-effort.
+ */
+static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn)
+{
+	int ret;
+	struct ctdb_context *ctdb = rec->ctdb;
+	uint32_t ban_time = ctdb->tunable.recovery_ban_period;
+	struct ctdb_ban_state bantime;
+
+	if (!ctdb_validate_pnn(ctdb, pnn)) {
+		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
+
+	bantime.pnn = pnn;
+	bantime.time = ban_time;
+
+	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
+		return;
+	}
+
+}
+
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
+
+
+/*
+ * remember the trouble maker
+ *
+ * Add @count banning credits to @culprit's entry in the dynamically
+ * grown rec->banning_state array, creating the entry on first use.
+ * Credits older than the recovery_grace_period tunable are forgiven
+ * before adding.  No-op if the culprit is unknown or if this node is
+ * itself inactive.
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec,
+				   uint32_t culprit,
+				   uint32_t count)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(
+		rec->ctdb, struct ctdb_context);
+	struct ctdb_banning_state *ban_state = NULL;
+	size_t len;
+	bool ok;
+
+	ok = node_flags(rec, culprit, NULL);
+	if (!ok) {
+		DBG_WARNING("Unknown culprit node %"PRIu32"\n", culprit);
+		return;
+	}
+
+	/* If we are banned or stopped, do not set other nodes as culprits */
+	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+		D_WARNING("This node is INACTIVE, cannot set culprit node %d\n",
+			  culprit);
+		return;
+	}
+
+	if (rec->banning_state == NULL) {
+		len = 0;
+	} else {
+		size_t i;
+
+		len = talloc_array_length(rec->banning_state);
+
+		for (i = 0 ; i < len; i++) {
+			if (rec->banning_state[i].pnn == culprit) {
+				ban_state= &rec->banning_state[i];
+				break;
+			}
+		}
+	}
+
+	/* Not found, so extend (or allocate new) array */
+	if (ban_state == NULL) {
+		struct ctdb_banning_state *t;
+
+		len += 1;
+		/*
+		 * talloc_realloc() handles the corner case where
+		 * rec->banning_state is NULL
+		 */
+		t = talloc_realloc(rec,
+				   rec->banning_state,
+				   struct ctdb_banning_state,
+				   len);
+		if (t == NULL) {
+			DBG_WARNING("Memory allocation error\n");
+			return;
+		}
+		rec->banning_state = t;
+
+		/* New element is always at the end - initialise it... */
+		ban_state = &rec->banning_state[len - 1];
+		*ban_state = (struct ctdb_banning_state) {
+			.pnn = culprit,
+			.count = 0,
+		};
+	} else if (ban_state->count > 0 &&
+		   timeval_elapsed(&ban_state->last_reported_time) >
+		   ctdb->tunable.recovery_grace_period) {
+		/*
+		 * Forgive old transgressions beyond the tunable time-limit
+		 */
+		ban_state->count = 0;
+	}
+
+	ban_state->count += count;
+	ban_state->last_reported_time = timeval_current();
+	rec->last_culprit_node = culprit;
+}
+
+/* Forget all accumulated banning credits (e.g. after a good recovery) */
+static void ban_counts_reset(struct ctdb_recoverd *rec)
+{
+	D_NOTICE("Resetting ban count to 0 for all nodes\n");
+	TALLOC_FREE(rec->banning_state);
+}
+
+/*
+ * remember the trouble maker
+ */
+static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+{
+	ctdb_set_culprit_count(rec, culprit, 1);
+}
+
+/*
+ * Retrieve capabilities from all connected nodes
+ *
+ * Caches the full result in rec->caps (replacing any previous copy)
+ * and refreshes ctdb->capabilities from this node's entry.
+ * Returns 0 on success, -1 on failure.
+ */
+static int update_capabilities(struct ctdb_recoverd *rec,
+			       struct ctdb_node_map_old *nodemap)
+{
+	uint32_t *capp;
+	TALLOC_CTX *tmp_ctx;
+	struct ctdb_node_capabilities *caps;
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	tmp_ctx = talloc_new(rec);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
+				     CONTROL_TIMEOUT(), nodemap);
+
+	if (caps == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to get node capabilities\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	capp = ctdb_get_node_capabilities(caps, rec->pnn);
+	if (capp == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " Capabilities don't include current node.\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	ctdb->capabilities = *capp;
+
+	TALLOC_FREE(rec->caps);
+	rec->caps = talloc_steal(rec, caps);
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+/*
+ * change recovery mode on all nodes
+ *
+ * Broadcasts SET_RECMODE to all active nodes (including this one).
+ * NOTE(review): the @rec parameter is currently unused here.
+ * Returns 0 on success, -1 if any node failed to set the mode.
+ */
+static int set_recovery_mode(struct ctdb_context *ctdb,
+			     struct ctdb_recoverd *rec,
+			     struct ctdb_node_map_old *nodemap,
+			     uint32_t rec_mode)
+{
+	TDB_DATA data;
+	uint32_t *nodes;
+	TALLOC_CTX *tmp_ctx;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+
+	data.dsize = sizeof(uint32_t);
+	data.dptr = (unsigned char *)&rec_mode;
+
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
+				      nodes, 0,
+				      CONTROL_TIMEOUT(),
+				      false, data,
+				      NULL, NULL,
+				      NULL) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+/*
+ * Update flags on all connected nodes
+ *
+ * Broadcasts a MODIFY_FLAGS control carrying @pnn's old flags (from
+ * the cached nodemap) and the @flags to apply.  Returns 0 on
+ * success, -1 if @pnn is unknown or any node rejected the update.
+ */
+static int update_flags_on_all_nodes(struct ctdb_recoverd *rec,
+				     uint32_t pnn,
+				     uint32_t flags)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct timeval timeout = CONTROL_TIMEOUT();
+	TDB_DATA data;
+	struct ctdb_node_map_old *nodemap=NULL;
+	struct ctdb_node_flag_change c;
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+	uint32_t *nodes;
+	uint32_t i;
+	int ret;
+
+	nodemap = rec->nodemap;
+
+	/* Find the nodemap entry for @pnn to record its old flags */
+	for (i = 0; i < nodemap->num; i++) {
+		if (pnn == nodemap->nodes[i].pnn) {
+			break;
+		}
+	}
+	if (i >= nodemap->num) {
+		DBG_ERR("Nodemap does not contain node %d\n", pnn);
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	c.pnn = pnn;
+	c.old_flags = nodemap->nodes[i].flags;
+	c.new_flags = flags;
+
+	data.dsize = sizeof(c);
+	data.dptr = (unsigned char *)&c;
+
+	/* send the flags update to all connected nodes */
+	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+	ret = ctdb_client_async_control(ctdb,
+					CTDB_CONTROL_MODIFY_FLAGS,
+					nodes,
+					0,
+					timeout,
+					false,
+					data,
+					NULL,
+					NULL,
+					NULL);
+	if (ret != 0) {
+		DBG_ERR("Unable to update flags on remote nodes\n");
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+static bool _cluster_lock_lock(struct ctdb_recoverd *rec);
+static bool cluster_lock_held(struct ctdb_recoverd *rec);
+
+/* The cluster lock is in use iff a recovery lock path is configured */
+static bool cluster_lock_enabled(struct ctdb_recoverd *rec)
+{
+	return rec->ctdb->recovery_lock != NULL;
+}
+
+/*
+ * Take the cluster (recovery) lock if configured.  Trivially
+ * succeeds when no lock is configured or it is already held.
+ * Returns true on success.
+ */
+static bool cluster_lock_take(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	bool have_lock;
+
+	if (!cluster_lock_enabled(rec)) {
+		return true;
+	}
+
+	if (cluster_lock_held(rec)) {
+		D_NOTICE("Already holding cluster lock\n");
+		return true;
+	}
+
+	D_NOTICE("Attempting to take cluster lock (%s)\n", ctdb->recovery_lock);
+	have_lock = _cluster_lock_lock(rec);
+	if (!have_lock) {
+		return false;
+	}
+
+	D_NOTICE("Cluster lock taken successfully\n");
+	return true;
+}
+
+/*
+ * called when ctdb_wait_timeout should finish
+ */
+static void ctdb_wait_handler(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval yt, void *p)
+{
+	uint32_t *timed_out = (uint32_t *)p;
+	(*timed_out) = 1;
+}
+
+/*
+ * wait for a given number of seconds
+ *
+ * Blocks by running a nested event loop until the timer fires, so
+ * other pending events continue to be serviced while waiting.
+ */
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
+{
+	uint32_t timed_out = 0;
+	/* Split @secs into whole seconds and remaining microseconds */
+	uint32_t usecs = (secs - (uint32_t)secs) * 1000000;
+	tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
+			 ctdb_wait_handler, &timed_out);
+	while (!timed_out) {
+		tevent_loop_once(ctdb->ev);
+	}
+}
+
+/*
+ * Broadcast cluster leader
+ *
+ * Sends @pnn as the leader to all connected nodes via the
+ * CTDB_SRVID_LEADER message.  Returns the send result (0 on success).
+ */
+static int leader_broadcast_send(struct ctdb_recoverd *rec, uint32_t pnn)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	TDB_DATA data;
+	int ret;
+
+	data.dptr = (uint8_t *)&pnn;
+	data.dsize = sizeof(pnn);
+
+	ret = ctdb_client_send_message(ctdb,
+				       CTDB_BROADCAST_CONNECTED,
+				       CTDB_SRVID_LEADER,
+				       data);
+	return ret;
+}
+
+static int leader_broadcast_loop(struct ctdb_recoverd *rec);
+static void cluster_lock_release(struct ctdb_recoverd *rec);
+
+/* This runs continuously but only sends the broadcast when leader.
+ * If this node loses the ability to lead it abdicates and releases
+ * any held cluster lock.  The timer is always re-armed at the end. */
+static void leader_broadcast_loop_handler(struct tevent_context *ev,
+					  struct tevent_timer *te,
+					  struct timeval current_time,
+					  void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+	int ret;
+
+	if (!this_node_can_be_leader(rec)) {
+		if (this_node_is_leader(rec)) {
+			rec->leader = CTDB_UNKNOWN_PNN;
+		}
+		if (cluster_lock_enabled(rec) && cluster_lock_held(rec)) {
+			cluster_lock_release(rec);
+		}
+		goto done;
+	}
+
+	if (!this_node_is_leader(rec)) {
+		goto done;
+	}
+
+	/* Suppress broadcasts while an election is being fought */
+	if (rec->election_in_progress) {
+		goto done;
+	}
+
+	ret = leader_broadcast_send(rec, rec->leader);
+	if (ret != 0) {
+		DBG_WARNING("Failed to send leader broadcast\n");
+	}
+
+done:
+	ret = leader_broadcast_loop(rec);
+	if (ret != 0) {
+		D_WARNING("Failed to set up leader broadcast\n");
+	}
+}
+
+/*
+ * (Re)arm the 1-second leader broadcast timer.  Returns 0 on
+ * success, ENOMEM if the timer could not be created.
+ */
+static int leader_broadcast_loop(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	TALLOC_FREE(rec->leader_broadcast_te);
+	rec->leader_broadcast_te =
+		tevent_add_timer(ctdb->ev,
+				 rec,
+				 timeval_current_ofs(1, 0),
+				 leader_broadcast_loop_handler,
+				 rec);
+	if (rec->leader_broadcast_te == NULL) {
+		return ENOMEM;
+	}
+
+	return 0;
+}
+
+/* True while the broadcast timer is armed */
+static bool leader_broadcast_loop_active(struct ctdb_recoverd *rec)
+{
+	return rec->leader_broadcast_te != NULL;
+}
+
+/*
+ * called when an election times out (ends)
+ *
+ * If this node won, the cluster lock is taken; failure to take it
+ * results in this node banning itself.
+ */
+static void ctdb_election_timeout(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *p)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+	bool ok;
+
+	rec->election_in_progress = false;
+	rec->election_timeout = NULL;
+	fast_start = false;
+
+	D_WARNING("Election period ended, leader=%u\n", rec->leader);
+
+	if (!this_node_is_leader(rec)) {
+		return;
+	}
+
+	ok = cluster_lock_take(rec);
+	if (!ok) {
+		D_ERR("Unable to get cluster lock, banning node\n");
+		ctdb_ban_node(rec, rec->pnn);
+	}
+}
+
+
+/*
+ * wait for an election to finish. It finished election_timeout seconds after
+ * the last election packet is received
+ *
+ * Blocks in a nested event loop until election_in_progress clears.
+ */
+static void ctdb_wait_election(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	while (rec->election_in_progress) {
+		tevent_loop_once(ctdb->ev);
+	}
+}
+
+/*
+ * Update local flags from all remote connected nodes and push out
+ * flags changes to all nodes. This is only run by the leader.
+ *
+ * For each connected node the local flags are compared against every
+ * remote nodemap; any mismatch updates the local view and pushes the
+ * agreed flags cluster-wide.  Returns 0 on success, -1 on push failure.
+ * NOTE(review): mem_ctx is allocated and freed but never otherwise
+ * used in this function.
+ */
+static int update_flags(struct ctdb_recoverd *rec,
+			struct ctdb_node_map_old *nodemap,
+			struct ctdb_node_map_old **remote_nodemaps)
+{
+	unsigned int j;
+	struct ctdb_context *ctdb = rec->ctdb;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+
+	/* Check flags from remote nodes */
+	for (j=0; j<nodemap->num; j++) {
+		struct ctdb_node_map_old *remote_nodemap=NULL;
+		uint32_t local_flags = nodemap->nodes[j].flags;
+		uint32_t remote_pnn = nodemap->nodes[j].pnn;
+		uint32_t remote_flags;
+		unsigned int i;
+		int ret;
+
+		if (local_flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+		if (remote_pnn == rec->pnn) {
+			/*
+			 * No remote nodemap for this node since this
+			 * is the local nodemap.  However, still need
+			 * to check this against the remote nodes and
+			 * push it if they are out-of-date.
+			 */
+			goto compare_remotes;
+		}
+
+		remote_nodemap = remote_nodemaps[j];
+		remote_flags = remote_nodemap->nodes[j].flags;
+
+		if (local_flags != remote_flags) {
+			/*
+			 * Update the local copy of the flags in the
+			 * recovery daemon.
+			 */
+			D_NOTICE("Remote node %u had flags 0x%x, "
+				 "local had 0x%x - updating local\n",
+				 remote_pnn,
+				 remote_flags,
+				 local_flags);
+			nodemap->nodes[j].flags = remote_flags;
+			local_flags = remote_flags;
+			goto push;
+		}
+
+compare_remotes:
+		/* Check the other remote views of node j for disagreement */
+		for (i = 0; i < nodemap->num; i++) {
+			if (i == j) {
+				continue;
+			}
+			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+				continue;
+			}
+			if (nodemap->nodes[i].pnn == rec->pnn) {
+				continue;
+			}
+
+			remote_nodemap = remote_nodemaps[i];
+			remote_flags = remote_nodemap->nodes[j].flags;
+
+			if (local_flags != remote_flags) {
+				goto push;
+			}
+		}
+
+		continue;
+
+push:
+		D_NOTICE("Pushing updated flags for node %u (0x%x)\n",
+			 remote_pnn,
+			 local_flags);
+		ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags);
+		if (ret != 0) {
+			DBG_ERR("Unable to update flags on remote nodes\n");
+			talloc_free(mem_ctx);
+			return -1;
+		}
+	}
+	talloc_free(mem_ctx);
+	return 0;
+}
+
+
+/* Create a new random generation id.
+   The generation id can not be the INVALID_GENERATION id
+*/
+static uint32_t new_generation(void)
+{
+	uint32_t generation;
+
+	/* Retry until random() yields something other than the
+	 * reserved INVALID_GENERATION value */
+	while (1) {
+		generation = random();
+
+		if (generation != INVALID_GENERATION) {
+			break;
+		}
+	}
+
+	return generation;
+}
+
+/* The cluster lock is held iff a lock handle exists */
+static bool cluster_lock_held(struct ctdb_recoverd *rec)
+{
+	return (rec->cluster_lock_handle != NULL);
+}
+
+struct ctdb_cluster_lock_handle {
+ bool done;
+ bool locked;
+ double latency;
+ struct ctdb_cluster_mutex_handle *h;
+ struct ctdb_recoverd *rec;
+};
+
+/*
+ * Completion callback for the cluster mutex helper.  @status is a
+ * single character: '0' success, '1' contention, '2' timeout,
+ * anything else is an unknown error.  Sets s->done so the caller's
+ * nested event loop can finish.
+ */
+static void take_cluster_lock_handler(char status,
+				      double latency,
+				      void *private_data)
+{
+	struct ctdb_cluster_lock_handle *s =
+		(struct ctdb_cluster_lock_handle *) private_data;
+
+	s->locked = (status == '0') ;
+
+	/*
+	 * If unsuccessful then ensure the process has exited and that
+	 * the file descriptor event handler has been cancelled
+	 */
+	if (! s->locked) {
+		TALLOC_FREE(s->h);
+	}
+
+	switch (status) {
+	case '0':
+		s->latency = latency;
+		break;
+
+	case '1':
+		D_ERR("Unable to take cluster lock - contention\n");
+		break;
+
+	case '2':
+		D_ERR("Unable to take cluster lock - timeout\n");
+		break;
+
+	default:
+		D_ERR("Unable to take cluster lock - unknown error\n");
+	}
+
+	s->done = true;
+}
+
+static void force_election(struct ctdb_recoverd *rec);
+
+/*
+ * Called when the cluster lock helper exits unexpectedly, i.e. the
+ * lock was lost.  Drops the handle and, if still eligible, forces a
+ * new leader election.
+ */
+static void lost_cluster_lock_handler(void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+
+	D_ERR("Cluster lock helper terminated\n");
+	TALLOC_FREE(rec->cluster_lock_handle);
+
+	if (this_node_can_be_leader(rec)) {
+		force_election(rec);
+	}
+}
+
+/*
+ * Attempt to take the cluster lock via the cluster mutex helper,
+ * blocking in a nested event loop until the attempt completes.
+ * On success the latency is reported to the main daemon.
+ * Returns true iff the lock was taken.
+ */
+static bool _cluster_lock_lock(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_cluster_mutex_handle *h;
+	struct ctdb_cluster_lock_handle *s;
+
+	s = talloc_zero(rec, struct ctdb_cluster_lock_handle);
+	if (s == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	};
+
+	s->rec = rec;
+
+	/* 120s helper timeout */
+	h = ctdb_cluster_mutex(s,
+			       ctdb,
+			       ctdb->recovery_lock,
+			       120,
+			       take_cluster_lock_handler,
+			       s,
+			       lost_cluster_lock_handler,
+			       rec);
+	if (h == NULL) {
+		talloc_free(s);
+		return false;
+	}
+
+	rec->cluster_lock_handle = s;
+	s->h = h;
+
+	while (! s->done) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (! s->locked) {
+		TALLOC_FREE(rec->cluster_lock_handle);
+		return false;
+	}
+
+	ctdb_ctrl_report_recd_lock_latency(ctdb,
+					   CONTROL_TIMEOUT(),
+					   s->latency);
+
+	return true;
+}
+
+/*
+ * Release the cluster lock.  If an acquisition is still in flight it
+ * is cancelled instead, leaving the handle so the pending attempt
+ * reports failure.
+ */
+static void cluster_lock_release(struct ctdb_recoverd *rec)
+{
+	if (rec->cluster_lock_handle == NULL) {
+		return;
+	}
+
+	if (! rec->cluster_lock_handle->done) {
+		/*
+		 * Taking of cluster lock still in progress.  Free
+		 * the cluster mutex handle to release it but leave
+		 * the cluster lock handle in place to allow taking
+		 * of the lock to fail.
+		 */
+		D_NOTICE("Cancelling cluster lock\n");
+		TALLOC_FREE(rec->cluster_lock_handle->h);
+		rec->cluster_lock_handle->done = true;
+		rec->cluster_lock_handle->locked = false;
+		return;
+	}
+
+	D_NOTICE("Releasing cluster lock\n");
+	TALLOC_FREE(rec->cluster_lock_handle);
+}
+
+/*
+ * Ban any node whose accumulated banning credits reach twice the
+ * number of nodes, resetting its count afterwards.  *@self_ban is
+ * set true if this node banned itself.
+ */
+static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
+{
+	size_t len = talloc_array_length(rec->banning_state);
+	size_t i;
+
+
+	*self_ban = false;
+	for (i = 0; i < len; i++) {
+		struct ctdb_banning_state *ban_state = &rec->banning_state[i];
+
+		if (ban_state->count < 2 * rec->nodemap->num) {
+			continue;
+		}
+
+		D_NOTICE("Node %u reached %u banning credits\n",
+			 ban_state->pnn,
+			 ban_state->count);
+		ctdb_ban_node(rec, ban_state->pnn);
+		ban_state->count = 0;
+
+		/* Banning ourself? */
+		if (ban_state->pnn == rec->pnn) {
+			*self_ban = true;
+		}
+	}
+}
+
+struct helper_state {
+ int fd[2];
+ pid_t pid;
+ int result;
+ bool done;
+};
+
+/*
+ * Read the helper's integer exit result from the pipe.  A short read
+ * is mapped to EPIPE.  Sets state->done to end helper_run()'s loop.
+ */
+static void helper_handler(struct tevent_context *ev,
+			   struct tevent_fd *fde,
+			   uint16_t flags, void *private_data)
+{
+	struct helper_state *state = talloc_get_type_abort(
+		private_data, struct helper_state);
+	int ret;
+
+	ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
+	if (ret != sizeof(state->result)) {
+		state->result = EPIPE;
+	}
+
+	state->done = true;
+}
+
+/*
+ * Fork @prog (with optional single argument @arg) and block in a
+ * nested event loop until it reports a result over a pipe.  The run
+ * is aborted if this node stops being leader.  @type is only used in
+ * log messages.  Returns 0 on success, -1 on any failure.
+ */
+static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
+		      const char *prog, const char *arg, const char *type)
+{
+	struct helper_state *state;
+	struct tevent_fd *fde;
+	const char **args;
+	int nargs, ret;
+
+	state = talloc_zero(mem_ctx, struct helper_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		return -1;
+	}
+
+	state->pid = -1;
+
+	ret = pipe(state->fd);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to create pipe for %s helper\n", type));
+		goto fail;
+	}
+
+	set_close_on_exec(state->fd[0]);
+
+	/* argv: write-fd, daemon socket, optional arg, NULL */
+	nargs = 4;
+	args = talloc_array(state, const char *, nargs);
+	if (args == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		goto fail;
+	}
+
+	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
+	if (args[0] == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		goto fail;
+	}
+	args[1] = rec->ctdb->daemon.name;
+	args[2] = arg;
+	args[3] = NULL;
+
+	if (args[2] == NULL) {
+		nargs = 3;
+	}
+
+	state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
+	if (state->pid == -1) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to create child for %s helper\n", type));
+		goto fail;
+	}
+
+	/* Parent keeps only the read end */
+	close(state->fd[1]);
+	state->fd[1] = -1;
+
+	rec->helper_pid = state->pid;
+	state->done = false;
+
+	fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
+			    TEVENT_FD_READ, helper_handler, state);
+	if (fde == NULL) {
+		goto fail;
+	}
+	tevent_fd_set_auto_close(fde);
+
+	while (!state->done) {
+		tevent_loop_once(rec->ctdb->ev);
+
+		/* Abort if leadership moved away mid-run */
+		if (!this_node_is_leader(rec)) {
+			D_ERR("Leader changed to %u, aborting %s\n",
+			      rec->leader,
+			      type);
+			state->result = 1;
+			break;
+		}
+	}
+
+	close(state->fd[0]);
+	state->fd[0] = -1;
+
+	if (state->result != 0) {
+		goto fail;
+	}
+
+	rec->helper_pid = -1;
+	/* Reap the helper even on success in case it is still around */
+	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
+	talloc_free(state);
+	return 0;
+
+fail:
+	if (state->fd[0] != -1) {
+		close(state->fd[0]);
+	}
+	if (state->fd[1] != -1) {
+		close(state->fd[1]);
+	}
+	rec->helper_pid = -1;
+	if (state->pid != -1) {
+		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
+	}
+	talloc_free(state);
+	return -1;
+}
+
+
+/*
+ * Run the takeover helper, passing any forced-rebalance node list as
+ * a comma-separated argument.  Honours the failover_disabled config
+ * option via the CTDB_DISABLE_IP_FAILOVER environment variable.
+ * Returns the helper result (0 on success, -1 on failure).
+ */
+static int ctdb_takeover(struct ctdb_recoverd *rec,
+			 uint32_t *force_rebalance_nodes)
+{
+	static char prog[PATH_MAX+1] = "";
+	char *arg;
+	unsigned int i;
+	int ret;
+
+	if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
+			     "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
+			     "ctdb_takeover_helper")) {
+		ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
+	}
+
+	/* Build "pnn,pnn,..." from the forced-rebalance node list */
+	arg = NULL;
+	for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
+		uint32_t pnn = force_rebalance_nodes[i];
+		if (arg == NULL) {
+			arg = talloc_asprintf(rec, "%u", pnn);
+		} else {
+			arg = talloc_asprintf_append(arg, ",%u", pnn);
+		}
+		if (arg == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+			return -1;
+		}
+	}
+
+	if (ctdb_config.failover_disabled) {
+		ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
+		if (ret != 0) {
+			D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
+			return -1;
+		}
+	}
+
+	return helper_run(rec, rec, prog, arg, "takeover");
+}
+
+/*
+ * Perform a takeover run: temporarily disable takeover runs on all
+ * other connected nodes, run the takeover helper, then re-enable
+ * them.  On success the forced-rebalance target list is cleared.
+ * rec->need_takeover_run is left set iff the run failed.
+ * Returns true on success.
+ */
+static bool do_takeover_run(struct ctdb_recoverd *rec,
+			    struct ctdb_node_map_old *nodemap)
+{
+	uint32_t *nodes = NULL;
+	struct ctdb_disable_message dtr;
+	TDB_DATA data;
+	size_t i;
+	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
+	int ret;
+	bool ok;
+
+	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
+
+	if (ctdb_op_is_in_progress(rec->takeover_run)) {
+		DEBUG(DEBUG_ERR, (__location__
+				  " takeover run already in progress \n"));
+		ok = false;
+		goto done;
+	}
+
+	if (!ctdb_op_begin(rec->takeover_run)) {
+		ok = false;
+		goto done;
+	}
+
+	/* Disable IP checks (takeover runs, really) on other nodes
+	 * while doing this takeover run.  This will stop those other
+	 * nodes from triggering takeover runs when think they should
+	 * be hosting an IP but it isn't yet on an interface.  Don't
+	 * wait for replies since a failure here might cause some
+	 * noise in the logs but will not actually cause a problem.
+	 */
+	ZERO_STRUCT(dtr);
+	dtr.srvid = 0; /* No reply */
+	dtr.pnn = -1;
+
+	data.dptr  = (uint8_t*)&dtr;
+	data.dsize = sizeof(dtr);
+
+	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
+
+	/* Disable for 60 seconds.  This can be a tunable later if
+	 * necessary.
+	 */
+	dtr.timeout = 60;
+	for (i = 0; i < talloc_array_length(nodes); i++) {
+		if (ctdb_client_send_message(rec->ctdb, nodes[i],
+					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+					     data) != 0) {
+			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
+		}
+	}
+
+	ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
+
+	/* Re-enable takeover runs and IP checks on other nodes */
+	dtr.timeout = 0;
+	for (i = 0; i < talloc_array_length(nodes); i++) {
+		if (ctdb_client_send_message(rec->ctdb, nodes[i],
+					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+					     data) != 0) {
+			DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
+		}
+	}
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
+		ok = false;
+		goto done;
+	}
+
+	ok = true;
+	/* Takeover run was successful so clear force rebalance targets */
+	if (rebalance_nodes == rec->force_rebalance_nodes) {
+		TALLOC_FREE(rec->force_rebalance_nodes);
+	} else {
+		DEBUG(DEBUG_WARNING,
+		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
+	}
+done:
+	rec->need_takeover_run = !ok;
+	talloc_free(nodes);
+	ctdb_op_end(rec->takeover_run);
+
+	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
+	return ok;
+}
+
+/*
+ * Run the parallel database recovery helper with a freshly generated
+ * generation id.  Returns the helper result (0 on success).
+ */
+static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
+{
+	static char prog[PATH_MAX+1] = "";
+	const char *arg;
+
+	if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
+			     "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
+			     "ctdb_recovery_helper")) {
+		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
+	}
+
+	arg = talloc_asprintf(mem_ctx, "%u", new_generation());
+	if (arg == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		return -1;
+	}
+
+	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
+
+	return helper_run(rec, mem_ctx, prog, arg, "recovery");
+}
+
+/*
+ * Main recovery function, only run by leader
+ *
+ * Verifies leadership and the cluster lock, pushes consistent node
+ * flags, runs the parallel database recovery helper, performs a
+ * takeover run and broadcasts a reconfigure message.  On success the
+ * ban counts are reset and further recoveries are suppressed for the
+ * rerecovery timeout.  Returns 0 on success, -1 on failure.
+ */
+static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_node_map_old *nodemap = rec->nodemap;
+	unsigned int i;
+	int ret;
+	bool self_ban;
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
+
+	/* Check if the current node is still the leader.  It's possible that
+	 * re-election has changed the leader.
+	 */
+	if (!this_node_is_leader(rec)) {
+		D_NOTICE("Leader changed to %u, aborting recovery\n",
+			 rec->leader);
+		return -1;
+	}
+
+	/* if recovery fails, force it again */
+	rec->need_recovery = true;
+
+	if (!ctdb_op_begin(rec->recovery)) {
+		return -1;
+	}
+
+	if (rec->election_in_progress) {
+		/* an election is in progress */
+		DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
+		goto fail;
+	}
+
+	ban_misbehaving_nodes(rec, &self_ban);
+	if (self_ban) {
+		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
+		goto fail;
+	}
+
+	if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) {
+		/* Leader can change in ban_misbehaving_nodes() */
+		if (!this_node_is_leader(rec)) {
+			D_NOTICE("Leader changed to %u, aborting recovery\n",
+				 rec->leader);
+			rec->need_recovery = false;
+			goto fail;
+		}
+
+		D_ERR("Cluster lock not held - abort recovery, ban node\n");
+		ctdb_ban_node(rec, rec->pnn);
+		goto fail;
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
+
+	/* Retrieve capabilities from all connected nodes */
+	ret = update_capabilities(rec, nodemap);
+	if (ret!=0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+		/*
+		 * Go through fail so the recovery op is ended -
+		 * returning directly would leave it in progress and
+		 * block later ctdb_op_disable() calls
+		 */
+		goto fail;
+	}
+
+	/*
+	  update all nodes to have the same flags that we have
+	 */
+	for (i=0;i<nodemap->num;i++) {
+		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+
+		ret = update_flags_on_all_nodes(rec,
+						nodemap->nodes[i].pnn,
+						nodemap->nodes[i].flags);
+		if (ret != 0) {
+			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+				DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
+			} else {
+				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+				/* End the recovery op on this error path too */
+				goto fail;
+			}
+		}
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+	ret = db_recovery_parallel(rec, mem_ctx);
+	if (ret != 0) {
+		goto fail;
+	}
+
+	do_takeover_run(rec, nodemap);
+
+	/* send a message to all clients telling them that the cluster
+	   has been reconfigured */
+	ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+				       CTDB_SRVID_RECONFIGURE, tdb_null);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
+		goto fail;
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
+
+	rec->need_recovery = false;
+	ctdb_op_end(rec->recovery);
+
+	/*
+	 * Completed a full recovery so forgive any past transgressions
+	 */
+	ban_counts_reset(rec);
+
+	/* We just finished a recovery successfully.
+	   We now wait for rerecovery_timeout before we allow
+	   another recovery to take place.
+	 */
+	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
+	ctdb_op_disable(rec->recovery, ctdb->ev,
+			ctdb->tunable.rerecovery_timeout);
+	return 0;
+
+fail:
+	ctdb_op_end(rec->recovery);
+	return -1;
+}
+
+
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* nodes the sender sees as connected */
	struct timeval priority_time;	/* sender's start time - older wins */
	uint32_t pnn;			/* sender's node number (tie-break) */
	uint32_t node_flags;		/* sender's flags, e.g. NODE_FLAGS_INACTIVE */
};
+
+/*
+ form this nodes election data
+ */
+static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
+{
+ unsigned int i;
+ int ret;
+ struct ctdb_node_map_old *nodemap;
+ struct ctdb_context *ctdb = rec->ctdb;
+ bool ok;
+
+ ZERO_STRUCTP(em);
+
+ em->pnn = rec->pnn;
+ em->priority_time = rec->priority_time;
+
+ ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+ return;
+ }
+
+ ok = node_flags(rec, rec->pnn, &rec->node_flags);
+ if (!ok) {
+ DBG_ERR("Unable to get node flags for this node\n");
+ return;
+ }
+ em->node_flags = rec->node_flags;
+
+ for (i=0;i<nodemap->num;i++) {
+ if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
+ em->num_connected++;
+ }
+ }
+
+ if (!this_node_can_be_leader(rec)) {
+ /* Try to lose... */
+ em->num_connected = 0;
+ em->priority_time = timeval_current();
+ }
+
+ talloc_free(nodemap);
+}
+
+/*
+ see if the given election data wins
+ */
+static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
+{
+ struct election_message myem;
+ int cmp = 0;
+
+ ctdb_election_data(rec, &myem);
+
+ if (!this_node_can_be_leader(rec)) {
+ return false;
+ }
+
+ /* Automatically win if other node is banned or stopped */
+ if (em->node_flags & NODE_FLAGS_INACTIVE) {
+ return true;
+ }
+
+ /* then the longest running node */
+ if (cmp == 0) {
+ cmp = timeval_compare(&em->priority_time, &myem.priority_time);
+ }
+
+ if (cmp == 0) {
+ cmp = (int)myem.pnn - (int)em->pnn;
+ }
+
+ return cmp > 0;
+}
+
+/*
+ send out an election request
+ */
+static int send_election_request(struct ctdb_recoverd *rec)
+{
+ TDB_DATA election_data;
+ struct election_message emsg;
+ uint64_t srvid;
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ srvid = CTDB_SRVID_ELECTION;
+
+ ctdb_election_data(rec, &emsg);
+
+ election_data.dsize = sizeof(struct election_message);
+ election_data.dptr = (unsigned char *)&emsg;
+
+
+ /* Assume this node will win the election, set leader accordingly */
+ rec->leader = rec->pnn;
+
+ /* send an election message to all active nodes */
+ DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
+ return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+}
+
+/*
+ we think we are winning the election - send a broadcast election request
+ */
+static void election_send_request(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *p)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+ int ret;
+
+ ret = send_election_request(rec);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
+ }
+
+ TALLOC_FREE(rec->send_election_te);
+}
+
+/*
+ handler for memory dumps
+*/
+static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+ struct ctdb_context *ctdb = rec->ctdb;
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+ TDB_DATA *dump;
+ int ret;
+ struct ctdb_srvid_message *rd;
+
+ if (data.dsize != sizeof(struct ctdb_srvid_message)) {
+ DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+ talloc_free(tmp_ctx);
+ return;
+ }
+ rd = (struct ctdb_srvid_message *)data.dptr;
+
+ dump = talloc_zero(tmp_ctx, TDB_DATA);
+ if (dump == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
+ talloc_free(tmp_ctx);
+ return;
+ }
+ ret = ctdb_dump_memory(ctdb, dump);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ DBG_ERR("recovery daemon memory dump\n");
+
+ ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ talloc_free(tmp_ctx);
+}
+
+/*
+ handler for reload_nodes
+*/
+static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+
+ DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
+
+ ctdb_load_nodes_file(rec->ctdb);
+}
+
+
/*
 * Handler for node rebalance requests: record that IPs should be
 * rebalanced to the given node in the next takeover run.
 *
 * Leader-only; non-leader nodes ignore the message.  The payload is a
 * single uint32_t PNN.
 */
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
					void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn;
	uint32_t *t;
	int len;

	/* Only the leader acts on rebalance requests */
	if (!this_node_is_leader(rec)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
		return;
	}

	pnn = *(uint32_t *)&data.dptr[0];

	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));

	/* Copy any existing list of nodes. There's probably some
	 * sort of realloc variant that will do this but we need to
	 * make sure that freeing the old array also cancels the timer
	 * event for the timeout... not sure if realloc will do that.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0;

	/* This allows duplicates to be added but they don't cause
	 * harm.  A call to add a duplicate PNN arguably means that
	 * the timeout should be reset, so this is the simplest
	 * solution.
	 */
	t = talloc_zero_array(rec, uint32_t, len+1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	/* Freeing the old array also fires any destructor/timer tied to
	 * it (see comment above) before the new list is installed */
	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;
}
+
+
+
+static void srvid_disable_and_reply(struct ctdb_recoverd *rec,
+ TDB_DATA data,
+ struct ctdb_op_state *op_state)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+ struct ctdb_disable_message *r;
+ uint32_t timeout;
+ TDB_DATA result;
+ int32_t ret = 0;
+
+ /* Validate input data */
+ if (data.dsize != sizeof(struct ctdb_disable_message)) {
+ DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+ "expecting %lu\n", (long unsigned)data.dsize,
+ (long unsigned)sizeof(struct ctdb_srvid_message)));
+ return;
+ }
+ if (data.dptr == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+ return;
+ }
+
+ r = (struct ctdb_disable_message *)data.dptr;
+ timeout = r->timeout;
+
+ ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
+ if (ret != 0) {
+ goto done;
+ }
+
+ /* Returning our PNN tells the caller that we succeeded */
+ ret = rec->pnn;
+done:
+ result.dsize = sizeof(int32_t);
+ result.dptr = (uint8_t *)&ret;
+ srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
+}
+
+static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+
+ srvid_disable_and_reply(rec, data, rec->takeover_run);
+}
+
+/* Backward compatibility for this SRVID */
+static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+ uint32_t timeout;
+
+ if (data.dsize != sizeof(uint32_t)) {
+ DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+ "expecting %lu\n", (long unsigned)data.dsize,
+ (long unsigned)sizeof(uint32_t)));
+ return;
+ }
+ if (data.dptr == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+ return;
+ }
+
+ timeout = *((uint32_t *)data.dptr);
+
+ ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
+}
+
+static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+
+ srvid_disable_and_reply(rec, data, rec->recovery);
+}
+
+/*
+ handler for ip reallocate, just add it to the list of requests and
+ handle this later in the monitor_cluster loop so we do not recurse
+ with other requests to takeover_run()
+*/
+static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_srvid_message *request;
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+
+ if (data.dsize != sizeof(struct ctdb_srvid_message)) {
+ DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+ return;
+ }
+
+ request = (struct ctdb_srvid_message *)data.dptr;
+
+ srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
+}
+
/*
 * Perform a single takeover run on behalf of all queued IP reallocate
 * requests, then reply to every requester.
 *
 * The reply payload is an int32_t: this node's PNN on success, -1 on
 * failure.
 */
static void process_ipreallocate_requests(struct ctdb_context *ctdb,
					  struct ctdb_recoverd *rec)
{
	TDB_DATA result;
	int32_t ret;
	struct srvid_requests *current;

	/* Only process requests that are currently pending.  More
	 * might come in while the takeover run is in progress and
	 * they will need to be processed later since they might
	 * be in response flag changes.
	 */
	current = rec->reallocate_requests;
	rec->reallocate_requests = NULL;

	if (do_takeover_run(rec, rec->nodemap)) {
		ret = rec->pnn;
	} else {
		ret = -1;
	}

	/* result.dptr points at the stack variable ret - safe because
	 * srvid_requests_reply() is called synchronously below */
	result.dsize = sizeof(int32_t);
	result.dptr = (uint8_t *)&ret;

	srvid_requests_reply(ctdb, &current, result);
}
+
+/*
+ * handler for assigning banning credits
+ */
+static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+ uint32_t ban_pnn;
+
+ /* Ignore if we are not leader */
+ if (!this_node_is_leader(rec)) {
+ return;
+ }
+
+ if (data.dsize != sizeof(uint32_t)) {
+ DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
+ data.dsize));
+ return;
+ }
+
+ ban_pnn = *(uint32_t *)data.dptr;
+
+ ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
+}
+
/*
 * Handler for leader elections
 *
 * Processes a CTDB_SRVID_ELECTION broadcast: restarts the election
 * timeout, then compares the sender's election data against our own.
 * If we would win we schedule a (slightly delayed) counter-request;
 * otherwise we concede, drop any cluster lock and accept the sender
 * as leader.
 */
static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	struct election_message *em = (struct election_message *)data.dptr;

	/* Ignore election packets from ourself */
	if (rec->pnn == em->pnn) {
		return;
	}

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_in_progress = true;
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
				timeval_current_ofs(0, 500000) :
				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	 */
	if (ctdb_election_win(rec, em)) {
		if (!rec->send_election_te) {
			/* Half-second delay before countering, so that
			 * near-simultaneous elections can settle */
			rec->send_election_te = tevent_add_timer(
					ctdb->ev, rec,
					timeval_current_ofs(0, 500000),
					election_send_request, rec);
		}
		return;
	}

	/* we didn't win */
	TALLOC_FREE(rec->send_election_te);

	/* Release the cluster lock file */
	if (cluster_lock_held(rec)) {
		cluster_lock_release(rec);
	}

	/* Set leader to the winner of this round */
	rec->leader = em->pnn;

	return;
}
+
/*
 * Election via cluster lock: whichever node takes the lock becomes
 * leader.
 *
 * A node that cannot be leader releases any lock it holds; a node
 * that already holds the lock keeps it (and the leadership).  In all
 * cases the election is marked as finished on exit.
 */
static void cluster_lock_election(struct ctdb_recoverd *rec)
{
	bool ok;

	if (!this_node_can_be_leader(rec)) {
		if (cluster_lock_held(rec)) {
			cluster_lock_release(rec);
		}
		goto done;
	}

	/*
	 * Don't need to unconditionally release the lock and then
	 * attempt to retake it.  This provides stability.
	 */
	if (cluster_lock_held(rec)) {
		goto done;
	}

	/* No leader until the lock is taken */
	rec->leader = CTDB_UNKNOWN_PNN;

	ok = cluster_lock_take(rec);
	if (ok) {
		rec->leader = rec->pnn;
		D_WARNING("Took cluster lock, leader=%"PRIu32"\n", rec->leader);
	}

done:
	rec->election_in_progress = false;
}
+
/*
 * Force the start of the election process
 *
 * Puts the cluster into recovery mode to quiesce internode traffic,
 * announces that an election is underway, and then runs either the
 * cluster-lock election or the broadcast vote election.
 */
static void force_election(struct ctdb_recoverd *rec)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;

	D_ERR("Start election\n");

	/* set all nodes to recovery mode to stop all internode traffic */
	ret = set_recovery_mode(ctdb, rec, rec->nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return;
	}

	rec->election_in_progress = true;
	/* Let other nodes know that an election is underway */
	leader_broadcast_send(rec, CTDB_UNKNOWN_PNN);

	/* With a cluster lock the election is decided by taking the
	 * lock instead of by broadcast voting */
	if (cluster_lock_enabled(rec)) {
		cluster_lock_election(rec);
		return;
	}

	/* Broadcast vote: (re)arm the election timeout and send our
	 * election data to all nodes */
	talloc_free(rec->election_timeout);
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
			timeval_current_ofs(0, 500000) :
			timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	ret = send_election_request(rec);
	if (ret!=0) {
		DBG_ERR("Failed to initiate leader election\n");
		return;
	}

	/* wait for a few seconds to collect all responses */
	ctdb_wait_election(rec);
}
+
+
+static void srvid_not_implemented(uint64_t srvid,
+ TDB_DATA data,
+ void *private_data)
+{
+ const char *s;
+
+ switch (srvid) {
+ case CTDB_SRVID_SET_NODE_FLAGS:
+ s = "CTDB_SRVID_SET_NODE_FLAGS";
+ break;
+ default:
+ s = "UNKNOWN";
+ }
+
+ D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid);
+}
+
+/*
+ handler for when we need to push out flag changes to all other nodes
+*/
+static void push_flags_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(
+ private_data, struct ctdb_recoverd);
+ struct ctdb_context *ctdb = rec->ctdb;
+ int ret;
+ struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+ struct ctdb_node_map_old *nodemap=NULL;
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+ uint32_t *nodes;
+
+ /* read the node flags from the leader */
+ ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->leader,
+ tmp_ctx, &nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
+ talloc_free(tmp_ctx);
+ return;
+ }
+ if (c->pnn >= nodemap->num) {
+ DBG_ERR("Nodemap from leader does not contain node %d\n",
+ c->pnn);
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ /* send the flags update to all connected nodes */
+ nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+ if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+ nodes, 0, CONTROL_TIMEOUT(),
+ false, data,
+ NULL, NULL,
+ NULL) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
+
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ talloc_free(tmp_ctx);
+}
+
+static void leader_broadcast_timeout_handler(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type_abort(
+ private_data, struct ctdb_recoverd);
+
+ rec->leader_broadcast_timeout_te = NULL;
+
+ D_NOTICE("Leader broadcast timeout\n");
+
+ force_election(rec);
+}
+
/* Cancel any pending leader broadcast timeout timer */
static void leader_broadcast_timeout_cancel(struct ctdb_recoverd *rec)
{
	TALLOC_FREE(rec->leader_broadcast_timeout_te);
}
+
+static int leader_broadcast_timeout_start(struct ctdb_recoverd *rec)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ /*
+ * This should not be necessary. However, there will be
+ * interactions with election code here. It will want to
+ * cancel and restart the timer around potentially long
+ * elections.
+ */
+ leader_broadcast_timeout_cancel(rec);
+
+ rec->leader_broadcast_timeout_te =
+ tevent_add_timer(
+ ctdb->ev,
+ rec,
+ timeval_current_ofs(ctdb_config.leader_timeout, 0),
+ leader_broadcast_timeout_handler,
+ rec);
+ if (rec->leader_broadcast_timeout_te == NULL) {
+ D_ERR("Unable to start leader broadcast timeout\n");
+ return ENOMEM;
+ }
+
+ return 0;
+}
+
+static bool leader_broadcast_timeout_active(struct ctdb_recoverd *rec)
+{
+ return rec->leader_broadcast_timeout_te != NULL;
+}
+
/*
 * Handler for leader broadcasts
 *
 * The payload is a single uint32_t PNN.  CTDB_UNKNOWN_PNN announces
 * that an election is underway somewhere; any other value announces
 * the current leader.  In either case the pending leader broadcast
 * timeout is cancelled, and restarted only when a real leader is
 * known.
 */
static void leader_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type_abort(
		private_data, struct ctdb_recoverd);
	uint32_t pnn;
	size_t npull;
	int ret;

	ret = ctdb_uint32_pull(data.dptr, data.dsize, &pnn, &npull);
	if (ret != 0) {
		DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", ret);
		return;
	}

	leader_broadcast_timeout_cancel(rec);

	if (pnn == rec->leader) {
		/* Known leader re-announced - just rearm the timeout */
		goto done;
	}

	if (pnn == CTDB_UNKNOWN_PNN) {
		bool was_election_in_progress = rec->election_in_progress;

		/*
		 * Leader broadcast timeout was cancelled above - stop
		 * main loop from restarting it until election is
		 * complete
		 */
		rec->election_in_progress = true;

		/*
		 * This is the only notification for a cluster lock
		 * election, so handle it here...
		 */
		if (cluster_lock_enabled(rec) && !was_election_in_progress) {
			cluster_lock_election(rec);
		}

		return;
	}

	D_NOTICE("Received leader broadcast, leader=%"PRIu32"\n", pnn);
	rec->leader = pnn;

done:
	leader_broadcast_timeout_start(rec);
}
+
/* Shared state for the async recmode checks in verify_recmode() */
struct verify_recmode_normal_data {
	uint32_t count;			/* outstanding getrecmode replies */
	enum monitor_result status;	/* aggregated result across nodes */
};
+
+static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
+{
+ struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
+
+
+ /* one more node has responded with recmode data*/
+ rmdata->count--;
+
+ /* if we failed to get the recmode, then return an error and let
+ the main loop try again.
+ */
+ if (state->state != CTDB_CONTROL_DONE) {
+ if (rmdata->status == MONITOR_OK) {
+ rmdata->status = MONITOR_FAILED;
+ }
+ return;
+ }
+
+ /* if we got a response, then the recmode will be stored in the
+ status field
+ */
+ if (state->status != CTDB_RECOVERY_NORMAL) {
+ DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
+ rmdata->status = MONITOR_RECOVERY_NEEDED;
+ }
+
+ return;
+}
+
+
/* verify that all nodes are in normal recovery mode
 *
 * Sends an async getrecmode control to every active node and pumps
 * the event loop until all replies (or failures) are in.  Returns
 * MONITOR_OK, MONITOR_FAILED or MONITOR_RECOVERY_NEEDED.
 */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
{
	struct verify_recmode_normal_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	unsigned int j;

	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->count = 0;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmode call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmode_normal_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}


	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied

	   NOTE: this is a nested event loop; other handlers can run
	   while we wait here */
	while (rmdata->count > 0) {
		tevent_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
+
+
/*
 * Compare the local node's current interface list against the one
 * cached on rec and report whether anything changed (count, names or
 * link states).  The cached list is replaced with the fresh one on
 * every call.  Returns true on change (or on failure to read the
 * interfaces, erring on the side of "changed").
 */
static bool interfaces_have_changed(struct ctdb_context *ctdb,
				    struct ctdb_recoverd *rec)
{
	struct ctdb_iface_list_old *ifaces = NULL;
	TALLOC_CTX *mem_ctx;
	bool ret = false;

	mem_ctx = talloc_new(NULL);

	/* Read the interfaces from the local node */
	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
				 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
		D_ERR("Unable to get interfaces from local node %u\n", rec->pnn);
		/* We could return an error.  However, this will be
		 * rare so we'll decide that the interfaces have
		 * actually changed, just in case.
		 */
		talloc_free(mem_ctx);
		return true;
	}

	if (!rec->ifaces) {
		/* We haven't been here before so things have changed */
		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
		ret = true;
	} else if (rec->ifaces->num != ifaces->num) {
		/* Number of interfaces has changed */
		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
				     rec->ifaces->num, ifaces->num));
		ret = true;
	} else {
		/* See if interface names or link states have changed */
		unsigned int i;
		for (i = 0; i < rec->ifaces->num; i++) {
			struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
				DEBUG(DEBUG_NOTICE,
				      ("Interface in slot %d changed: %s => %s\n",
				       i, iface->name, ifaces->ifaces[i].name));
				ret = true;
				break;
			}
			if (iface->link_state != ifaces->ifaces[i].link_state) {
				DEBUG(DEBUG_NOTICE,
				      ("Interface %s changed state: %d => %d\n",
				       iface->name, iface->link_state,
				       ifaces->ifaces[i].link_state));
				ret = true;
				break;
			}
		}
	}

	/* Cache the fresh list for the next comparison */
	talloc_free(rec->ifaces);
	rec->ifaces = talloc_steal(rec, ifaces);

	talloc_free(mem_ctx);
	return ret;
}
+
/* Check that the local allocation of public IP addresses is correct
 * and do some house-keeping
 *
 * Non-leaders just clear leader-only state.  Otherwise: detect
 * interface changes, look for unhosted IPs this node could serve, and
 * (if do_checkpublicip) compare CTDB's view of assigned IPs with the
 * addresses actually present on interfaces.  Any discrepancy triggers
 * a broadcast takeover-run request.
 *
 * Returns 0, or -1 if the local IP lists could not be read.
 */
static int verify_local_ip_allocation(struct ctdb_recoverd *rec)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	struct ctdb_context *ctdb = rec->ctdb;
	unsigned int j;
	int ret;
	bool need_takeover_run = false;
	struct ctdb_public_ip_list_old *ips = NULL;

	/* If we are not the leader then do some housekeeping */
	if (!this_node_is_leader(rec)) {
		/* Ignore any IP reallocate requests - only leader
		 * processes them
		 */
		TALLOC_FREE(rec->reallocate_requests);
		/* Clear any nodes that should be force rebalanced in
		 * the next takeover run.  If the leader has changed
		 * then we don't want to process these some time in
		 * the future.
		 */
		TALLOC_FREE(rec->force_rebalance_nodes);
	}

	/* Return early if disabled... */
	if (ctdb_config.failover_disabled ||
	    ctdb_op_is_disabled(rec->takeover_run)) {
		talloc_free(mem_ctx);
		return 0;
	}

	if (interfaces_have_changed(ctdb, rec)) {
		need_takeover_run = true;
	}

	/* If there are unhosted IPs but this node can host them then
	 * trigger an IP reallocation */

	/* Read *available* IPs from local node */
	ret = ctdb_ctrl_get_public_ips_flags(
		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
		CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
		talloc_free(mem_ctx);
		return -1;
	}

	for (j=0; j<ips->num; j++) {
		/* NOTE(review): indexing nodemap by rec->pnn assumes the
		 * node's index equals its PNN, matching usage elsewhere in
		 * this file - confirm nodemap cannot be sparse here */
		if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
		    rec->nodemap->nodes[rec->pnn].flags == 0) {
			DEBUG(DEBUG_WARNING,
			      ("Unassigned IP %s can be served by this node\n",
			       ctdb_addr_to_str(&ips->ips[j].addr)));
			need_takeover_run = true;
		}
	}

	talloc_free(ips);

	if (!ctdb->do_checkpublicip) {
		goto done;
	}

	/* Validate the IP addresses that this node has on network
	 * interfaces.  If there is an inconsistency between reality
	 * and the state expected by CTDB then try to fix it by
	 * triggering an IP reallocation or releasing extraneous IP
	 * addresses. */

	/* Read *known* IPs from local node */
	ret = ctdb_ctrl_get_public_ips_flags(
		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
		talloc_free(mem_ctx);
		return -1;
	}

	for (j=0; j<ips->num; j++) {
		if (ips->ips[j].pnn == rec->pnn) {
			if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
				DEBUG(DEBUG_ERR,
				      ("Assigned IP %s not on an interface\n",
				       ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		} else {
			if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
				DEBUG(DEBUG_ERR,
				      ("IP %s incorrectly on an interface\n",
				       ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		}
	}

done:
	if (need_takeover_run) {
		struct ctdb_srvid_message rd;
		TDB_DATA data;

		DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));

		ZERO_STRUCT(rd);
		rd.pnn = rec->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		/* Ask the leader (whichever node that is) to run a
		 * takeover run */
		ret = ctdb_client_send_message(ctdb,
					       CTDB_BROADCAST_CONNECTED,
					       CTDB_SRVID_TAKEOVER_RUN,
					       data);
		if (ret != 0) {
			D_ERR("Failed to send takeover run request\n");
		}
	}
	talloc_free(mem_ctx);
	return 0;
}
+
+
/* Shared state for the async nodemap fetch in get_remote_nodemaps() */
struct remote_nodemaps_state {
	struct ctdb_node_map_old **remote_nodemaps;	/* result array, one slot per local nodemap entry */
	struct ctdb_recoverd *rec;			/* for nodemap lookup and culprit accounting */
};
+
+static void async_getnodemap_callback(struct ctdb_context *ctdb,
+ uint32_t node_pnn,
+ int32_t res,
+ TDB_DATA outdata,
+ void *callback_data)
+{
+ struct remote_nodemaps_state *state =
+ (struct remote_nodemaps_state *)callback_data;
+ struct ctdb_node_map_old **remote_nodemaps = state->remote_nodemaps;
+ struct ctdb_node_map_old *nodemap = state->rec->nodemap;
+ size_t i;
+
+ for (i = 0; i < nodemap->num; i++) {
+ if (nodemap->nodes[i].pnn == node_pnn) {
+ break;
+ }
+ }
+
+ if (i >= nodemap->num) {
+ DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn);
+ return;
+ }
+
+ remote_nodemaps[i] = (struct ctdb_node_map_old *)talloc_steal(
+ remote_nodemaps, outdata.dptr);
+
+}
+
+static void async_getnodemap_error(struct ctdb_context *ctdb,
+ uint32_t node_pnn,
+ int32_t res,
+ TDB_DATA outdata,
+ void *callback_data)
+{
+ struct remote_nodemaps_state *state =
+ (struct remote_nodemaps_state *)callback_data;
+ struct ctdb_recoverd *rec = state->rec;
+
+ DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn);
+ ctdb_set_culprit(rec, node_pnn);
+}
+
+static int get_remote_nodemaps(struct ctdb_recoverd *rec,
+ TALLOC_CTX *mem_ctx,
+ struct ctdb_node_map_old ***remote_nodemaps)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+ struct ctdb_node_map_old **t;
+ uint32_t *nodes;
+ struct remote_nodemaps_state state;
+ int ret;
+
+ t = talloc_zero_array(mem_ctx,
+ struct ctdb_node_map_old *,
+ rec->nodemap->num);
+ if (t == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ return -1;
+ }
+
+ nodes = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false);
+
+ state.remote_nodemaps = t;
+ state.rec = rec;
+
+ ret = ctdb_client_async_control(ctdb,
+ CTDB_CONTROL_GET_NODEMAP,
+ nodes,
+ 0,
+ CONTROL_TIMEOUT(),
+ false,
+ tdb_null,
+ async_getnodemap_callback,
+ async_getnodemap_error,
+ &state);
+ talloc_free(nodes);
+
+ if (ret != 0) {
+ talloc_free(t);
+ return ret;
+ }
+
+ *remote_nodemaps = t;
+ return 0;
+}
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+ TALLOC_CTX *mem_ctx)
+{
+ struct ctdb_node_map_old *nodemap=NULL;
+ struct ctdb_node_map_old **remote_nodemaps=NULL;
+ struct ctdb_vnn_map *vnnmap=NULL;
+ struct ctdb_vnn_map *remote_vnnmap=NULL;
+ uint32_t num_lmasters;
+ int32_t debug_level;
+ unsigned int i, j;
+ int ret;
+ bool self_ban;
+
+
+ /* verify that the main daemon is still running */
+ if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
+ DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
+ exit(-1);
+ }
+
+ /* ping the local daemon to tell it we are alive */
+ ctdb_ctrl_recd_ping(ctdb);
+
+ if (rec->election_in_progress) {
+ /* an election is in progress */
+ return;
+ }
+
+ /*
+ * Start leader broadcasts if they are not active (1st time
+ * through main loop? Memory allocation error?)
+ */
+ if (!leader_broadcast_loop_active(rec)) {
+ ret = leader_broadcast_loop(rec);
+ if (ret != 0) {
+ D_ERR("Failed to set up leader broadcast\n");
+ ctdb_set_culprit(rec, rec->pnn);
+ }
+ }
+ /*
+ * Similar for leader broadcast timeouts. These can also have
+ * been stopped by another node receiving a leader broadcast
+ * timeout and transmitting an "unknown leader broadcast".
+ * Note that this should never be done during an election - at
+ * the moment there is nothing between here and the above
+ * election-in-progress check that can process an election
+ * result (i.e. no event loop).
+ */
+ if (!leader_broadcast_timeout_active(rec)) {
+ ret = leader_broadcast_timeout_start(rec);
+ if (ret != 0) {
+ ctdb_set_culprit(rec, rec->pnn);
+ }
+ }
+
+
+ /* read the debug level from the parent and update locally */
+ ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
+ if (ret !=0) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
+ return;
+ }
+ debuglevel_set(debug_level);
+
+ /* get relevant tunables */
+ ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
+ return;
+ }
+
+ /* get runstate */
+ ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &ctdb->runstate);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
+ return;
+ }
+
+ /* get nodemap */
+ ret = ctdb_ctrl_getnodemap(ctdb,
+ CONTROL_TIMEOUT(),
+ rec->pnn,
+ rec,
+ &nodemap);
+ if (ret != 0) {
+ DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", rec->pnn);
+ return;
+ }
+ talloc_free(rec->nodemap);
+ rec->nodemap = nodemap;
+
+ /* remember our own node flags */
+ rec->node_flags = nodemap->nodes[rec->pnn].flags;
+
+ ban_misbehaving_nodes(rec, &self_ban);
+ if (self_ban) {
+ DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
+ return;
+ }
+
+ ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &ctdb->recovery_mode);
+ if (ret != 0) {
+ D_ERR("Failed to read recmode from local node\n");
+ return;
+ }
+
+ /* if the local daemon is STOPPED or BANNED, we verify that the databases are
+ also frozen and that the recmode is set to active.
+ */
+ if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+ /* If this node has become inactive then we want to
+ * reduce the chances of it taking over the leader
+ * role when it becomes active again. This
+ * helps to stabilise the leader role so that
+ * it stays on the most stable node.
+ */
+ rec->priority_time = timeval_current();
+
+ if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+ DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
+
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
+
+ return;
+ }
+ }
+ if (! rec->frozen_on_inactive) {
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Failed to freeze node "
+ "in STOPPED or BANNED state\n"));
+ return;
+ }
+
+ rec->frozen_on_inactive = true;
+ }
+
+ /* If this node is stopped or banned then it is not the recovery
+ * master, so don't do anything. This prevents stopped or banned
+ * node from starting election and sending unnecessary controls.
+ */
+ return;
+ }
+
+ rec->frozen_on_inactive = false;
+
+ /* Retrieve capabilities from all connected nodes */
+ ret = update_capabilities(rec, nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+ return;
+ }
+
+ if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+ /* Check if an IP takeover run is needed and trigger one if
+ * necessary */
+ verify_local_ip_allocation(rec);
+ }
+
+ /* If this node is not the leader then skip recovery checks */
+ if (!this_node_is_leader(rec)) {
+ return;
+ }
+
+
+ /* Get the nodemaps for all connected remote nodes */
+ ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps);
+ if (ret != 0) {
+ DBG_ERR("Failed to read remote nodemaps\n");
+ return;
+ }
+
+ /* Ensure our local and remote flags are correct */
+ ret = update_flags(rec, nodemap, remote_nodemaps);
+ if (ret != 0) {
+ D_ERR("Unable to update flags\n");
+ return;
+ }
+
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ ctdb_load_nodes_file(ctdb);
+ return;
+ }
+
+ /* get the vnnmap */
+ ret = ctdb_ctrl_getvnnmap(ctdb,
+ CONTROL_TIMEOUT(),
+ rec->pnn,
+ mem_ctx,
+ &vnnmap);
+ if (ret != 0) {
+ DBG_ERR("Unable to get vnnmap from node %u\n", rec->pnn);
+ return;
+ }
+
+ if (rec->need_recovery) {
+ /* a previous recovery didn't finish */
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify that all active nodes are in normal mode
+ and not in recovery mode
+ */
+ switch (verify_recmode(ctdb, nodemap)) {
+ case MONITOR_RECOVERY_NEEDED:
+ do_recovery(rec, mem_ctx);
+ return;
+ case MONITOR_FAILED:
+ return;
+ case MONITOR_ELECTION_NEEDED:
+ /* can not happen */
+ case MONITOR_OK:
+ break;
+ }
+
+ if (cluster_lock_enabled(rec)) {
+ /* We must already hold the cluster lock */
+ if (!cluster_lock_held(rec)) {
+ D_ERR("Failed cluster lock sanity check\n");
+ ctdb_set_culprit(rec, rec->pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+
+
+ /* If recoveries are disabled then there is no use doing any
+ * nodemap or flags checks. Recoveries might be disabled due
+ * to "reloadnodes", so doing these checks might cause an
+ * unnecessary recovery. */
+ if (ctdb_op_is_disabled(rec->recovery)) {
+ goto takeover_run_checks;
+ }
+
+ /* verify that all other nodes have the same nodemap as we have
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+
+ /* if the nodes disagree on how many nodes there are
+ then this is a good reason to try recovery
+ */
+ if (remote_nodemaps[j]->num != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
+ nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* if the nodes disagree on which nodes exist and are
+ active, then that is also a good reason to do recovery
+ */
+ for (i=0;i<nodemap->num;i++) {
+ if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
+ nodemap->nodes[j].pnn, i,
+ remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+ }
+
+ /* count how many active nodes there are */
+ num_lmasters = 0;
+ for (i=0; i<nodemap->num; i++) {
+ if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
+ if (ctdb_node_has_capabilities(rec->caps,
+ ctdb->nodes[i]->pnn,
+ CTDB_CAP_LMASTER)) {
+ num_lmasters++;
+ }
+ }
+ }
+
+
+ /* There must be the same number of lmasters in the vnn map as
+ * there are active nodes with the lmaster capability... or
+ * do a recovery.
+ */
+ if (vnnmap->size != num_lmasters) {
+ DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
+ vnnmap->size, num_lmasters));
+ ctdb_set_culprit(rec, rec->pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /*
+ * Verify that all active lmaster nodes in the nodemap also
+ * exist in the vnnmap
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+ if (! ctdb_node_has_capabilities(rec->caps,
+ nodemap->nodes[j].pnn,
+ CTDB_CAP_LMASTER)) {
+ continue;
+ }
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+
+ for (i=0; i<vnnmap->size; i++) {
+ if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
+ break;
+ }
+ }
+ if (i == vnnmap->size) {
+ D_ERR("Active LMASTER node %u is not in the vnnmap\n",
+ nodemap->nodes[j].pnn);
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+
+
+ /* verify that all other nodes have the same vnnmap
+ and are from the same generation
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+
+ ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
+ mem_ctx, &remote_vnnmap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
+ nodemap->nodes[j].pnn));
+ return;
+ }
+
+ /* verify the vnnmap generation is the same */
+ if (vnnmap->generation != remote_vnnmap->generation) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
+ nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify the vnnmap size is the same */
+ if (vnnmap->size != remote_vnnmap->size) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
+ nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify the vnnmap is the same */
+ for (i=0;i<vnnmap->size;i++) {
+ if (remote_vnnmap->map[i] != vnnmap->map[i]) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
+ nodemap->nodes[j].pnn));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+ }
+
+ /* FIXME: Add remote public IP checking to ensure that nodes
+ * have the IP addresses that are allocated to them. */
+
+takeover_run_checks:
+
+ /* If there are IP takeover runs requested or the previous one
+ * failed then perform one and notify the waiters */
+ if (!ctdb_op_is_disabled(rec->takeover_run) &&
+ (rec->reallocate_requests || rec->need_takeover_run)) {
+ process_ipreallocate_requests(ctdb, rec);
+ }
+}
+
+/*
+ * SIGTERM handler for the recovery daemon: release the cluster lock
+ * (if held) and terminate immediately with status 0.
+ */
+static void recd_sig_term_handler(struct tevent_context *ev,
+				  struct tevent_signal *se, int signum,
+				  int count, void *dont_care,
+				  void *private_data)
+{
+	struct ctdb_recoverd *rec;
+
+	rec = talloc_get_type_abort(private_data, struct ctdb_recoverd);
+
+	DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
+	cluster_lock_release(rec);
+	exit(0);
+}
+
+/*
+ * Periodically log elements of the cluster state
+ *
+ * This can be used to confirm a split brain has occurred
+ *
+ * Runs only on the leader node.  Tracks, via a function-static
+ * timestamp, how long the cluster has been "incomplete" (at least one
+ * non-deleted remote node disconnected) and logs with decreasing
+ * frequency the longer the condition persists.  Always re-arms itself
+ * to run again in 60 seconds.
+ */
+static void maybe_log_cluster_state(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval current_time,
+				    void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct tevent_timer *tt;
+
+	/* Time at which the cluster was first seen to be incomplete;
+	 * a zero timestamp means it was complete at the last check */
+	static struct timeval start_incomplete = {
+		.tv_sec = 0,
+	};
+
+	bool is_complete;
+	bool was_complete;
+	unsigned int i;
+	double seconds;
+	unsigned int minutes;
+	unsigned int num_connected;
+
+	/* Only the leader logs cluster-wide state */
+	if (!this_node_is_leader(rec)) {
+		goto done;
+	}
+
+	if (rec->nodemap == NULL) {
+		goto done;
+	}
+
+	/* Scan the nodemap: any disconnected, non-deleted remote node
+	 * makes the cluster incomplete */
+	is_complete = true;
+	num_connected = 0;
+	for (i = 0; i < rec->nodemap->num; i++) {
+		struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
+
+		if (n->pnn == rec->pnn) {
+			continue;
+		}
+		if ((n->flags & NODE_FLAGS_DELETED) != 0) {
+			continue;
+		}
+		if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
+			is_complete = false;
+			continue;
+		}
+
+		num_connected++;
+	}
+
+	was_complete = timeval_is_zero(&start_incomplete);
+
+	if (is_complete) {
+		if (! was_complete) {
+			/* Transition incomplete -> complete: log once
+			 * and reset the tracking timestamp */
+			D_WARNING("Cluster complete with leader=%u\n",
+				  rec->leader);
+			start_incomplete = timeval_zero();
+		}
+		goto done;
+	}
+
+	/* Cluster is newly incomplete... */
+	if (was_complete) {
+		start_incomplete = current_time;
+		minutes = 0;
+		goto log;
+	}
+
+	/*
+	 * Cluster has been incomplete since previous check, so figure
+	 * out how long (in minutes) and decide whether to log anything
+	 */
+	seconds = timeval_elapsed2(&start_incomplete, &current_time);
+	minutes = (unsigned int)seconds / 60;
+	if (minutes >= 60) {
+		/* Over an hour, log every hour */
+		if (minutes % 60 != 0) {
+			goto done;
+		}
+	} else if (minutes >= 10) {
+		/* Over 10 minutes, log every 10 minutes */
+		if (minutes % 10 != 0) {
+			goto done;
+		}
+	}
+
+log:
+	D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, "
+		  "connected=%u\n",
+		  rec->leader,
+		  minutes,
+		  num_connected);
+
+done:
+	/* Re-arm: check again in one minute */
+	tt = tevent_add_timer(ctdb->ev,
+			      rec,
+			      timeval_current_ofs(60, 0),
+			      maybe_log_cluster_state,
+			      rec);
+	if (tt == NULL) {
+		DBG_WARNING("Failed to set up cluster state timer\n");
+	}
+}
+
+/* SIGHUP hook: forward the signal to the recovery helper, if running */
+static void recd_sighup_hook(void *private_data)
+{
+	struct ctdb_recoverd *rec;
+
+	rec = talloc_get_type_abort(private_data, struct ctdb_recoverd);
+
+	if (rec->helper_pid <= 0) {
+		return;
+	}
+
+	kill(rec->helper_pid, SIGHUP);
+}
+
+/*
+  the main monitoring loop
+
+  Sets up the recovery daemon state (struct ctdb_recoverd), installs
+  signal handlers and srvid message handlers, then runs main_loop()
+  forever, pacing iterations by the RecoverInterval tunable.  This
+  function never returns.
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+	struct tevent_signal *se;
+	struct ctdb_recoverd *rec;
+	bool status;
+
+	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+	rec = talloc_zero(ctdb, struct ctdb_recoverd);
+	CTDB_NO_MEMORY_FATAL(ctdb, rec);
+
+	rec->ctdb = ctdb;
+	rec->leader = CTDB_UNKNOWN_PNN;
+	rec->pnn = ctdb_get_pnn(ctdb);
+	rec->cluster_lock_handle = NULL;
+	rec->helper_pid = -1;
+
+	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
+	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
+
+	rec->recovery = ctdb_op_init(rec, "recoveries");
+	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
+
+	rec->priority_time = timeval_current();
+	rec->frozen_on_inactive = false;
+
+	/* SIGHUP is forwarded to the recovery helper (log reopen) */
+	status = logging_setup_sighup_handler(rec->ctdb->ev,
+					      rec,
+					      recd_sighup_hook,
+					      rec);
+	if (!status) {
+		D_ERR("Failed to install SIGHUP handler\n");
+		exit(1);
+	}
+
+	/* SIGTERM releases the cluster lock before exiting */
+	se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
+			       recd_sig_term_handler, rec);
+	if (se == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
+		exit(1);
+	}
+
+	/* Without a cluster lock, periodically log cluster state to
+	 * help diagnose a potential split brain */
+	if (!cluster_lock_enabled(rec)) {
+		struct tevent_timer *tt;
+
+		tt = tevent_add_timer(ctdb->ev,
+				      rec,
+				      timeval_current_ofs(60, 0),
+				      maybe_log_cluster_state,
+				      rec);
+		if (tt == NULL) {
+			DBG_WARNING("Failed to set up cluster state timer\n");
+		}
+	}
+
+	/* register a message port for sending memory dumps */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+	/* when a node is assigned banning credits */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
+					banning_handler, rec);
+
+	/* register a message port for recovery elections */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
+
+	ctdb_client_set_message_handler(ctdb,
+					CTDB_SRVID_SET_NODE_FLAGS,
+					srvid_not_implemented,
+					rec);
+
+	/* when we are asked to push out a flag change */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+	/* register a message port for reloadnodes */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+	/* register a message port for performing a takeover run */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+	/* register a message port for disabling the ip check for a short while */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+	/* register a message port for forcing a rebalance of a node next
+	   reallocation */
+	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
+
+	/* Register a message port for disabling takeover runs */
+	ctdb_client_set_message_handler(ctdb,
+					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+					disable_takeover_runs_handler, rec);
+
+	/* Register a message port for disabling recoveries */
+	ctdb_client_set_message_handler(ctdb,
+					CTDB_SRVID_DISABLE_RECOVERIES,
+					disable_recoveries_handler, rec);
+
+	ctdb_client_set_message_handler(ctdb,
+					CTDB_SRVID_LEADER,
+					leader_handler,
+					rec);
+
+	for (;;) {
+		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+		struct timeval start;
+		double elapsed;
+
+		if (!mem_ctx) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  " Failed to create temp context\n"));
+			exit(-1);
+		}
+
+		start = timeval_current();
+		main_loop(ctdb, rec, mem_ctx);
+		talloc_free(mem_ctx);
+
+		/* Pace the loop: run at most once per RecoverInterval
+		 * seconds */
+		elapsed = timeval_elapsed(&start);
+		if (elapsed < ctdb->tunable.recover_interval) {
+			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+					  - elapsed);
+		}
+	}
+}
+
+/*
+  event handler for when the main ctdbd dies
+
+  The parent keeps the write end of a pipe open and the recovery
+  daemon watches the read end.  The fd only becomes readable (EOF)
+  when the parent exits and the pipe is closed, so any activity here
+  means the main daemon is gone; _exit() skips cleanup handlers.
+ */
+static void ctdb_recoverd_parent(struct tevent_context *ev,
+				 struct tevent_fd *fde,
+				 uint16_t flags, void *private_data)
+{
+	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
+	_exit(1);
+}
+
+/*
+ * Called regularly (every 30s) in the main daemon to verify that the
+ * recovery daemon is still alive; if not, a restart is scheduled,
+ * otherwise the check re-arms itself.
+ */
+static void ctdb_check_recd(struct tevent_context *ev,
+			    struct tevent_timer *te,
+			    struct timeval yt, void *p)
+{
+	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+	int ret;
+
+	/* Signal 0 only probes for existence of the process */
+	ret = ctdb_kill(ctdb, ctdb->recoverd_pid, 0);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
+
+		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
+				 ctdb_restart_recd, ctdb);
+
+		return;
+	}
+
+	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
+			 timeval_current_ofs(30, 0),
+			 ctdb_check_recd, ctdb);
+}
+
+/*
+ * SIGCHLD handler: reap all exited children of the recovery daemon so
+ * they do not linger as zombies.
+ */
+static void recd_sig_child_handler(struct tevent_context *ev,
+				   struct tevent_signal *se, int signum,
+				   int count, void *dont_care,
+				   void *private_data)
+{
+	int status;
+
+	for (;;) {
+		pid_t pid = waitpid(-1, &status, WNOHANG);
+
+		if (pid == 0) {
+			/* Children remain but none have exited yet */
+			return;
+		}
+		if (pid == -1) {
+			/* ECHILD just means there was nothing to reap */
+			if (errno != ECHILD) {
+				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
+			}
+			return;
+		}
+		DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
+	}
+}
+
+/*
+  startup the recovery daemon as a child of the main ctdb daemon
+
+  Returns 0 in the parent on success, -1 on failure.  The child never
+  returns normally: it becomes the recovery daemon and runs
+  monitor_cluster() forever.
+ */
+int ctdb_start_recoverd(struct ctdb_context *ctdb)
+{
+	int fd[2];
+	struct tevent_signal *se;
+	struct tevent_fd *fde;
+	int ret;
+
+	/* The pipe is used only for parent-death detection: the parent
+	 * keeps the write end, the child watches the read end */
+	if (pipe(fd) != 0) {
+		return -1;
+	}
+
+	ctdb->recoverd_pid = ctdb_fork(ctdb);
+	if (ctdb->recoverd_pid == -1) {
+		/* Don't leak the pipe fds when the fork fails */
+		close(fd[0]);
+		close(fd[1]);
+		return -1;
+	}
+
+	if (ctdb->recoverd_pid != 0) {
+		/* Parent: fresh context to own the liveness-check timer */
+		talloc_free(ctdb->recd_ctx);
+		ctdb->recd_ctx = talloc_new(ctdb);
+		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
+
+		close(fd[0]);
+		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
+				 timeval_current_ofs(30, 0),
+				 ctdb_check_recd, ctdb);
+		return 0;
+	}
+
+	/* Child: drop the write end so EOF on fd[0] signals parent death */
+	close(fd[1]);
+
+	srandom(getpid() ^ time(NULL));
+
+	ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
+	if (ret != 0) {
+		return -1;
+	}
+
+	prctl_set_comment("ctdb_recoverd");
+	if (switch_from_server_to_client(ctdb) != 0) {
+		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
+		exit(1);
+	}
+
+	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+
+	/* Exit when the parent dies - see ctdb_recoverd_parent() */
+	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
+			    ctdb_recoverd_parent, &fd[0]);
+	tevent_fd_set_auto_close(fde);
+
+	/* set up a handler to pick up sigchld */
+	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
+			       recd_sig_child_handler, ctdb);
+	if (se == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
+		exit(1);
+	}
+
+	monitor_cluster(ctdb);
+
+	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
+	return -1;
+}
+
+/*
+  shutdown the recovery daemon
+
+  Sends SIGTERM to the recovery daemon (which releases the cluster
+  lock and exits - see recd_sig_term_handler) and frees the
+  parent-side monitoring state.  A no-op if no daemon was started.
+ */
+void ctdb_stop_recoverd(struct ctdb_context *ctdb)
+{
+	if (ctdb->recoverd_pid == 0) {
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
+	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
+
+	/* Also cancels the ctdb_check_recd() liveness timer, which is
+	 * parented to recd_ctx */
+	TALLOC_FREE(ctdb->recd_ctx);
+	TALLOC_FREE(ctdb->recd_ping_count);
+}
+
+/*
+ * Timer callback scheduled by ctdb_check_recd() when the recovery
+ * daemon has died: tear down any remnants and fork a new one.
+ */
+static void ctdb_restart_recd(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+	ctdb_stop_recoverd(ctdb);
+	/* NOTE(review): a failure of ctdb_start_recoverd() is silently
+	 * ignored here - the next ctdb_check_recd() pass retries */
+	ctdb_start_recoverd(ctdb);
+}
diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c
new file mode 100644
index 0000000..4df4841
--- /dev/null
+++ b/ctdb/server/ctdb_recovery_helper.c
@@ -0,0 +1,3200 @@
+/*
+ ctdb parallel database recovery
+
+ Copyright (C) Amitay Isaacs 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+#include <libgen.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+#include "lib/util/tevent_unix.h"
+#include "lib/util/util.h"
+#include "lib/util/smb_strtox.h"
+
+#include "protocol/protocol.h"
+#include "protocol/protocol_api.h"
+#include "client/client.h"
+
+#include "common/logging.h"
+
+static int recover_timeout = 30;
+
+#define NUM_RETRIES 3
+
+#define TIMEOUT() timeval_current_ofs(recover_timeout, 0)
+
+/*
+ * Utility functions
+ */
+
+/*
+ * Generic request receiver: report success unless the request failed
+ * with a unix error, in which case return false and pass the error
+ * out through perr (when non-NULL).
+ */
+static bool generic_recv(struct tevent_req *req, int *perr)
+{
+	int err;
+
+	if (!tevent_req_is_unix_error(req, &err)) {
+		return true;
+	}
+
+	if (perr != NULL) {
+		*perr = err;
+	}
+
+	return false;
+}
+
+/* Last srvid handed out; seeded with the recovery base srvid */
+static uint64_t rec_srvid = CTDB_SRVID_RECOVERY;
+
+/* Hand out a fresh, monotonically increasing transient srvid */
+static uint64_t srvid_next(void)
+{
+	return ++rec_srvid;
+}
+
+/*
+ * Node related functions
+ */
+
+/* A set of cluster nodes with per-node capability and ban-credit
+ * bookkeeping; the first count of size slots are in use */
+struct node_list {
+	uint32_t *pnn_list;	/* node numbers, one per entry */
+	uint32_t *caps;		/* CTDB_CAP_* capability bits per node */
+	uint32_t *ban_credits;	/* misbehaviour count per node */
+	unsigned int size;	/* allocated entries */
+	unsigned int count;	/* used entries */
+};
+
+/*
+ * Allocate a node_list that can hold up to @size nodes.  All pnn
+ * slots start as CTDB_UNKNOWN_PNN and all capability/ban counters as
+ * zero.  Returns NULL on allocation failure.
+ */
+static struct node_list *node_list_init(TALLOC_CTX *mem_ctx, unsigned int size)
+{
+	struct node_list *nlist;
+	unsigned int i;
+
+	nlist = talloc_zero(mem_ctx, struct node_list);
+	if (nlist == NULL) {
+		return NULL;
+	}
+
+	nlist->size = size;
+	nlist->pnn_list = talloc_array(nlist, uint32_t, size);
+	nlist->caps = talloc_zero_array(nlist, uint32_t, size);
+	nlist->ban_credits = talloc_zero_array(nlist, uint32_t, size);
+
+	if (nlist->pnn_list == NULL ||
+	    nlist->caps == NULL ||
+	    nlist->ban_credits == NULL) {
+		talloc_free(nlist);
+		return NULL;
+	}
+
+	for (i = 0; i < size; i++) {
+		nlist->pnn_list[i] = CTDB_UNKNOWN_PNN;
+	}
+
+	return nlist;
+}
+
+/*
+ * Append @pnn to the list.  Fails (returns false) if the list is full
+ * or the node is already present.
+ */
+static bool node_list_add(struct node_list *nlist, uint32_t pnn)
+{
+	unsigned int i;
+
+	if (nlist->count >= nlist->size) {
+		return false;
+	}
+
+	for (i = 0; i < nlist->count; i++) {
+		if (nlist->pnn_list[i] == pnn) {
+			return false;
+		}
+	}
+
+	nlist->pnn_list[nlist->count++] = pnn;
+
+	return true;
+}
+
+/*
+ * Build (on @mem_ctx) the subset of nodes that have the LMASTER
+ * capability; the number of entries is returned through @pnn_count.
+ * Returns NULL on allocation failure.
+ */
+static uint32_t *node_list_lmaster(struct node_list *nlist,
+				   TALLOC_CTX *mem_ctx,
+				   unsigned int *pnn_count)
+{
+	uint32_t *pnn_list;
+	unsigned int count = 0;
+	unsigned int i;
+
+	pnn_list = talloc_zero_array(mem_ctx, uint32_t, nlist->count);
+	if (pnn_list == NULL) {
+		return NULL;
+	}
+
+	for (i = 0; i < nlist->count; i++) {
+		if (nlist->caps[i] & CTDB_CAP_LMASTER) {
+			pnn_list[count++] = nlist->pnn_list[i];
+		}
+	}
+
+	*pnn_count = count;
+	return pnn_list;
+}
+
+/* Add one ban credit to @pnn, if it is in the list; unknown nodes are
+ * silently ignored */
+static void node_list_ban_credits(struct node_list *nlist, uint32_t pnn)
+{
+	unsigned int i;
+
+	for (i = 0; i < nlist->count; i++) {
+		if (nlist->pnn_list[i] != pnn) {
+			continue;
+		}
+		nlist->ban_credits[i] += 1;
+		return;
+	}
+}
+
+/*
+ * Database list functions
+ *
+ * Simple, naive implementation that could be updated to a db_hash or similar
+ */
+
+/* One clustered database: its id, flags and the nodes on which it is
+ * known to be attached */
+struct db {
+	struct db *prev, *next;
+
+	uint32_t db_id;
+	uint32_t db_flags;
+	uint32_t *pnn_list;	/* nodes attached; capacity num_nodes of list */
+	unsigned int num_nodes;	/* used entries in pnn_list */
+};
+
+/* Linked list of all known databases in a cluster of num_nodes nodes */
+struct db_list {
+	unsigned int num_dbs;
+	struct db *db;		/* head of the doubly linked list */
+	unsigned int num_nodes;	/* cluster size; sizes each pnn_list */
+};
+
+/*
+ * Allocate an empty database list for a cluster of @num_nodes nodes.
+ * Returns NULL on allocation failure.
+ */
+static struct db_list *db_list_init(TALLOC_CTX *mem_ctx, unsigned int num_nodes)
+{
+	struct db_list *l;
+
+	l = talloc_zero(mem_ctx, struct db_list);
+	if (l == NULL) {
+		/* Avoid dereferencing NULL on allocation failure */
+		return NULL;
+	}
+	l->num_nodes = num_nodes;
+
+	return l;
+}
+
+/* Look up a database by id; NULL if the list is NULL or the id is
+ * unknown */
+static struct db *db_list_find(struct db_list *dblist, uint32_t db_id)
+{
+	struct db *db;
+
+	if (dblist == NULL) {
+		return NULL;
+	}
+
+	for (db = dblist->db; db != NULL; db = db->next) {
+		if (db->db_id == db_id) {
+			break;
+		}
+	}
+
+	return db;
+}
+
+/*
+ * Append a new database entry, seeded with @node as the first node on
+ * which it is attached.  Returns 0, EINVAL or ENOMEM.
+ */
+static int db_list_add(struct db_list *dblist,
+		       uint32_t db_id,
+		       uint32_t db_flags,
+		       uint32_t node)
+{
+	struct db *db;
+
+	if (dblist == NULL) {
+		return EINVAL;
+	}
+
+	db = talloc_zero(dblist, struct db);
+	if (db == NULL) {
+		return ENOMEM;
+	}
+
+	db->pnn_list = talloc_zero_array(db, uint32_t, dblist->num_nodes);
+	if (db->pnn_list == NULL) {
+		talloc_free(db);
+		return ENOMEM;
+	}
+
+	db->db_id = db_id;
+	db->db_flags = db_flags;
+	db->pnn_list[0] = node;
+	db->num_nodes = 1;
+
+	DLIST_ADD_END(dblist->db, db);
+	dblist->num_dbs++;
+
+	return 0;
+}
+
+/*
+ * Record that database @db_id (with @db_flags) is attached on @node.
+ * Creates a new entry on first sight; otherwise verifies that the
+ * flags agree and appends the node.  Returns 0 or an errno value.
+ */
+static int db_list_check_and_add(struct db_list *dblist,
+				 uint32_t db_id,
+				 uint32_t db_flags,
+				 uint32_t node)
+{
+	struct db *db;
+
+	/*
+	 * These flags are masked out because they are only set on a
+	 * node when a client attaches to that node, so they might not
+	 * be set yet.  They can't be passed as part of the attach, so
+	 * they're no use here.
+	 */
+	db_flags &= ~(CTDB_DB_FLAGS_READONLY | CTDB_DB_FLAGS_STICKY);
+
+	if (dblist == NULL) {
+		return EINVAL;
+	}
+
+	db = db_list_find(dblist, db_id);
+	if (db == NULL) {
+		return db_list_add(dblist, db_id, db_flags, node);
+	}
+
+	if (db->db_flags != db_flags) {
+		D_ERR("Incompatible database flags for 0x%"PRIx32" "
+		      "(0x%"PRIx32" != 0x%"PRIx32")\n",
+		      db_id,
+		      db_flags,
+		      db->db_flags);
+		return EINVAL;
+	}
+
+	if (db->num_nodes >= dblist->num_nodes) {
+		return EINVAL;
+	}
+
+	db->pnn_list[db->num_nodes] = node;
+	db->num_nodes++;
+
+	return 0;
+}
+
+/*
+ * Create database on nodes where it is missing
+ */
+
+/* In-flight state for db_create_missing_send(): attach the named
+ * database on every node in nlist that does not already have it */
+struct db_create_missing_state {
+	struct tevent_context *ev;
+	struct ctdb_client_context *client;
+
+	struct node_list *nlist;
+
+	const char *db_name;
+	uint32_t *missing_pnn_list;	/* nodes lacking the database */
+	int missing_num_nodes;		/* used entries in the list above */
+};
+
+static void db_create_missing_done(struct tevent_req *subreq);
+
+/*
+ * Async computation: attach database @db_name on every node in @nlist
+ * that is not listed in @db->pnn_list, using the attach control that
+ * matches the database type (persistent/replicated/volatile).
+ * Completes immediately when no node is missing the database.
+ */
+static struct tevent_req *db_create_missing_send(
+	TALLOC_CTX *mem_ctx,
+	struct tevent_context *ev,
+	struct ctdb_client_context *client,
+	struct node_list *nlist,
+	const char *db_name,
+	struct db *db)
+{
+	struct tevent_req *req, *subreq;
+	struct db_create_missing_state *state;
+	struct ctdb_req_control request;
+	unsigned int i, j;
+
+	req = tevent_req_create(mem_ctx,
+				&state,
+				struct db_create_missing_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->nlist = nlist;
+	state->db_name = db_name;
+
+	/* Database already attached everywhere - nothing to do */
+	if (nlist->count == db->num_nodes) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	/* NOTE(review): allocated on mem_ctx rather than state, so the
+	 * list outlives the request - confirm this is intentional */
+	state->missing_pnn_list = talloc_array(mem_ctx, uint32_t, nlist->count);
+	if (tevent_req_nomem(state->missing_pnn_list, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	/* Collect the nodes that do not appear in db->pnn_list */
+	for (i = 0; i < nlist->count; i++) {
+		uint32_t pnn = nlist->pnn_list[i] ;
+
+		for (j = 0; j < db->num_nodes; j++) {
+			if (pnn == db->pnn_list[j]) {
+				break;
+			}
+		}
+
+		if (j < db->num_nodes) {
+			continue;
+		}
+
+		DBG_INFO("Create database %s on node %u\n",
+			 state->db_name,
+			 pnn);
+		state->missing_pnn_list[state->missing_num_nodes] = pnn;
+		state->missing_num_nodes++;
+	}
+
+	/* Pick the attach control variant matching the db type */
+	if (db->db_flags & CTDB_DB_FLAGS_PERSISTENT) {
+		ctdb_req_control_db_attach_persistent(&request, db_name);
+	} else if (db->db_flags & CTDB_DB_FLAGS_REPLICATED) {
+		ctdb_req_control_db_attach_replicated(&request, db_name);
+	} else {
+		ctdb_req_control_db_attach(&request, db_name);
+	}
+	request.flags = CTDB_CTRL_FLAG_ATTACH_RECOVERY;
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->missing_pnn_list,
+						state->missing_num_nodes,
+						TIMEOUT(),
+						&request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, db_create_missing_done, req);
+
+	return req;
+}
+
+/*
+ * All DB_ATTACH controls have completed.  On failure, blame the first
+ * failing node with a ban credit when it can be identified from the
+ * per-node error list, then fail the request.
+ */
+static void db_create_missing_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct db_create_missing_state *state = tevent_req_data(
+		req, struct db_create_missing_state);
+	int *err_list;
+	int ret;
+	bool status;
+
+	status = ctdb_client_control_multi_recv(subreq,
+						&ret,
+						NULL,
+						&err_list,
+						NULL);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		int ret2;
+		uint32_t pnn;
+
+		/* Map the per-node error list back to a culprit node */
+		ret2 = ctdb_client_control_multi_error(
+					state->missing_pnn_list,
+					state->missing_num_nodes,
+					err_list,
+					&pnn);
+		if (ret2 != 0) {
+			D_ERR("control DB_ATTACH failed for db %s"
+			      " on node %u, ret=%d\n",
+			      state->db_name,
+			      pnn,
+			      ret2);
+			node_list_ban_credits(state->nlist, pnn);
+		} else {
+			D_ERR("control DB_ATTACH failed for db %s, ret=%d\n",
+			      state->db_name,
+			      ret);
+		}
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	tevent_req_done(req);
+}
+
+/* Receive the result of db_create_missing_send(); returns false and
+ * sets *perr on failure */
+static bool db_create_missing_recv(struct tevent_req *req, int *perr)
+{
+	return generic_recv(req, perr);
+}
+
+/*
+ * Recovery database functions
+ */
+
+/* A temporary "recovery database" - a local scratch tdb into which the
+ * best copy of every record is collected during recovery */
+struct recdb_context {
+	uint32_t db_id;
+	const char *db_name;
+	const char *db_path;	/* path of the scratch tdb file */
+	struct tdb_wrap *db;
+	bool persistent;	/* true when the source db is persistent */
+};
+
+/*
+ * Create a fresh scratch tdb ("recdb.<name>") for collecting recovery
+ * records.  It is placed in CTDB_DBDIR_STATE if set, otherwise next
+ * to the original database file.  Returns NULL on failure.
+ */
+static struct recdb_context *recdb_create(TALLOC_CTX *mem_ctx, uint32_t db_id,
+					  const char *db_name,
+					  const char *db_path,
+					  uint32_t hash_size, bool persistent)
+{
+	/* Cached across calls; the environment does not change */
+	static char *db_dir_state = NULL;
+	struct recdb_context *recdb;
+	unsigned int tdb_flags;
+
+	recdb = talloc(mem_ctx, struct recdb_context);
+	if (recdb == NULL) {
+		return NULL;
+	}
+
+	if (db_dir_state == NULL) {
+		db_dir_state = getenv("CTDB_DBDIR_STATE");
+	}
+
+	recdb->db_name = db_name;
+	recdb->db_id = db_id;
+	/* NOTE(review): dirname() may modify its argument in place;
+	 * discard_const(db_path) relies on the caller's buffer being
+	 * writable - confirm */
+	recdb->db_path = talloc_asprintf(recdb, "%s/recdb.%s",
+					 db_dir_state != NULL ?
+					     db_dir_state :
+					     dirname(discard_const(db_path)),
+					 db_name);
+	if (recdb->db_path == NULL) {
+		talloc_free(recdb);
+		return NULL;
+	}
+	/* Remove any stale file left over from an earlier recovery */
+	unlink(recdb->db_path);
+
+	tdb_flags = TDB_NOLOCK | TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING;
+	recdb->db = tdb_wrap_open(mem_ctx, recdb->db_path, hash_size,
+				  tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (recdb->db == NULL) {
+		/* Log before freeing: db_path is a talloc child of
+		 * recdb, so the old order read freed memory */
+		D_ERR("failed to create recovery db %s\n", recdb->db_path);
+		talloc_free(recdb);
+		return NULL;
+	}
+
+	recdb->persistent = persistent;
+
+	return recdb;
+}
+
+/* Accessors for recdb_context internals */
+
+/* Database id of the database being recovered */
+static uint32_t recdb_id(struct recdb_context *recdb)
+{
+	return recdb->db_id;
+}
+
+/* Name of the database being recovered */
+static const char *recdb_name(struct recdb_context *recdb)
+{
+	return recdb->db_name;
+}
+
+/* Filesystem path of the scratch tdb */
+static const char *recdb_path(struct recdb_context *recdb)
+{
+	return recdb->db_path;
+}
+
+/* Underlying tdb handle of the scratch database */
+static struct tdb_context *recdb_tdb(struct recdb_context *recdb)
+{
+	return recdb->db->tdb;
+}
+
+/* Whether the source database is persistent */
+static bool recdb_persistent(struct recdb_context *recdb)
+{
+	return recdb->persistent;
+}
+
+/* Traverse state for recdb_add(): target recovery db plus this node's
+ * pnn, used to break ties between records with equal RSNs */
+struct recdb_add_traverse_state {
+	struct recdb_context *recdb;
+	uint32_t mypnn;
+};
+
+/*
+ * Per-record callback used by recdb_add(): store the record in the
+ * recovery db unless a previously stored copy should be preferred.
+ *
+ * The copy with the higher RSN wins.  On an RSN tie the stored copy
+ * is kept, except when its dmaster is this node, in which case the
+ * incoming copy replaces it.  Returns 0 on success/skip, -1 on error.
+ */
+static int recdb_add_traverse(uint32_t reqid, struct ctdb_ltdb_header *header,
+			      TDB_DATA key, TDB_DATA data,
+			      void *private_data)
+{
+	struct recdb_add_traverse_state *state =
+		(struct recdb_add_traverse_state *)private_data;
+	struct ctdb_ltdb_header *hdr;
+	TDB_DATA prev_data;
+	int ret;
+
+	/* header is not marshalled separately in the pulldb control */
+	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+		return -1;
+	}
+
+	hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+	/* fetch the existing record, if any */
+	prev_data = tdb_fetch(recdb_tdb(state->recdb), key);
+
+	if (prev_data.dptr != NULL) {
+		struct ctdb_ltdb_header prev_hdr;
+
+		prev_hdr = *(struct ctdb_ltdb_header *)prev_data.dptr;
+		free(prev_data.dptr);
+		/* Keep the stored copy if it is newer, or as new and
+		 * not previously mastered by this node */
+		if (hdr->rsn < prev_hdr.rsn ||
+		    (hdr->rsn == prev_hdr.rsn &&
+		     prev_hdr.dmaster != state->mypnn)) {
+			return 0;
+		}
+	}
+
+	ret = tdb_store(recdb_tdb(state->recdb), key, data, TDB_REPLACE);
+	if (ret != 0) {
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Merge the records in @recbuf into the recovery database, keeping
+ * only the preferred incarnation of each record (see
+ * recdb_add_traverse).  Returns false if the traverse fails.
+ */
+static bool recdb_add(struct recdb_context *recdb, int mypnn,
+		      struct ctdb_rec_buffer *recbuf)
+{
+	struct recdb_add_traverse_state state = {
+		.recdb = recdb,
+		.mypnn = mypnn,
+	};
+
+	return ctdb_rec_buffer_traverse(recbuf, recdb_add_traverse,
+					&state) == 0;
+}
+
+/* This function decides which records from recdb are retained */
+static int recbuf_filter_add(struct ctdb_rec_buffer *recbuf, bool persistent,
+ uint32_t reqid, uint32_t dmaster,
+ TDB_DATA key, TDB_DATA data)
+{
+ struct ctdb_ltdb_header *header;
+ int ret;
+
+ /* Skip empty records */
+ if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
+ return 0;
+ }
+
+ /* update the dmaster field to point to us */
+ header = (struct ctdb_ltdb_header *)data.dptr;
+ if (!persistent) {
+ header->dmaster = dmaster;
+ header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+ }
+
+ ret = ctdb_rec_buffer_add(recbuf, recbuf, reqid, NULL, key, data);
+ if (ret != 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Traverse state for recdb_file(): accumulates records into recbuf
+ * and flushes it to fd whenever it grows beyond max_size */
+struct recdb_file_traverse_state {
+	struct ctdb_rec_buffer *recbuf;	/* current in-progress buffer */
+	struct recdb_context *recdb;
+	TALLOC_CTX *mem_ctx;
+	uint32_t dmaster;	/* new dmaster stamped into each record */
+	uint32_t reqid;
+	bool persistent;
+	bool failed;		/* set when any step fails */
+	int fd;			/* destination file descriptor */
+	size_t max_size;	/* flush threshold, in bytes */
+	unsigned int num_buffers;	/* buffers written so far */
+};
+
+/*
+ * Per-record callback used by recdb_file(): filter the record into
+ * the current buffer and flush the buffer to the output fd once it
+ * exceeds max_size, starting a fresh one.  Sets state->failed and
+ * returns non-zero to abort the traverse on error.
+ */
+static int recdb_file_traverse(struct tdb_context *tdb,
+			       TDB_DATA key, TDB_DATA data,
+			       void *private_data)
+{
+	struct recdb_file_traverse_state *state =
+		(struct recdb_file_traverse_state *)private_data;
+	int ret;
+
+	ret = recbuf_filter_add(state->recbuf, state->persistent,
+				state->reqid, state->dmaster, key, data);
+	if (ret != 0) {
+		state->failed = true;
+		return ret;
+	}
+
+	/* Flush when the buffer has grown past the threshold */
+	if (ctdb_rec_buffer_len(state->recbuf) > state->max_size) {
+		ret = ctdb_rec_buffer_write(state->recbuf, state->fd);
+		if (ret != 0) {
+			D_ERR("Failed to collect recovery records for %s\n",
+			      recdb_name(state->recdb));
+			state->failed = true;
+			return ret;
+		}
+
+		state->num_buffers += 1;
+
+		/* Start a fresh buffer for the next batch */
+		TALLOC_FREE(state->recbuf);
+		state->recbuf = ctdb_rec_buffer_init(state->mem_ctx,
+						     recdb_id(state->recdb));
+		if (state->recbuf == NULL) {
+			state->failed = true;
+			return ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int recdb_file(struct recdb_context *recdb, TALLOC_CTX *mem_ctx,
+ uint32_t dmaster, int fd, int max_size)
+{
+ struct recdb_file_traverse_state state;
+ int ret;
+
+ state.recbuf = ctdb_rec_buffer_init(mem_ctx, recdb_id(recdb));
+ if (state.recbuf == NULL) {
+ return -1;
+ }
+ state.recdb = recdb;
+ state.mem_ctx = mem_ctx;
+ state.dmaster = dmaster;
+ state.reqid = 0;
+ state.persistent = recdb_persistent(recdb);
+ state.failed = false;
+ state.fd = fd;
+ state.max_size = max_size;
+ state.num_buffers = 0;
+
+ ret = tdb_traverse_read(recdb_tdb(recdb), recdb_file_traverse, &state);
+ if (ret == -1 || state.failed) {
+ TALLOC_FREE(state.recbuf);
+ return -1;
+ }
+
+ ret = ctdb_rec_buffer_write(state.recbuf, fd);
+ if (ret != 0) {
+ D_ERR("Failed to collect recovery records for %s\n",
+ recdb_name(recdb));
+ TALLOC_FREE(state.recbuf);
+ return -1;
+ }
+ state.num_buffers += 1;
+
+ D_DEBUG("Wrote %d buffers of recovery records for %s\n",
+ state.num_buffers, recdb_name(recdb));
+
+ return state.num_buffers;
+}
+
+/*
+ * Pull database from a single node
+ */
+
+struct pull_database_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ struct recdb_context *recdb;
+ uint32_t pnn;
+ uint64_t srvid;
+ unsigned int num_records;
+ int result;
+};
+
+static void pull_database_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data);
+static void pull_database_register_done(struct tevent_req *subreq);
+static void pull_database_unregister_done(struct tevent_req *subreq);
+static void pull_database_done(struct tevent_req *subreq);
+
/*
 * Start pulling database records from node @pnn into @recdb.
 * A fresh srvid is allocated and a message handler is registered on it
 * *before* the DB_PULL control is sent, so no record buffers are lost.
 */
static struct tevent_req *pull_database_send(
			TALLOC_CTX *mem_ctx,
			struct tevent_context *ev,
			struct ctdb_client_context *client,
			uint32_t pnn,
			struct recdb_context *recdb)
{
	struct tevent_req *req, *subreq;
	struct pull_database_state *state;

	req = tevent_req_create(mem_ctx, &state, struct pull_database_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->client = client;
	state->recdb = recdb;
	state->pnn = pnn;
	state->srvid = srvid_next();

	/* Register the handler first; DB_PULL is sent from the callback */
	subreq = ctdb_client_set_message_handler_send(
					state, state->ev, state->client,
					state->srvid, pull_database_handler,
					req);
	if (tevent_req_nomem(subreq, req)) {
		return tevent_req_post(req, ev);
	}

	tevent_req_set_callback(subreq, pull_database_register_done, req);

	return req;
}
+
+static void pull_database_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct tevent_req *req = talloc_get_type_abort(
+ private_data, struct tevent_req);
+ struct pull_database_state *state = tevent_req_data(
+ req, struct pull_database_state);
+ struct ctdb_rec_buffer *recbuf;
+ size_t np;
+ int ret;
+ bool status;
+
+ if (srvid != state->srvid) {
+ return;
+ }
+
+ ret = ctdb_rec_buffer_pull(data.dptr, data.dsize, state, &recbuf, &np);
+ if (ret != 0) {
+ D_ERR("Invalid data received for DB_PULL messages\n");
+ return;
+ }
+
+ if (recbuf->db_id != recdb_id(state->recdb)) {
+ talloc_free(recbuf);
+ D_ERR("Invalid dbid:%08x for DB_PULL messages for %s\n",
+ recbuf->db_id, recdb_name(state->recdb));
+ return;
+ }
+
+ status = recdb_add(state->recdb, ctdb_client_pnn(state->client),
+ recbuf);
+ if (! status) {
+ talloc_free(recbuf);
+ D_ERR("Failed to add records to recdb for %s\n",
+ recdb_name(state->recdb));
+ return;
+ }
+
+ state->num_records += recbuf->count;
+ talloc_free(recbuf);
+}
+
/*
 * Message handler registration completed: now send the DB_PULL control
 * to the remote node, directing record buffers to our srvid.
 */
static void pull_database_register_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct pull_database_state *state = tevent_req_data(
		req, struct pull_database_state);
	struct ctdb_req_control request;
	struct ctdb_pulldb_ext pulldb_ext;
	int ret;
	bool status;

	status = ctdb_client_set_message_handler_recv(subreq, &ret);
	TALLOC_FREE(subreq);
	if (! status) {
		D_ERR("Failed to set message handler for DB_PULL for %s\n",
		      recdb_name(state->recdb));
		tevent_req_error(req, ret);
		return;
	}

	/* LMASTER_ANY: pull every record regardless of its lmaster */
	pulldb_ext.db_id = recdb_id(state->recdb);
	pulldb_ext.lmaster = CTDB_LMASTER_ANY;
	pulldb_ext.srvid = state->srvid;

	ctdb_req_control_db_pull(&request, &pulldb_ext);
	subreq = ctdb_client_control_send(state, state->ev, state->client,
					  state->pnn, TIMEOUT(), &request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, pull_database_done, req);
}
+
+static void pull_database_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct pull_database_state *state = tevent_req_data(
+ req, struct pull_database_state);
+ struct ctdb_reply_control *reply;
+ uint32_t num_records;
+ int ret;
+ bool status;
+
+ status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ D_ERR("control DB_PULL failed for %s on node %u, ret=%d\n",
+ recdb_name(state->recdb), state->pnn, ret);
+ state->result = ret;
+ goto unregister;
+ }
+
+ ret = ctdb_reply_control_db_pull(reply, &num_records);
+ talloc_free(reply);
+ if (num_records != state->num_records) {
+ D_ERR("mismatch (%u != %u) in DB_PULL records for db %s\n",
+ num_records, state->num_records,
+ recdb_name(state->recdb));
+ state->result = EIO;
+ goto unregister;
+ }
+
+ D_INFO("Pulled %d records for db %s from node %d\n",
+ state->num_records, recdb_name(state->recdb), state->pnn);
+
+unregister:
+
+ subreq = ctdb_client_remove_message_handler_send(
+ state, state->ev, state->client,
+ state->srvid, req);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, pull_database_unregister_done, req);
+}
+
+static void pull_database_unregister_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct pull_database_state *state = tevent_req_data(
+ req, struct pull_database_state);
+ int ret;
+ bool status;
+
+ status = ctdb_client_remove_message_handler_recv(subreq, &ret);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ D_ERR("failed to remove message handler for DB_PULL for db %s\n",
+ recdb_name(state->recdb));
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ if (state->result != 0) {
+ tevent_req_error(req, state->result);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+
+static bool pull_database_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Push database to specified nodes (new style)
+ */
+
+struct push_database_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ struct recdb_context *recdb;
+ uint32_t *pnn_list;
+ unsigned int count;
+ uint64_t srvid;
+ uint32_t dmaster;
+ int fd;
+ int num_buffers;
+ int num_buffers_sent;
+ unsigned int num_records;
+};
+
+static void push_database_started(struct tevent_req *subreq);
+static void push_database_send_msg(struct tevent_req *req);
+static void push_database_send_done(struct tevent_req *subreq);
+static void push_database_confirmed(struct tevent_req *subreq);
+
/*
 * Serialise the recovery database into a temporary file as a series of
 * record buffers (each at most ~max_size bytes), then ask all target
 * nodes to start a DB_PUSH.  The buffers themselves are streamed out
 * by push_database_send_msg().
 */
static struct tevent_req *push_database_send(
			TALLOC_CTX *mem_ctx,
			struct tevent_context *ev,
			struct ctdb_client_context *client,
			uint32_t *pnn_list,
			unsigned int count,
			struct recdb_context *recdb,
			int max_size)
{
	struct tevent_req *req, *subreq;
	struct push_database_state *state;
	struct ctdb_req_control request;
	struct ctdb_pulldb_ext pulldb_ext;
	char *filename;
	off_t offset;

	req = tevent_req_create(mem_ctx, &state,
				struct push_database_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->client = client;
	state->recdb = recdb;
	state->pnn_list = pnn_list;
	state->count = count;

	state->srvid = srvid_next();
	state->dmaster = ctdb_client_pnn(client);
	state->num_buffers_sent = 0;
	state->num_records = 0;

	filename = talloc_asprintf(state, "%s.dat", recdb_path(recdb));
	if (tevent_req_nomem(filename, req)) {
		return tevent_req_post(req, ev);
	}

	state->fd = open(filename, O_RDWR|O_CREAT, 0644);
	if (state->fd == -1) {
		tevent_req_error(req, errno);
		return tevent_req_post(req, ev);
	}
	/* Unlink immediately; the open fd keeps the file alive.
	 * NOTE(review): state->fd is not closed on the error paths
	 * below - presumably released with state; confirm a destructor
	 * or later close exists. */
	unlink(filename);
	talloc_free(filename);

	state->num_buffers = recdb_file(recdb, state, state->dmaster,
					state->fd, max_size);
	if (state->num_buffers == -1) {
		tevent_req_error(req, ENOMEM);
		return tevent_req_post(req, ev);
	}

	/* Rewind so the buffers can be read back for sending */
	offset = lseek(state->fd, 0, SEEK_SET);
	if (offset != 0) {
		tevent_req_error(req, EIO);
		return tevent_req_post(req, ev);
	}

	pulldb_ext.db_id = recdb_id(recdb);
	pulldb_ext.srvid = state->srvid;

	ctdb_req_control_db_push_start(&request, &pulldb_ext);
	subreq = ctdb_client_control_multi_send(state, ev, client,
						pnn_list, count,
						TIMEOUT(), &request);
	if (tevent_req_nomem(subreq, req)) {
		return tevent_req_post(req, ev);
	}
	tevent_req_set_callback(subreq, push_database_started, req);

	return req;
}
+
/*
 * DB_PUSH_START completed on all target nodes; begin streaming the
 * record buffers.
 */
static void push_database_started(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct push_database_state *state = tevent_req_data(
		req, struct push_database_state);
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, state,
						&err_list, NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		/* Try to pin the failure to a specific node for logging */
		ret2 = ctdb_client_control_multi_error(state->pnn_list,
						       state->count,
						       err_list, &pnn);
		if (ret2 != 0) {
			D_ERR("control DB_PUSH_START failed for db %s"
			      " on node %u, ret=%d\n",
			      recdb_name(state->recdb), pnn, ret2);
		} else {
			D_ERR("control DB_PUSH_START failed for db %s,"
			      " ret=%d\n",
			      recdb_name(state->recdb), ret);
		}
		talloc_free(err_list);

		tevent_req_error(req, ret);
		return;
	}

	push_database_send_msg(req);
}
+
/*
 * Send the next record buffer from the temp file to all target nodes
 * as a message on state->srvid.  Once every buffer has been sent, ask
 * the nodes to confirm the push with DB_PUSH_CONFIRM instead.
 */
static void push_database_send_msg(struct tevent_req *req)
{
	struct push_database_state *state = tevent_req_data(
		req, struct push_database_state);
	struct tevent_req *subreq;
	struct ctdb_rec_buffer *recbuf;
	struct ctdb_req_message message;
	TDB_DATA data;
	size_t np;
	int ret;

	/* All buffers sent: move on to the confirmation round */
	if (state->num_buffers_sent == state->num_buffers) {
		struct ctdb_req_control request;

		ctdb_req_control_db_push_confirm(&request,
						 recdb_id(state->recdb));
		subreq = ctdb_client_control_multi_send(state, state->ev,
							state->client,
							state->pnn_list,
							state->count,
							TIMEOUT(), &request);
		if (tevent_req_nomem(subreq, req)) {
			return;
		}
		tevent_req_set_callback(subreq, push_database_confirmed, req);
		return;
	}

	/* Read the next serialised buffer back from the temp file */
	ret = ctdb_rec_buffer_read(state->fd, state, &recbuf);
	if (ret != 0) {
		tevent_req_error(req, ret);
		return;
	}

	data.dsize = ctdb_rec_buffer_len(recbuf);
	data.dptr = talloc_size(state, data.dsize);
	if (tevent_req_nomem(data.dptr, req)) {
		return;
	}

	ctdb_rec_buffer_push(recbuf, data.dptr, &np);

	message.srvid = state->srvid;
	message.data.data = data;

	D_DEBUG("Pushing buffer %d with %d records for db %s\n",
		state->num_buffers_sent, recbuf->count,
		recdb_name(state->recdb));

	subreq = ctdb_client_message_multi_send(state, state->ev,
						state->client,
						state->pnn_list, state->count,
						&message);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, push_database_send_done, req);

	state->num_records += recbuf->count;

	/* The message has been marshalled; the working copies can go */
	talloc_free(data.dptr);
	talloc_free(recbuf);
}
+
+static void push_database_send_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct push_database_state *state = tevent_req_data(
+ req, struct push_database_state);
+ bool status;
+ int ret;
+
+ status = ctdb_client_message_multi_recv(subreq, &ret, NULL, NULL);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ D_ERR("Sending recovery records failed for %s\n",
+ recdb_name(state->recdb));
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ state->num_buffers_sent += 1;
+
+ push_database_send_msg(req);
+}
+
/*
 * DB_PUSH_CONFIRM completed: verify that every node received exactly
 * the number of records we sent; any mismatch fails the push.
 */
static void push_database_confirmed(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct push_database_state *state = tevent_req_data(
		req, struct push_database_state);
	struct ctdb_reply_control **reply;
	int *err_list;
	bool status;
	unsigned int i;
	int ret;
	uint32_t num_records;

	status = ctdb_client_control_multi_recv(subreq, &ret, state,
						&err_list, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->pnn_list,
						       state->count, err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control DB_PUSH_CONFIRM failed for db %s"
			      " on node %u, ret=%d\n",
			      recdb_name(state->recdb), pnn, ret2);
		} else {
			D_ERR("control DB_PUSH_CONFIRM failed for db %s,"
			      " ret=%d\n",
			      recdb_name(state->recdb), ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	/* Per-node record-count check: reply[i] matches pnn_list[i] */
	for (i=0; i<state->count; i++) {
		ret = ctdb_reply_control_db_push_confirm(reply[i],
							 &num_records);
		if (ret != 0) {
			tevent_req_error(req, EPROTO);
			return;
		}

		if (num_records != state->num_records) {
			D_ERR("Node %u received %d of %d records for %s\n",
			      state->pnn_list[i], num_records,
			      state->num_records, recdb_name(state->recdb));
			tevent_req_error(req, EPROTO);
			return;
		}
	}

	talloc_free(reply);

	D_INFO("Pushed %d records for db %s\n",
	       state->num_records, recdb_name(state->recdb));

	tevent_req_done(req);
}
+
+static bool push_database_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Collect databases using highest sequence number
+ */
+
+struct collect_highseqnum_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ struct node_list *nlist;
+ uint32_t db_id;
+ struct recdb_context *recdb;
+
+ uint32_t max_pnn;
+};
+
+static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq);
+static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq);
+
+static struct tevent_req *collect_highseqnum_db_send(
+ TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ struct node_list *nlist,
+ uint32_t db_id,
+ struct recdb_context *recdb)
+{
+ struct tevent_req *req, *subreq;
+ struct collect_highseqnum_db_state *state;
+ struct ctdb_req_control request;
+
+ req = tevent_req_create(mem_ctx, &state,
+ struct collect_highseqnum_db_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->ev = ev;
+ state->client = client;
+ state->nlist = nlist;
+ state->db_id = db_id;
+ state->recdb = recdb;
+
+ ctdb_req_control_get_db_seqnum(&request, db_id);
+ subreq = ctdb_client_control_multi_send(mem_ctx,
+ ev,
+ client,
+ nlist->pnn_list,
+ nlist->count,
+ TIMEOUT(),
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, collect_highseqnum_db_seqnum_done,
+ req);
+
+ return req;
+}
+
/*
 * GET_DB_SEQNUM replies arrived: pick the node with the highest
 * sequence number and pull the database from it.
 */
static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct collect_highseqnum_db_state *state = tevent_req_data(
		req, struct collect_highseqnum_db_state);
	struct ctdb_reply_control **reply;
	int *err_list;
	bool status;
	unsigned int i;
	int ret;
	uint64_t seqnum, max_seqnum;

	status = ctdb_client_control_multi_recv(subreq, &ret, state,
						&err_list, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control GET_DB_SEQNUM failed for db %s"
			      " on node %u, ret=%d\n",
			      recdb_name(state->recdb), pnn, ret2);
		} else {
			D_ERR("control GET_DB_SEQNUM failed for db %s,"
			      " ret=%d\n",
			      recdb_name(state->recdb), ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	/* reply[i] corresponds to nlist->pnn_list[i] */
	max_seqnum = 0;
	state->max_pnn = state->nlist->pnn_list[0];
	for (i=0; i<state->nlist->count; i++) {
		ret = ctdb_reply_control_get_db_seqnum(reply[i], &seqnum);
		if (ret != 0) {
			tevent_req_error(req, EPROTO);
			return;
		}

		if (max_seqnum < seqnum) {
			max_seqnum = seqnum;
			state->max_pnn = state->nlist->pnn_list[i];
		}
	}

	talloc_free(reply);

	D_INFO("Pull persistent db %s from node %d with seqnum 0x%"PRIx64"\n",
	       recdb_name(state->recdb), state->max_pnn, max_seqnum);

	subreq = pull_database_send(state,
				    state->ev,
				    state->client,
				    state->max_pnn,
				    state->recdb);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, collect_highseqnum_db_pulldb_done,
				req);
}
+
+static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct collect_highseqnum_db_state *state = tevent_req_data(
+ req, struct collect_highseqnum_db_state);
+ int ret;
+ bool status;
+
+ status = pull_database_recv(subreq, &ret);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ node_list_ban_credits(state->nlist, state->max_pnn);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+
+static bool collect_highseqnum_db_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Collect all databases
+ */
+
+struct collect_all_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ struct node_list *nlist;
+ uint32_t db_id;
+ struct recdb_context *recdb;
+
+ struct ctdb_pulldb pulldb;
+ unsigned int index;
+};
+
+static void collect_all_db_pulldb_done(struct tevent_req *subreq);
+
+static struct tevent_req *collect_all_db_send(
+ TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ struct node_list *nlist,
+ uint32_t db_id,
+ struct recdb_context *recdb)
+{
+ struct tevent_req *req, *subreq;
+ struct collect_all_db_state *state;
+
+ req = tevent_req_create(mem_ctx, &state,
+ struct collect_all_db_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->ev = ev;
+ state->client = client;
+ state->nlist = nlist;
+ state->db_id = db_id;
+ state->recdb = recdb;
+ state->index = 0;
+
+ subreq = pull_database_send(state,
+ ev,
+ client,
+ nlist->pnn_list[state->index],
+ recdb);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req);
+
+ return req;
+}
+
+static void collect_all_db_pulldb_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct collect_all_db_state *state = tevent_req_data(
+ req, struct collect_all_db_state);
+ int ret;
+ bool status;
+
+ status = pull_database_recv(subreq, &ret);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ node_list_ban_credits(state->nlist,
+ state->nlist->pnn_list[state->index]);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ state->index += 1;
+ if (state->index == state->nlist->count) {
+ tevent_req_done(req);
+ return;
+ }
+
+ subreq = pull_database_send(state,
+ state->ev,
+ state->client,
+ state->nlist->pnn_list[state->index],
+ state->recdb);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req);
+}
+
+static bool collect_all_db_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+
+/**
+ * For each database do the following:
+ * - Get DB name from all nodes
+ * - Attach database on missing nodes
+ * - Get DB path
+ * - Freeze database on all nodes
+ * - Start transaction on all nodes
+ * - Collect database from all nodes
+ * - Wipe database on all nodes
+ * - Push database to all nodes
+ * - Commit transaction on all nodes
+ * - Thaw database on all nodes
+ */
+
+struct recover_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ struct ctdb_tunable_list *tun_list;
+ struct node_list *nlist;
+ struct db *db;
+
+ uint32_t destnode;
+ struct ctdb_transdb transdb;
+
+ const char *db_name, *db_path;
+ struct recdb_context *recdb;
+};
+
+static void recover_db_name_done(struct tevent_req *subreq);
+static void recover_db_create_missing_done(struct tevent_req *subreq);
+static void recover_db_path_done(struct tevent_req *subreq);
+static void recover_db_freeze_done(struct tevent_req *subreq);
+static void recover_db_transaction_started(struct tevent_req *subreq);
+static void recover_db_collect_done(struct tevent_req *subreq);
+static void recover_db_wipedb_done(struct tevent_req *subreq);
+static void recover_db_pushdb_done(struct tevent_req *subreq);
+static void recover_db_transaction_committed(struct tevent_req *subreq);
+static void recover_db_thaw_done(struct tevent_req *subreq);
+
/*
 * Begin recovery of one database: first fetch its name from all nodes
 * that currently have it attached (db->pnn_list).
 */
static struct tevent_req *recover_db_send(TALLOC_CTX *mem_ctx,
					  struct tevent_context *ev,
					  struct ctdb_client_context *client,
					  struct ctdb_tunable_list *tun_list,
					  struct node_list *nlist,
					  uint32_t generation,
					  struct db *db)
{
	struct tevent_req *req, *subreq;
	struct recover_db_state *state;
	struct ctdb_req_control request;

	req = tevent_req_create(mem_ctx, &state, struct recover_db_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->client = client;
	state->tun_list = tun_list;
	state->nlist = nlist;
	state->db = db;

	state->destnode = ctdb_client_pnn(client);
	/* The recovery generation doubles as the transaction id */
	state->transdb.db_id = db->db_id;
	state->transdb.tid = generation;

	ctdb_req_control_get_dbname(&request, db->db_id);
	subreq = ctdb_client_control_multi_send(state,
						ev,
						client,
						state->db->pnn_list,
						state->db->num_nodes,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return tevent_req_post(req, ev);
	}
	tevent_req_set_callback(subreq, recover_db_name_done, req);

	return req;
}
+
+static void recover_db_name_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct recover_db_state *state = tevent_req_data(
+ req, struct recover_db_state);
+ struct ctdb_reply_control **reply;
+ int *err_list;
+ unsigned int i;
+ int ret;
+ bool status;
+
+ status = ctdb_client_control_multi_recv(subreq,
+ &ret,
+ state,
+ &err_list,
+ &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ int ret2;
+ uint32_t pnn;
+
+ ret2 = ctdb_client_control_multi_error(state->db->pnn_list,
+ state->db->num_nodes,
+ err_list,
+ &pnn);
+ if (ret2 != 0) {
+ D_ERR("control GET_DBNAME failed on node %u,"
+ " ret=%d\n",
+ pnn,
+ ret2);
+ } else {
+ D_ERR("control GET_DBNAME failed, ret=%d\n",
+ ret);
+ }
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ for (i = 0; i < state->db->num_nodes; i++) {
+ const char *db_name;
+ uint32_t pnn;
+
+ pnn = state->nlist->pnn_list[i];
+
+ ret = ctdb_reply_control_get_dbname(reply[i],
+ state,
+ &db_name);
+ if (ret != 0) {
+ D_ERR("control GET_DBNAME failed on node %u "
+ "for db=0x%x, ret=%d\n",
+ pnn,
+ state->db->db_id,
+ ret);
+ tevent_req_error(req, EPROTO);
+ return;
+ }
+
+ if (state->db_name == NULL) {
+ state->db_name = db_name;
+ continue;
+ }
+
+ if (strcmp(state->db_name, db_name) != 0) {
+ D_ERR("Incompatible database name for 0x%"PRIx32" "
+ "(%s != %s) on node %"PRIu32"\n",
+ state->db->db_id,
+ db_name,
+ state->db_name,
+ pnn);
+ node_list_ban_credits(state->nlist, pnn);
+ tevent_req_error(req, ret);
+ return;
+ }
+ }
+
+ talloc_free(reply);
+
+ subreq = db_create_missing_send(state,
+ state->ev,
+ state->client,
+ state->nlist,
+ state->db_name,
+ state->db);
+
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, recover_db_create_missing_done, req);
+}
+
+static void recover_db_create_missing_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct recover_db_state *state = tevent_req_data(
+ req, struct recover_db_state);
+ struct ctdb_req_control request;
+ int ret;
+ bool status;
+
+ /* Could sanity check the db_id here */
+ status = db_create_missing_recv(subreq, &ret);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ ctdb_req_control_getdbpath(&request, state->db->db_id);
+ subreq = ctdb_client_control_send(state, state->ev, state->client,
+ state->destnode, TIMEOUT(),
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, recover_db_path_done, req);
+}
+
/*
 * GETDBPATH finished: remember the local database path (used later to
 * create the scratch recdb), then freeze the database on all nodes.
 */
static void recover_db_path_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	struct ctdb_reply_control *reply;
	struct ctdb_req_control request;
	int ret;
	bool status;

	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		D_ERR("control GETDBPATH failed for db %s, ret=%d\n",
		      state->db_name, ret);
		tevent_req_error(req, ret);
		return;
	}

	ret = ctdb_reply_control_getdbpath(reply, state, &state->db_path);
	if (ret != 0) {
		D_ERR("control GETDBPATH failed for db %s, ret=%d\n",
		      state->db_name, ret);
		tevent_req_error(req, EPROTO);
		return;
	}

	talloc_free(reply);

	ctdb_req_control_db_freeze(&request, state->db->db_id);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_freeze_done, req);
}
+
/*
 * DB_FREEZE completed on all nodes; start a recovery transaction on
 * all nodes.  A node that fails to freeze earns a ban credit.
 */
static void recover_db_freeze_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	struct ctdb_req_control request;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control FREEZE_DB failed for db %s"
			      " on node %u, ret=%d\n",
			      state->db_name, pnn, ret2);

			node_list_ban_credits(state->nlist, pnn);
		} else {
			D_ERR("control FREEZE_DB failed for db %s, ret=%d\n",
			      state->db_name, ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	ctdb_req_control_db_transaction_start(&request, &state->transdb);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_transaction_started, req);
}
+
/*
 * Transactions started everywhere: create the scratch recovery db and
 * collect records.  Persistent/replicated databases are pulled only
 * from the node with the highest sequence number; volatile databases
 * are pulled from every node.
 */
static void recover_db_transaction_started(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	int *err_list;
	uint32_t flags;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control TRANSACTION_DB failed for db=%s"
			      " on node %u, ret=%d\n",
			      state->db_name, pnn, ret2);
		} else {
			D_ERR("control TRANSACTION_DB failed for db=%s,"
			      " ret=%d\n", state->db_name, ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	flags = state->db->db_flags;
	state->recdb = recdb_create(state,
				    state->db->db_id,
				    state->db_name,
				    state->db_path,
				    state->tun_list->database_hash_size,
				    flags & CTDB_DB_FLAGS_PERSISTENT);
	if (tevent_req_nomem(state->recdb, req)) {
		return;
	}

	if ((flags & CTDB_DB_FLAGS_PERSISTENT) ||
	    (flags & CTDB_DB_FLAGS_REPLICATED)) {
		subreq = collect_highseqnum_db_send(state,
						    state->ev,
						    state->client,
						    state->nlist,
						    state->db->db_id,
						    state->recdb);
	} else {
		subreq = collect_all_db_send(state,
					     state->ev,
					     state->client,
					     state->nlist,
					     state->db->db_id,
					     state->recdb);
	}
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_collect_done, req);
}
+
/*
 * Record collection finished (via whichever collector matches the db
 * flags); now wipe the database on all nodes before pushing the
 * collected records back out.
 */
static void recover_db_collect_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	struct ctdb_req_control request;
	int ret;
	bool status;

	/* Must match the collector chosen in recover_db_transaction_started */
	if ((state->db->db_flags & CTDB_DB_FLAGS_PERSISTENT) ||
	    (state->db->db_flags & CTDB_DB_FLAGS_REPLICATED)) {
		status = collect_highseqnum_db_recv(subreq, &ret);
	} else {
		status = collect_all_db_recv(subreq, &ret);
	}
	TALLOC_FREE(subreq);
	if (! status) {
		tevent_req_error(req, ret);
		return;
	}

	ctdb_req_control_wipe_database(&request, &state->transdb);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_wipedb_done, req);
}
+
/*
 * WIPEDB completed on all nodes; push the collected records back out
 * to every node.
 */
static void recover_db_wipedb_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control WIPEDB failed for db %s on node %u,"
			      " ret=%d\n", state->db_name, pnn, ret2);
		} else {
			D_ERR("control WIPEDB failed for db %s, ret=%d\n",
			      state->db_name, ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	subreq = push_database_send(state,
				    state->ev,
				    state->client,
				    state->nlist->pnn_list,
				    state->nlist->count,
				    state->recdb,
				    state->tun_list->rec_buffer_size_limit);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_pushdb_done, req);
}
+
/*
 * Push completed: the scratch recdb is no longer needed, commit the
 * recovery transaction on all nodes.
 */
static void recover_db_pushdb_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	struct ctdb_req_control request;
	int ret;
	bool status;

	status = push_database_recv(subreq, &ret);
	TALLOC_FREE(subreq);
	if (! status) {
		tevent_req_error(req, ret);
		return;
	}

	/* Free the scratch database early; it can be large */
	TALLOC_FREE(state->recdb);

	ctdb_req_control_db_transaction_commit(&request, &state->transdb);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_transaction_committed, req);
}
+
/*
 * Transaction committed on all nodes; thaw the database everywhere to
 * resume normal operation.
 */
static void recover_db_transaction_committed(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	struct ctdb_req_control request;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control DB_TRANSACTION_COMMIT failed for db %s"
			      " on node %u, ret=%d\n",
			      state->db_name, pnn, ret2);
		} else {
			D_ERR("control DB_TRANSACTION_COMMIT failed for db %s,"
			      " ret=%d\n", state->db_name, ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	ctdb_req_control_db_thaw(&request, state->db->db_id);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recover_db_thaw_done, req);
}
+
/*
 * The database has been thawed on all nodes - the recovery of this
 * single database is complete.
 */
static void recover_db_thaw_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recover_db_state *state = tevent_req_data(
		req, struct recover_db_state);
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control DB_THAW failed for db %s on node %u,"
			      " ret=%d\n", state->db_name, pnn, ret2);
		} else {
			D_ERR("control DB_THAW failed for db %s, ret=%d\n",
			      state->db_name, ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	tevent_req_done(req);
}
+
+static bool recover_db_recv(struct tevent_req *req)
+{
+ return generic_recv(req, NULL);
+}
+
+
+/*
+ * Start database recovery for each database
+ *
+ * Try to recover each database 5 times before failing recovery.
+ */
+
/* Aggregate state while recovering all databases in parallel. */
struct db_recovery_state {
	struct tevent_context *ev;	/* event context for async requests */
	struct db_list *dblist;		/* databases being recovered */
	unsigned int num_replies;	/* databases finished (either way) */
	unsigned int num_failed;	/* databases that exhausted retries */
};
+
/* Per-database state; each database is retried up to NUM_RETRIES times. */
struct db_recovery_one_state {
	struct tevent_req *req;		/* parent db_recovery request */
	struct ctdb_client_context *client;
	struct db_list *dblist;
	struct ctdb_tunable_list *tun_list;
	struct node_list *nlist;
	uint32_t generation;		/* recovery generation number */
	struct db *db;			/* database handled by this substate */
	int num_fails;			/* failed attempts so far */
};
+
+static void db_recovery_one_done(struct tevent_req *subreq);
+
/*
 * Kick off recovery of every database in dblist in parallel.
 *
 * One recover_db subrequest is started per database; completion is
 * counted in db_recovery_one_done().  Completes immediately if there
 * are no databases.
 */
static struct tevent_req *db_recovery_send(TALLOC_CTX *mem_ctx,
					   struct tevent_context *ev,
					   struct ctdb_client_context *client,
					   struct db_list *dblist,
					   struct ctdb_tunable_list *tun_list,
					   struct node_list *nlist,
					   uint32_t generation)
{
	struct tevent_req *req, *subreq;
	struct db_recovery_state *state;
	struct db *db;

	req = tevent_req_create(mem_ctx, &state, struct db_recovery_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->dblist = dblist;
	state->num_replies = 0;
	state->num_failed = 0;

	/* Nothing to recover - succeed immediately */
	if (dblist->num_dbs == 0) {
		tevent_req_done(req);
		return tevent_req_post(req, ev);
	}

	for (db = dblist->db; db != NULL; db = db->next) {
		struct db_recovery_one_state *substate;

		substate = talloc_zero(state, struct db_recovery_one_state);
		if (tevent_req_nomem(substate, req)) {
			return tevent_req_post(req, ev);
		}

		substate->req = req;
		substate->client = client;
		substate->dblist = dblist;
		substate->tun_list = tun_list;
		substate->nlist = nlist;
		substate->generation = generation;
		substate->db = db;

		subreq = recover_db_send(state,
					 ev,
					 client,
					 tun_list,
					 nlist,
					 generation,
					 substate->db);
		if (tevent_req_nomem(subreq, req)) {
			return tevent_req_post(req, ev);
		}
		tevent_req_set_callback(subreq, db_recovery_one_done,
					substate);
		D_NOTICE("recover database 0x%08x\n", substate->db->db_id);
	}

	return req;
}
+
/*
 * One database recovery attempt has finished.
 *
 * On success, count the database as done.  On failure, retry the same
 * database until NUM_RETRIES attempts have been made, then count it as
 * failed.  When every database has reported, complete the parent
 * request.
 */
static void db_recovery_one_done(struct tevent_req *subreq)
{
	struct db_recovery_one_state *substate = tevent_req_callback_data(
		subreq, struct db_recovery_one_state);
	struct tevent_req *req = substate->req;
	struct db_recovery_state *state = tevent_req_data(
		req, struct db_recovery_state);
	bool status;

	status = recover_db_recv(subreq);
	TALLOC_FREE(subreq);

	if (status) {
		talloc_free(substate);
		goto done;
	}

	substate->num_fails += 1;
	if (substate->num_fails < NUM_RETRIES) {
		/* Retry this database with a fresh subrequest */
		subreq = recover_db_send(state,
					 state->ev,
					 substate->client,
					 substate->tun_list,
					 substate->nlist,
					 substate->generation,
					 substate->db);
		if (tevent_req_nomem(subreq, req)) {
			/* NOTE(review): nomem marks req as errored and we
			 * still fall through to the done accounting below -
			 * presumably tevent tolerates this here; confirm */
			goto failed;
		}
		tevent_req_set_callback(subreq, db_recovery_one_done, substate);
		D_NOTICE("recover database 0x%08x, attempt %d\n",
			 substate->db->db_id, substate->num_fails+1);
		return;
	}

failed:
	state->num_failed += 1;

done:
	state->num_replies += 1;

	/* All databases have reported - finish the parent request */
	if (state->num_replies == state->dblist->num_dbs) {
		tevent_req_done(req);
	}
}
+
+static bool db_recovery_recv(struct tevent_req *req, unsigned int *count)
+{
+ struct db_recovery_state *state = tevent_req_data(
+ req, struct db_recovery_state);
+ int err;
+
+ if (tevent_req_is_unix_error(req, &err)) {
+ *count = 0;
+ return false;
+ }
+
+ *count = state->num_replies - state->num_failed;
+
+ if (state->num_failed > 0) {
+ return false;
+ }
+
+ return true;
+}
+
/* State for selecting and banning the most misbehaving node. */
struct ban_node_state {
	struct tevent_context *ev;
	struct ctdb_client_context *client;
	struct ctdb_tunable_list *tun_list;
	struct node_list *nlist;
	uint32_t destnode;		/* this node's PNN */

	uint32_t max_pnn;		/* candidate node with most ban credits */
};
+
+static bool ban_node_check(struct tevent_req *req);
+static void ban_node_check_done(struct tevent_req *subreq);
+static void ban_node_done(struct tevent_req *subreq);
+
/*
 * Start the ban process: find the node with the most ban credits and,
 * if bans are enabled, verify its current state before banning it.
 *
 * Completes immediately (successfully) if bans are disabled or no node
 * has accumulated enough credits.
 */
static struct tevent_req *ban_node_send(TALLOC_CTX *mem_ctx,
					struct tevent_context *ev,
					struct ctdb_client_context *client,
					struct ctdb_tunable_list *tun_list,
					struct node_list *nlist)
{
	struct tevent_req *req;
	struct ban_node_state *state;
	bool ok;

	req = tevent_req_create(mem_ctx, &state, struct ban_node_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->client = client;
	state->tun_list = tun_list;
	state->nlist = nlist;
	state->destnode = ctdb_client_pnn(client);

	/* Bans are not enabled */
	if (state->tun_list->enable_bans == 0) {
		D_ERR("Bans are not enabled\n");
		tevent_req_done(req);
		return tevent_req_post(req, ev);
	}

	/* ban_node_check() either finishes the request or queues a
	 * GET_NODEMAP to verify the candidate node */
	ok = ban_node_check(req);
	if (!ok) {
		return tevent_req_post(req, ev);
	}

	return req;
}
+
/*
 * Pick the node with the highest ban-credit count and, if it has
 * enough credits, query its nodemap to confirm it is still active.
 *
 * Returns false when the request has been finished (either done or
 * nomem), true when an async GET_NODEMAP is in flight.
 */
static bool ban_node_check(struct tevent_req *req)
{
	struct tevent_req *subreq;
	struct ban_node_state *state = tevent_req_data(
		req, struct ban_node_state);
	struct ctdb_req_control request;
	unsigned max_credits = 0, i;

	for (i=0; i<state->nlist->count; i++) {
		if (state->nlist->ban_credits[i] > max_credits) {
			state->max_pnn = state->nlist->pnn_list[i];
			max_credits = state->nlist->ban_credits[i];
		}
	}

	/* Nobody has misbehaved enough to be banned */
	if (max_credits < NUM_RETRIES) {
		tevent_req_done(req);
		return false;
	}

	/* Ask the candidate node for its own view of the nodemap */
	ctdb_req_control_get_nodemap(&request);
	subreq = ctdb_client_control_send(state,
					  state->ev,
					  state->client,
					  state->max_pnn,
					  TIMEOUT(),
					  &request);
	if (tevent_req_nomem(subreq, req)) {
		return false;
	}
	tevent_req_set_callback(subreq, ban_node_check_done, req);

	return true;
}
+
/*
 * Received the candidate node's nodemap.
 *
 * If the node has already become inactive, clear its ban credits and
 * re-run the candidate selection.  Otherwise send SET_BAN_STATE to ban
 * it for the configured recovery ban period.
 */
static void ban_node_check_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct ban_node_state *state = tevent_req_data(
		req, struct ban_node_state);
	struct ctdb_reply_control *reply;
	struct ctdb_node_map *nodemap;
	struct ctdb_req_control request;
	struct ctdb_ban_state ban;
	unsigned int i;
	int ret;
	bool ok;

	ok = ctdb_client_control_recv(subreq, &ret, state, &reply);
	TALLOC_FREE(subreq);
	if (!ok) {
		D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
		      state->max_pnn, ret);
		tevent_req_error(req, ret);
		return;
	}

	ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
	if (ret != 0) {
		D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
		tevent_req_error(req, ret);
		return;
	}

	for (i=0; i<nodemap->num; i++) {
		if (nodemap->node[i].pnn != state->max_pnn) {
			continue;
		}

		/* If the node became inactive, reset ban_credits */
		if (nodemap->node[i].flags & NODE_FLAGS_INACTIVE) {
			unsigned int j;

			for (j=0; j<state->nlist->count; j++) {
				if (state->nlist->pnn_list[j] ==
				    state->max_pnn) {
					state->nlist->ban_credits[j] = 0;
					break;
				}
			}
			/* Marker: fall through to candidate re-selection */
			state->max_pnn = CTDB_UNKNOWN_PNN;
		}
	}

	talloc_free(nodemap);
	talloc_free(reply);

	/* If node becomes inactive during recovery, pick next */
	if (state->max_pnn == CTDB_UNKNOWN_PNN) {
		(void) ban_node_check(req);
		return;
	}

	ban = (struct ctdb_ban_state) {
		.pnn = state->max_pnn,
		.time = state->tun_list->recovery_ban_period,
	};

	D_ERR("Banning node %u for %u seconds\n", ban.pnn, ban.time);

	ctdb_req_control_set_ban_state(&request, &ban);
	subreq = ctdb_client_control_send(state,
					  state->ev,
					  state->client,
					  ban.pnn,
					  TIMEOUT(),
					  &request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, ban_node_done, req);
}
+
/*
 * SET_BAN_STATE reply received - verify it and finish the request.
 */
static void ban_node_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct ban_node_state *state = tevent_req_data(
		req, struct ban_node_state);
	struct ctdb_reply_control *reply;
	int ret;
	bool status;

	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		tevent_req_error(req, ret);
		return;
	}

	ret = ctdb_reply_control_set_ban_state(reply);
	if (ret != 0) {
		D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
		tevent_req_error(req, ret);
		return;
	}

	talloc_free(reply);
	tevent_req_done(req);
}
+
+static bool ban_node_recv(struct tevent_req *req, int *perr)
+{
+ if (tevent_req_is_unix_error(req, perr)) {
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Run the parallel database recovery
+ *
+ * - Get tunables
+ * - Get nodemap from all nodes
+ * - Get capabilities from all nodes
+ * - Get dbmap
+ * - Set RECOVERY_ACTIVE
+ * - Send START_RECOVERY
+ * - Update vnnmap on all nodes
+ * - Run database recovery
+ * - Set RECOVERY_NORMAL
+ * - Send END_RECOVERY
+ */
+
/* Top-level state for the whole recovery sequence. */
struct recovery_state {
	struct tevent_context *ev;
	struct ctdb_client_context *client;
	uint32_t generation;		/* new recovery generation */
	uint32_t destnode;		/* this node's PNN */
	struct node_list *nlist;	/* connected (later: active) nodes */
	struct ctdb_tunable_list *tun_list;
	struct ctdb_vnn_map *vnnmap;	/* VNN map computed during recovery */
	struct db_list *dblist;		/* union of all nodes' databases */
};
+
+static void recovery_tunables_done(struct tevent_req *subreq);
+static void recovery_nodemap_done(struct tevent_req *subreq);
+static void recovery_nodemap_verify(struct tevent_req *subreq);
+static void recovery_capabilities_done(struct tevent_req *subreq);
+static void recovery_dbmap_done(struct tevent_req *subreq);
+static void recovery_active_done(struct tevent_req *subreq);
+static void recovery_start_recovery_done(struct tevent_req *subreq);
+static void recovery_vnnmap_update_done(struct tevent_req *subreq);
+static void recovery_db_recovery_done(struct tevent_req *subreq);
+static void recovery_failed_done(struct tevent_req *subreq);
+static void recovery_normal_done(struct tevent_req *subreq);
+static void recovery_end_recovery_done(struct tevent_req *subreq);
+
/*
 * Start the recovery sequence by fetching the tunables from the local
 * node; the remaining steps are chained through the callbacks below.
 */
static struct tevent_req *recovery_send(TALLOC_CTX *mem_ctx,
					struct tevent_context *ev,
					struct ctdb_client_context *client,
					uint32_t generation)
{
	struct tevent_req *req, *subreq;
	struct recovery_state *state;
	struct ctdb_req_control request;

	req = tevent_req_create(mem_ctx, &state, struct recovery_state);
	if (req == NULL) {
		return NULL;
	}

	state->ev = ev;
	state->client = client;
	state->generation = generation;
	state->destnode = ctdb_client_pnn(client);

	ctdb_req_control_get_all_tunables(&request);
	subreq = ctdb_client_control_send(state, state->ev, state->client,
					  state->destnode, TIMEOUT(),
					  &request);
	if (tevent_req_nomem(subreq, req)) {
		return tevent_req_post(req, ev);
	}
	tevent_req_set_callback(subreq, recovery_tunables_done, req);

	return req;
}
+
/*
 * Tunables received.  Cache the recovery timeout and fetch the nodemap
 * from the local node.
 */
static void recovery_tunables_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_reply_control *reply;
	struct ctdb_req_control request;
	int ret;
	bool status;

	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
		tevent_req_error(req, ret);
		return;
	}

	/* A malformed reply is a protocol error, hence EPROTO below */
	ret = ctdb_reply_control_get_all_tunables(reply, state,
						  &state->tun_list);
	if (ret != 0) {
		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
		tevent_req_error(req, EPROTO);
		return;
	}

	talloc_free(reply);

	/* presumably consumed by the TIMEOUT() macro - confirm */
	recover_timeout = state->tun_list->recover_timeout;

	ctdb_req_control_get_nodemap(&request);
	subreq = ctdb_client_control_send(state, state->ev, state->client,
					  state->destnode, TIMEOUT(),
					  &request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_nodemap_done, req);
}
+
/*
 * Local nodemap received.  Build the list of connected nodes, then ask
 * every one of them for its own nodemap so flags can be verified.
 */
static void recovery_nodemap_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_reply_control *reply;
	struct ctdb_req_control request;
	struct ctdb_node_map *nodemap;
	unsigned int i;
	bool status;
	int ret;

	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
	TALLOC_FREE(subreq);
	if (! status) {
		D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
		      state->destnode, ret);
		tevent_req_error(req, ret);
		return;
	}

	ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
	if (ret != 0) {
		D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
		tevent_req_error(req, ret);
		return;
	}

	state->nlist = node_list_init(state, nodemap->num);
	if (tevent_req_nomem(state->nlist, req)) {
		return;
	}

	/* Only connected nodes take part in recovery */
	for (i=0; i<nodemap->num; i++) {
		bool ok;

		if (nodemap->node[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ok = node_list_add(state->nlist, nodemap->node[i].pnn);
		if (!ok) {
			tevent_req_error(req, EINVAL);
			return;
		}
	}

	talloc_free(nodemap);
	talloc_free(reply);

	/* Verify flags by getting local node information from each node */
	ctdb_req_control_get_nodemap(&request);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_nodemap_verify, req);
}
+
/*
 * Each node's self-reported nodemap received.
 *
 * Rebuild the node list keeping only nodes that report themselves as
 * active (a node unknown to its own nodemap counts as disconnected),
 * then fetch capabilities from the surviving nodes.
 */
static void recovery_nodemap_verify(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_req_control request;
	struct ctdb_reply_control **reply;
	struct node_list *nlist;
	unsigned int i;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq,
						&ret,
						state,
						&err_list,
						&reply);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control GET_NODEMAP failed on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	/* Fresh list to hold the verified (active) nodes */
	nlist = node_list_init(state, state->nlist->size);
	if (tevent_req_nomem(nlist, req)) {
		return;
	}

	for (i=0; i<state->nlist->count; i++) {
		struct ctdb_node_map *nodemap = NULL;
		uint32_t pnn, flags;
		unsigned int j;
		bool ok;

		pnn = state->nlist->pnn_list[i];
		ret = ctdb_reply_control_get_nodemap(reply[i],
						     state,
						     &nodemap);
		if (ret != 0) {
			D_ERR("control GET_NODEMAP failed on node %u\n", pnn);
			tevent_req_error(req, EPROTO);
			return;
		}

		/* Default to DISCONNECTED if the node does not list itself */
		flags = NODE_FLAGS_DISCONNECTED;
		for (j=0; j<nodemap->num; j++) {
			if (nodemap->node[j].pnn == pnn) {
				flags = nodemap->node[j].flags;
				break;
			}
		}

		TALLOC_FREE(nodemap);

		if (flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ok = node_list_add(nlist, pnn);
		if (!ok) {
			tevent_req_error(req, EINVAL);
			return;
		}
	}

	talloc_free(reply);

	/* Replace the connected-node list with the verified list */
	talloc_free(state->nlist);
	state->nlist = nlist;

	ctdb_req_control_get_capabilities(&request);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_capabilities_done, req);
}
+
/*
 * Capabilities received from all nodes - record them in the node list,
 * then fetch each node's database map.
 */
static void recovery_capabilities_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_reply_control **reply;
	struct ctdb_req_control request;
	int *err_list;
	unsigned int i;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
						&reply);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control GET_CAPABILITIES failed on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("control GET_CAPABILITIES failed, ret=%d\n",
			      ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	for (i=0; i<state->nlist->count; i++) {
		uint32_t caps;

		ret = ctdb_reply_control_get_capabilities(reply[i], &caps);
		if (ret != 0) {
			D_ERR("control GET_CAPABILITIES failed on node %u\n",
			      state->nlist->pnn_list[i]);
			tevent_req_error(req, EPROTO);
			return;
		}

		state->nlist->caps[i] = caps;
	}

	talloc_free(reply);

	ctdb_req_control_get_dbmap(&request);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_dbmap_done, req);
}
+
/*
 * Database maps received from all nodes.
 *
 * Merge them into a single database list (tracking which nodes have
 * each database), then set the recovery mode to ACTIVE everywhere.
 */
static void recovery_dbmap_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_reply_control **reply;
	struct ctdb_req_control request;
	int *err_list;
	unsigned int i, j;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq,
						&ret,
						state,
						&err_list,
						&reply);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("control GET_DBMAP failed on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("control GET_DBMAP failed, ret=%d\n",
			      ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	state->dblist = db_list_init(state, state->nlist->count);
	if (tevent_req_nomem(state->dblist, req)) {
		D_ERR("memory allocation error\n");
		return;
	}

	for (i = 0; i < state->nlist->count; i++) {
		struct ctdb_dbid_map *dbmap = NULL;
		uint32_t pnn;

		pnn = state->nlist->pnn_list[i];

		ret = ctdb_reply_control_get_dbmap(reply[i], state, &dbmap);
		if (ret != 0) {
			D_ERR("control GET_DBMAP failed on node %u\n",
			      pnn);
			tevent_req_error(req, EPROTO);
			return;
		}

		/* Merge this node's databases into the combined list */
		for (j = 0; j < dbmap->num; j++) {
			ret = db_list_check_and_add(state->dblist,
						    dbmap->dbs[j].db_id,
						    dbmap->dbs[j].flags,
						    pnn);
			if (ret != 0) {
				D_ERR("failed to add database list entry, "
				      "ret=%d\n",
				      ret);
				tevent_req_error(req, ret);
				return;
			}
		}

		TALLOC_FREE(dbmap);
	}

	ctdb_req_control_set_recmode(&request, CTDB_RECOVERY_ACTIVE);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_active_done, req);
}
+
/*
 * Recovery mode is now ACTIVE on all nodes.
 *
 * Compute the new VNN map from the lmaster-capable nodes and run the
 * start_recovery event everywhere.
 */
static void recovery_active_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_req_control request;
	struct ctdb_vnn_map *vnnmap;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("failed to set recovery mode ACTIVE on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("failed to set recovery mode ACTIVE, ret=%d\n",
			      ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	D_ERR("Set recovery mode to ACTIVE\n");

	/* Calculate new VNNMAP */
	vnnmap = talloc_zero(state, struct ctdb_vnn_map);
	if (tevent_req_nomem(vnnmap, req)) {
		return;
	}

	vnnmap->map = node_list_lmaster(state->nlist, vnnmap, &vnnmap->size);
	if (tevent_req_nomem(vnnmap->map, req)) {
		return;
	}

	if (vnnmap->size == 0) {
		/* NOTE(review): writing map[0] assumes node_list_lmaster()
		 * allocates at least one slot even when no lmasters are
		 * found - confirm against its implementation */
		D_WARNING("No active lmasters found. Adding recmaster anyway\n");
		vnnmap->map[0] = state->destnode;
		vnnmap->size = 1;
	}

	vnnmap->generation = state->generation;

	state->vnnmap = vnnmap;

	ctdb_req_control_start_recovery(&request);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_start_recovery_done, req);
}
+
/*
 * start_recovery event has run on all nodes - push the new VNN map to
 * everyone.
 */
static void recovery_start_recovery_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_req_control request;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("failed to run start_recovery event on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("failed to run start_recovery event, ret=%d\n",
			      ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	D_ERR("start_recovery event finished\n");

	ctdb_req_control_setvnnmap(&request, state->vnnmap);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_vnnmap_update_done, req);
}
+
/*
 * VNN map updated on all nodes - start the parallel per-database
 * recovery.
 */
static void recovery_vnnmap_update_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("failed to update VNNMAP on node %u, ret=%d\n",
			      pnn, ret2);
		} else {
			D_ERR("failed to update VNNMAP, ret=%d\n", ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	D_NOTICE("updated VNNMAP\n");

	subreq = db_recovery_send(state,
				  state->ev,
				  state->client,
				  state->dblist,
				  state->tun_list,
				  state->nlist,
				  state->vnnmap->generation);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_db_recovery_done, req);
}
+
+static void recovery_db_recovery_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct recovery_state *state = tevent_req_data(
+ req, struct recovery_state);
+ struct ctdb_req_control request;
+ bool status;
+ unsigned int count;
+
+ status = db_recovery_recv(subreq, &count);
+ TALLOC_FREE(subreq);
+
+ D_ERR("%d of %d databases recovered\n", count, state->dblist->num_dbs);
+
+ if (! status) {
+ subreq = ban_node_send(state,
+ state->ev,
+ state->client,
+ state->tun_list,
+ state->nlist);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, recovery_failed_done, req);
+ return;
+ }
+
+ ctdb_req_control_set_recmode(&request, CTDB_RECOVERY_NORMAL);
+ subreq = ctdb_client_control_multi_send(state,
+ state->ev,
+ state->client,
+ state->nlist->pnn_list,
+ state->nlist->count,
+ TIMEOUT(),
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, recovery_normal_done, req);
+}
+
/*
 * Banning completed after a failed database recovery.  The overall
 * recovery still fails (EIO) regardless of whether the ban succeeded.
 */
static void recovery_failed_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	int ret;
	bool status;

	status = ban_node_recv(subreq, &ret);
	TALLOC_FREE(subreq);
	if (! status) {
		D_ERR("failed to ban node, ret=%d\n", ret);
	}

	tevent_req_error(req, EIO);
}
+
/*
 * Recovery mode is back to NORMAL on all nodes - run the recovered
 * (end_recovery) event everywhere.
 */
static void recovery_normal_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	struct ctdb_req_control request;
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("failed to set recovery mode NORMAL on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("failed to set recovery mode NORMAL, ret=%d\n",
			      ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	D_ERR("Set recovery mode to NORMAL\n");

	ctdb_req_control_end_recovery(&request);
	subreq = ctdb_client_control_multi_send(state,
						state->ev,
						state->client,
						state->nlist->pnn_list,
						state->nlist->count,
						TIMEOUT(),
						&request);
	if (tevent_req_nomem(subreq, req)) {
		return;
	}
	tevent_req_set_callback(subreq, recovery_end_recovery_done, req);
}
+
/*
 * The recovered event has run on all nodes - recovery is complete.
 */
static void recovery_end_recovery_done(struct tevent_req *subreq)
{
	struct tevent_req *req = tevent_req_callback_data(
		subreq, struct tevent_req);
	struct recovery_state *state = tevent_req_data(
		req, struct recovery_state);
	int *err_list;
	int ret;
	bool status;

	status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
						NULL);
	TALLOC_FREE(subreq);
	if (! status) {
		int ret2;
		uint32_t pnn;

		ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list,
						       state->nlist->count,
						       err_list,
						       &pnn);
		if (ret2 != 0) {
			D_ERR("failed to run recovered event on node %u,"
			      " ret=%d\n", pnn, ret2);
		} else {
			D_ERR("failed to run recovered event, ret=%d\n", ret);
		}
		tevent_req_error(req, ret);
		return;
	}

	D_ERR("recovered event finished\n");

	tevent_req_done(req);
}
+
/* Collect the final recovery result; *perr receives the error (if any). */
static void recovery_recv(struct tevent_req *req, int *perr)
{
	(void) generic_recv(req, perr);
}
+
/* Print command-line usage to stderr. */
static void usage(const char *progname)
{
	fprintf(stderr,
		"\nUsage: %s <output-fd> <ctdb-socket-path> <generation>\n",
		progname);
}
+
+
+/*
+ * Arguments - log fd, write fd, socket path, generation
+ */
+int main(int argc, char *argv[])
+{
+ int write_fd;
+ const char *sockpath;
+ TALLOC_CTX *mem_ctx = NULL;
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ bool status;
+ int ret = 0;
+ struct tevent_req *req;
+ uint32_t generation;
+
+ if (argc != 4) {
+ usage(argv[0]);
+ exit(1);
+ }
+
+ write_fd = atoi(argv[1]);
+ sockpath = argv[2];
+ generation = (uint32_t)smb_strtoul(argv[3],
+ NULL,
+ 0,
+ &ret,
+ SMB_STR_STANDARD);
+ if (ret != 0) {
+ fprintf(stderr, "recovery: unable to initialize generation\n");
+ goto failed;
+ }
+
+ mem_ctx = talloc_new(NULL);
+ if (mem_ctx == NULL) {
+ fprintf(stderr, "recovery: talloc_new() failed\n");
+ goto failed;
+ }
+
+ ret = logging_init(mem_ctx, NULL, NULL, "ctdb-recovery");
+ if (ret != 0) {
+ fprintf(stderr, "recovery: Unable to initialize logging\n");
+ goto failed;
+ }
+
+ ev = tevent_context_init(mem_ctx);
+ if (ev == NULL) {
+ D_ERR("tevent_context_init() failed\n");
+ goto failed;
+ }
+
+ status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
+ if (!status) {
+ D_ERR("logging_setup_sighup_handler() failed\n");
+ goto failed;
+ }
+
+ ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
+ if (ret != 0) {
+ D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
+ goto failed;
+ }
+
+ req = recovery_send(mem_ctx, ev, client, generation);
+ if (req == NULL) {
+ D_ERR("database_recover_send() failed\n");
+ goto failed;
+ }
+
+ if (! tevent_req_poll(req, ev)) {
+ D_ERR("tevent_req_poll() failed\n");
+ goto failed;
+ }
+
+ recovery_recv(req, &ret);
+ TALLOC_FREE(req);
+ if (ret != 0) {
+ D_ERR("database recovery failed, ret=%d\n", ret);
+ goto failed;
+ }
+
+ sys_write(write_fd, &ret, sizeof(ret));
+ return 0;
+
+failed:
+ TALLOC_FREE(mem_ctx);
+ return 1;
+}
diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c
new file mode 100644
index 0000000..b602cee
--- /dev/null
+++ b/ctdb/server/ctdb_server.c
@@ -0,0 +1,608 @@
+/*
+ ctdb main protocol code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ choose the transport we will use
+*/
+int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
+{
+ ctdb->transport = talloc_strdup(ctdb, transport);
+ CTDB_NO_MEMORY(ctdb, ctdb->transport);
+
+ return 0;
+}
+
+/* Return the node structure for nodeip, NULL if nodeip is invalid */
+struct ctdb_node *ctdb_ip_to_node(struct ctdb_context *ctdb,
+				  const ctdb_sock_addr *nodeip)
+{
+	unsigned int i;
+
+	for (i = 0; i < ctdb->num_nodes; i++) {
+		struct ctdb_node *node = ctdb->nodes[i];
+
+		/* DELETED slots are placeholders, never a match */
+		if (node->flags & NODE_FLAGS_DELETED) {
+			continue;
+		}
+		if (ctdb_same_ip(&node->address, nodeip)) {
+			return node;
+		}
+	}
+
+	return NULL;
+}
+
+/* Return the PNN for nodeip, CTDB_UNKNOWN_PNN if nodeip is invalid */
+uint32_t ctdb_ip_to_pnn(struct ctdb_context *ctdb,
+			const ctdb_sock_addr *nodeip)
+{
+	struct ctdb_node *node = ctdb_ip_to_node(ctdb, nodeip);
+
+	return (node != NULL) ? node->pnn : CTDB_UNKNOWN_PNN;
+}
+
+/* Convert a parsed node map into an array of ctdb_node structures.
+ *
+ * On success *nodes/*num_nodes are filled in; each node starts out
+ * marked DISCONNECTED (and UNHEALTHY unless DELETED) until the
+ * transport connects and monitoring updates its state.
+ * Returns 0 on success; the CTDB_NO_MEMORY macros return early on
+ * allocation failure.
+ */
+static int convert_node_map_to_list(struct ctdb_context *ctdb,
+				    TALLOC_CTX *mem_ctx,
+				    struct ctdb_node_map_old *node_map,
+				    struct ctdb_node ***nodes,
+				    uint32_t *num_nodes)
+{
+	unsigned int i;
+
+	*nodes = talloc_zero_array(mem_ctx,
+					struct ctdb_node *, node_map->num);
+	CTDB_NO_MEMORY(ctdb, *nodes);
+	*num_nodes = node_map->num;
+
+	for (i = 0; i < node_map->num; i++) {
+		struct ctdb_node *node;
+
+		node = talloc_zero(*nodes, struct ctdb_node);
+		CTDB_NO_MEMORY(ctdb, node);
+		(*nodes)[i] = node;
+
+		node->address = node_map->nodes[i].addr;
+		node->name = talloc_asprintf(node, "%s:%u",
+					     ctdb_addr_to_str(&node->address),
+					     ctdb_addr_to_port(&node->address));
+
+		/* Note: for non-DELETED nodes the map flags are
+		 * deliberately REPLACED by UNHEALTHY - initial state
+		 * is determined here, not by the file */
+		node->flags = node_map->nodes[i].flags;
+		if (!(node->flags & NODE_FLAGS_DELETED)) {
+			node->flags = NODE_FLAGS_UNHEALTHY;
+		}
+		node->flags |= NODE_FLAGS_DISCONNECTED;
+
+		node->pnn = i;
+		node->ctdb = ctdb;
+		node->dead_count = 0;
+	}
+
+	return 0;
+}
+
+/* Load the nodes list from ctdb->nodes_file into ctdb->nodes.
+ * Exits the daemon on any failure - ctdb cannot operate without a
+ * valid nodes list.
+ */
+void ctdb_load_nodes_file(struct ctdb_context *ctdb)
+{
+	struct ctdb_node_map_old *node_map;
+	int ret;
+
+	node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
+	if (node_map == NULL) {
+		goto fail;
+	}
+
+	/* Drop any previously loaded list before converting */
+	TALLOC_FREE(ctdb->nodes);
+	ret = convert_node_map_to_list(ctdb, ctdb, node_map,
+				       &ctdb->nodes, &ctdb->num_nodes);
+	if (ret == -1) {
+		goto fail;
+	}
+
+	talloc_free(node_map);
+	return;
+
+fail:
+	DEBUG(DEBUG_ERR, ("Failed to load nodes file \"%s\"\n",
+			  ctdb->nodes_file));
+	/* talloc_free(NULL) is a no-op, so this is safe on the
+	 * read-failure path */
+	talloc_free(node_map);
+	exit(1);
+}
+
+/*
+ setup the local node address
+*/
+int ctdb_set_address(struct ctdb_context *ctdb, const char *address)
+{
+ ctdb->address = talloc(ctdb, ctdb_sock_addr);
+ CTDB_NO_MEMORY(ctdb, ctdb->address);
+
+ if (ctdb_parse_address(ctdb, address, ctdb->address) != 0) {
+ return -1;
+ }
+
+ ctdb->name = talloc_asprintf(ctdb, "%s:%u",
+ ctdb_addr_to_str(ctdb->address),
+ ctdb_addr_to_port(ctdb->address));
+ return 0;
+}
+
+
+/*
+ return the number of active nodes
+*/
+uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb)
+{
+ unsigned int i;
+ uint32_t count=0;
+ for (i=0; i < ctdb->num_nodes; i++) {
+ if (!(ctdb->nodes[i]->flags & NODE_FLAGS_INACTIVE)) {
+ count++;
+ }
+ }
+ return count;
+}
+
+
+/*
+ called when we need to process a packet. This can be a requeued packet
+ after a lockwait, or a real packet from another node
+*/
+void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ TALLOC_CTX *tmp_ctx;
+
+ /* place the packet as a child of the tmp_ctx. We then use
+ talloc_free() below to free it. If any of the calls want
+ to keep it, then they will steal it somewhere else, and the
+ talloc_free() will only free the tmp_ctx */
+ tmp_ctx = talloc_new(ctdb);
+ talloc_steal(tmp_ctx, hdr);
+
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb request %u of type %u length %u from "
+ "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+ hdr->srcnode, hdr->destnode));
+
+ switch (hdr->operation) {
+ case CTDB_REQ_CALL:
+ case CTDB_REPLY_CALL:
+ case CTDB_REQ_DMASTER:
+ case CTDB_REPLY_DMASTER:
+ /* we don't allow these calls when banned */
+ if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_BANNED) {
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+ " request %u"
+ " length %u from node %u to %u while node"
+ " is banned\n",
+ hdr->operation, hdr->reqid,
+ hdr->length,
+ hdr->srcnode, hdr->destnode));
+ goto done;
+ }
+
+ /* for ctdb_call inter-node operations verify that the
+ remote node that sent us the call is running in the
+ same generation instance as this node
+ */
+ if (ctdb->vnn_map->generation != hdr->generation) {
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+ " request %u"
+ " length %u from node %u to %u had an"
+ " invalid generation id:%u while our"
+ " generation id is:%u\n",
+ hdr->operation, hdr->reqid,
+ hdr->length,
+ hdr->srcnode, hdr->destnode,
+ hdr->generation, ctdb->vnn_map->generation));
+ goto done;
+ }
+ }
+
+ switch (hdr->operation) {
+ case CTDB_REQ_CALL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_call);
+ ctdb_request_call(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_CALL:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_call);
+ ctdb_reply_call(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_ERROR:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_error);
+ ctdb_reply_error(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_DMASTER:
+ CTDB_INCREMENT_STAT(ctdb, node.req_dmaster);
+ ctdb_request_dmaster(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_DMASTER:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_dmaster);
+ ctdb_reply_dmaster(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_MESSAGE:
+ CTDB_INCREMENT_STAT(ctdb, node.req_message);
+ ctdb_request_message(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_CONTROL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_control);
+ ctdb_request_control(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_CONTROL:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_control);
+ ctdb_reply_control(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_KEEPALIVE:
+ CTDB_INCREMENT_STAT(ctdb, keepalive_packets_recv);
+ ctdb_request_keepalive(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_TUNNEL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_tunnel);
+ ctdb_request_tunnel(ctdb, hdr);
+ break;
+
+ default:
+ DEBUG(DEBUG_CRIT,("%s: Packet with unknown operation %u\n",
+ __location__, hdr->operation));
+ break;
+ }
+
+done:
+ talloc_free(tmp_ctx);
+}
+
+
+/*
+ called by the transport layer when a node is dead
+*/
+void ctdb_node_dead(struct ctdb_node *node)
+{
+ if (node->ctdb->methods == NULL) {
+ DBG_ERR("Can not restart transport while shutting down\n");
+ return;
+ }
+ node->ctdb->methods->restart(node);
+
+ if (node->flags & NODE_FLAGS_DISCONNECTED) {
+ DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n",
+ node->ctdb->name, node->name,
+ node->ctdb->num_connected));
+ return;
+ }
+ node->ctdb->num_connected--;
+ node->flags |= NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY;
+ node->rx_cnt = 0;
+ node->dead_count = 0;
+
+ DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n",
+ node->ctdb->name, node->name, node->ctdb->num_connected));
+ ctdb_daemon_cancel_controls(node->ctdb, node);
+}
+
+/*
+ called by the transport layer when a node is connected
+*/
+void ctdb_node_connected(struct ctdb_node *node)
+{
+ if (!(node->flags & NODE_FLAGS_DISCONNECTED)) {
+ DEBUG(DEBUG_INFO,("%s: node %s is already marked connected: %u connected\n",
+ node->ctdb->name, node->name,
+ node->ctdb->num_connected));
+ return;
+ }
+ node->ctdb->num_connected++;
+ node->dead_count = 0;
+ node->flags &= ~NODE_FLAGS_DISCONNECTED;
+ DEBUG(DEBUG_ERR,
+ ("%s: connected to %s - %u connected\n",
+ node->ctdb->name, node->name, node->ctdb->num_connected));
+}
+
+struct queue_next {
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr;
+};
+
+
+/*
+ triggered when a deferred packet is due
+ */
+static void queue_next_trigger(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct queue_next *q = talloc_get_type(private_data, struct queue_next);
+ ctdb_input_pkt(q->ctdb, q->hdr);
+ talloc_free(q);
+}
+
+/*
+ defer a packet, so it is processed on the next event loop
+ this is used for sending packets to ourselves
+ */
+static void ctdb_defer_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct queue_next *q;
+ q = talloc(ctdb, struct queue_next);
+ if (q == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to allocate deferred packet\n"));
+ return;
+ }
+ q->ctdb = ctdb;
+ q->hdr = talloc_memdup(q, hdr, hdr->length);
+ if (q->hdr == NULL) {
+ talloc_free(q);
+ DEBUG(DEBUG_ERR,("Error copying deferred packet to self\n"));
+ return;
+ }
+#if 0
+ /* use this to put packets directly into our recv function */
+ ctdb_input_pkt(q->ctdb, q->hdr);
+#else
+ tevent_add_timer(ctdb->ev, q, timeval_zero(), queue_next_trigger, q);
+#endif
+}
+
+
+/*
+ broadcast a packet to all nodes
+*/
+static void ctdb_broadcast_packet_all(struct ctdb_context *ctdb,
+ struct ctdb_req_header *hdr)
+{
+ unsigned int i;
+ for (i=0; i < ctdb->num_nodes; i++) {
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+ continue;
+ }
+ hdr->destnode = ctdb->nodes[i]->pnn;
+ ctdb_queue_packet(ctdb, hdr);
+ }
+}
+
+/*
+ broadcast a packet to all active nodes
+*/
+static void ctdb_broadcast_packet_active(struct ctdb_context *ctdb,
+ struct ctdb_req_header *hdr)
+{
+ unsigned int i;
+ for (i = 0; i < ctdb->num_nodes; i++) {
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+
+ hdr->destnode = ctdb->nodes[i]->pnn;
+ ctdb_queue_packet(ctdb, hdr);
+ }
+}
+
+/*
+ broadcast a packet to all connected nodes
+*/
+static void ctdb_broadcast_packet_connected(struct ctdb_context *ctdb,
+ struct ctdb_req_header *hdr)
+{
+ unsigned int i;
+ for (i=0; i < ctdb->num_nodes; i++) {
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+ continue;
+ }
+ if (!(ctdb->nodes[i]->flags & NODE_FLAGS_DISCONNECTED)) {
+ hdr->destnode = ctdb->nodes[i]->pnn;
+ ctdb_queue_packet(ctdb, hdr);
+ }
+ }
+}
+
+/*
+ queue a packet or die
+*/
+void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct ctdb_node *node;
+
+ switch (hdr->destnode) {
+ case CTDB_BROADCAST_ALL:
+ ctdb_broadcast_packet_all(ctdb, hdr);
+ return;
+ case CTDB_BROADCAST_ACTIVE:
+ ctdb_broadcast_packet_active(ctdb, hdr);
+ return;
+ case CTDB_BROADCAST_CONNECTED:
+ ctdb_broadcast_packet_connected(ctdb, hdr);
+ return;
+ }
+
+ CTDB_INCREMENT_STAT(ctdb, node_packets_sent);
+
+ if (!ctdb_validate_pnn(ctdb, hdr->destnode)) {
+ DEBUG(DEBUG_CRIT,(__location__ " can't send to node %u that does not exist\n",
+ hdr->destnode));
+ return;
+ }
+
+ node = ctdb->nodes[hdr->destnode];
+
+ if (node->flags & NODE_FLAGS_DELETED) {
+ DEBUG(DEBUG_ERR, (__location__ " Can not queue packet to DELETED node %d\n", hdr->destnode));
+ return;
+ }
+
+ if (node->pnn == ctdb->pnn) {
+ ctdb_defer_packet(ctdb, hdr);
+ return;
+ }
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_ALERT, (__location__ " Can not queue packet. "
+ "Transport is DOWN\n"));
+ return;
+ }
+
+ node->tx_cnt++;
+ if (ctdb->methods->queue_pkt(node, (uint8_t *)hdr, hdr->length) != 0) {
+ ctdb_fatal(ctdb, "Unable to queue packet\n");
+ }
+}
+
+
+
+
+/*
+ a valgrind hack to allow us to get opcode specific backtraces
+ very ugly, and relies on no compiler optimisation!
+*/
+void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode)
+{
+ switch (opcode) {
+#define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break
+ DO_OP(1);
+ DO_OP(2);
+ DO_OP(3);
+ DO_OP(4);
+ DO_OP(5);
+ DO_OP(6);
+ DO_OP(7);
+ DO_OP(8);
+ DO_OP(9);
+ DO_OP(10);
+ DO_OP(11);
+ DO_OP(12);
+ DO_OP(13);
+ DO_OP(14);
+ DO_OP(15);
+ DO_OP(16);
+ DO_OP(17);
+ DO_OP(18);
+ DO_OP(19);
+ DO_OP(20);
+ DO_OP(21);
+ DO_OP(22);
+ DO_OP(23);
+ DO_OP(24);
+ DO_OP(25);
+ DO_OP(26);
+ DO_OP(27);
+ DO_OP(28);
+ DO_OP(29);
+ DO_OP(30);
+ DO_OP(31);
+ DO_OP(32);
+ DO_OP(33);
+ DO_OP(34);
+ DO_OP(35);
+ DO_OP(36);
+ DO_OP(37);
+ DO_OP(38);
+ DO_OP(39);
+ DO_OP(40);
+ DO_OP(41);
+ DO_OP(42);
+ DO_OP(43);
+ DO_OP(44);
+ DO_OP(45);
+ DO_OP(46);
+ DO_OP(47);
+ DO_OP(48);
+ DO_OP(49);
+ DO_OP(50);
+ DO_OP(51);
+ DO_OP(52);
+ DO_OP(53);
+ DO_OP(54);
+ DO_OP(55);
+ DO_OP(56);
+ DO_OP(57);
+ DO_OP(58);
+ DO_OP(59);
+ DO_OP(60);
+ DO_OP(61);
+ DO_OP(62);
+ DO_OP(63);
+ DO_OP(64);
+ DO_OP(65);
+ DO_OP(66);
+ DO_OP(67);
+ DO_OP(68);
+ DO_OP(69);
+ DO_OP(70);
+ DO_OP(71);
+ DO_OP(72);
+ DO_OP(73);
+ DO_OP(74);
+ DO_OP(75);
+ DO_OP(76);
+ DO_OP(77);
+ DO_OP(78);
+ DO_OP(79);
+ DO_OP(80);
+ DO_OP(81);
+ DO_OP(82);
+ DO_OP(83);
+ DO_OP(84);
+ DO_OP(85);
+ DO_OP(86);
+ DO_OP(87);
+ DO_OP(88);
+ DO_OP(89);
+ DO_OP(90);
+ DO_OP(91);
+ DO_OP(92);
+ DO_OP(93);
+ DO_OP(94);
+ DO_OP(95);
+ DO_OP(96);
+ DO_OP(97);
+ DO_OP(98);
+ DO_OP(99);
+ DO_OP(100);
+ default:
+ ctdb_queue_packet(ctdb, hdr);
+ break;
+ }
+}
diff --git a/ctdb/server/ctdb_statistics.c b/ctdb/server/ctdb_statistics.c
new file mode 100644
index 0000000..4cf8f9e
--- /dev/null
+++ b/ctdb/server/ctdb_statistics.c
@@ -0,0 +1,93 @@
+/*
+ ctdb statistics code
+
+ Copyright (C) Ronnie Sahlberg 2010
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/logging.h"
+
+/*
+ * Timer handler: roll the current statistics snapshot into slot 0 of
+ * the history ring, reset the current counters and re-arm the timer
+ * for the next interval.
+ */
+static void ctdb_statistics_update(struct tevent_context *ev,
+				   struct tevent_timer *te,
+				   struct timeval t, void *p)
+{
+	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+
+	/* Shift the history down one slot (regions overlap, hence
+	 * memmove) and record the current stats in slot 0 */
+	memmove(&ctdb->statistics_history[1], &ctdb->statistics_history[0],
+		(MAX_STAT_HISTORY-1)*sizeof(struct ctdb_statistics));
+	memcpy(&ctdb->statistics_history[0], &ctdb->statistics_current,
+	       sizeof(struct ctdb_statistics));
+	ctdb->statistics_history[0].statistics_current_time = timeval_current();
+
+	/* memset() instead of legacy bzero() - bzero was removed in
+	 * POSIX.1-2008 */
+	memset(&ctdb->statistics_current, 0, sizeof(struct ctdb_statistics));
+	ctdb->statistics_current.statistics_start_time = timeval_current();
+
+	tevent_add_timer(ctdb->ev, ctdb,
+			 timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+			 ctdb_statistics_update, ctdb);
+}
+
+/*
+ * Initialise all statistics structures (cumulative, current interval
+ * and the history ring) and start the periodic roll-over timer.
+ * Returns 0.
+ */
+int ctdb_statistics_init(struct ctdb_context *ctdb)
+{
+	/* memset() instead of legacy bzero() - bzero was removed in
+	 * POSIX.1-2008 */
+	memset(&ctdb->statistics, 0, sizeof(struct ctdb_statistics));
+	ctdb->statistics.statistics_start_time = timeval_current();
+
+	memset(&ctdb->statistics_current, 0, sizeof(struct ctdb_statistics));
+	ctdb->statistics_current.statistics_start_time = timeval_current();
+
+	memset(ctdb->statistics_history, 0, sizeof(ctdb->statistics_history));
+
+	tevent_add_timer(ctdb->ev, ctdb,
+			 timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+			 ctdb_statistics_update, ctdb);
+	return 0;
+}
+
+
+/* Control handler: return the full statistics history ring to the
+ * client.  The reply buffer is allocated as a child of outdata, so
+ * the control framework owns and frees it.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int32_t ctdb_control_get_stat_history(struct ctdb_context *ctdb,
+				      struct ctdb_req_control_old *c,
+				      TDB_DATA *outdata)
+{
+	int len;
+	struct ctdb_statistics_list_old *s;
+
+	/* Wire format: header up to the flexible stats member, then
+	 * the whole history array */
+	len = offsetof(struct ctdb_statistics_list_old, stats) +
+		MAX_STAT_HISTORY*sizeof(struct ctdb_statistics);
+
+	s = talloc_size(outdata, len);
+	if (s == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to allocate statistics history structure\n"));
+		return -1;
+	}
+
+	s->num = MAX_STAT_HISTORY;
+	memcpy(&s->stats[0], &ctdb->statistics_history[0], sizeof(ctdb->statistics_history));
+
+	outdata->dsize = len;
+	outdata->dptr = (uint8_t *)s;
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
new file mode 100644
index 0000000..b622faf
--- /dev/null
+++ b/ctdb/server/ctdb_takeover.c
@@ -0,0 +1,2751 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "protocol/protocol_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/system_socket.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#include "server/ipalloc.h"
+
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+#define CTDB_ARP_INTERVAL 1
+#define CTDB_ARP_REPEAT 3
+
+struct ctdb_interface {
+ struct ctdb_interface *prev, *next;
+ const char *name;
+ bool link_up;
+ uint32_t references;
+};
+
+struct vnn_interface {
+ struct vnn_interface *prev, *next;
+ struct ctdb_interface *iface;
+};
+
+/* state associated with a public ip address */
+struct ctdb_vnn {
+	struct ctdb_vnn *prev, *next;
+
+	/* Interface currently hosting the address (NULL when not
+	 * hosted here); ifaces is the list of candidate interfaces */
+	struct ctdb_interface *iface;
+	struct vnn_interface *ifaces;
+	ctdb_sock_addr public_address;
+	uint8_t public_netmask_bits;
+
+	/*
+	 * The node number that is serving this public address - set
+	 * to CTDB_UNKNOWN_PNN when this node stops serving it (see
+	 * ctdb_vnn_unassign_iface())
+	 */
+	uint32_t pnn;
+
+	/* List of clients to tickle for this public address */
+	struct ctdb_tcp_array *tcp_array;
+
+	/* whether we need to update the other nodes with changes to our list
+	   of connected clients */
+	bool tcp_update_needed;
+
+	/* a context to hang sending gratious arp events off */
+	TALLOC_CTX *takeover_ctx;
+
+	/* Set to true any time an update to this VNN is in flight.
+	   This helps to avoid races. */
+	bool update_in_flight;
+
+	/* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP
+	 * address then this flag is set. It will be deleted in the
+	 * release IP callback. */
+	bool delete_pending;
+};
+
+/* Printable name for an interface; "__none__" for NULL */
+static const char *iface_string(const struct ctdb_interface *iface)
+{
+	if (iface == NULL) {
+		return "__none__";
+	}
+	return iface->name;
+}
+
+/* Printable name of the interface currently hosting a VNN */
+static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+{
+	const struct ctdb_interface *iface = vnn->iface;
+
+	return iface_string(iface);
+}
+
+static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
+ const char *iface);
+
+/* Find or create the ctdb_interface entry for the named interface.
+ * Returns the (possibly pre-existing) entry, or NULL on name-too-long
+ * or allocation failure.  New entries start with link_up = true and
+ * zero references.
+ */
+static struct ctdb_interface *
+ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
+{
+	struct ctdb_interface *i;
+
+	if (strlen(iface) > CTDB_IFACE_SIZE) {
+		DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface));
+		return NULL;
+	}
+
+	/* Verify that we don't have an entry for this ip yet */
+	i = ctdb_find_iface(ctdb, iface);
+	if (i != NULL) {
+		return i;
+	}
+
+	/* create a new structure for this interface */
+	i = talloc_zero(ctdb, struct ctdb_interface);
+	if (i == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return NULL;
+	}
+	i->name = talloc_strdup(i, iface);
+	if (i->name == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		talloc_free(i);
+		return NULL;
+	}
+
+	/* assume link is up until monitoring reports otherwise */
+	i->link_up = true;
+
+	DLIST_ADD(ctdb->ifaces, i);
+
+	return i;
+}
+
+/* True if iface appears in the VNN's candidate interface list */
+static bool vnn_has_interface(struct ctdb_vnn *vnn,
+			      const struct ctdb_interface *iface)
+{
+	struct vnn_interface *cur;
+
+	for (cur = vnn->ifaces; cur != NULL; cur = cur->next) {
+		if (cur->iface == iface) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* If any interfaces now have no possible IPs then delete them. This
+ * implementation is naive (i.e. simple) rather than clever
+ * (i.e. complex). Given that this is run on delip and that operation
+ * is rare, this doesn't need to be efficient - it needs to be
+ * foolproof. One alternative is reference counting, where the logic
+ * is distributed and can, therefore, be broken in multiple places.
+ * Another alternative is to build a red-black tree of interfaces that
+ * can have addresses (by walking ctdb->vnn once) and then walking
+ * ctdb->ifaces once and deleting those not in the tree. Let's go to
+ * one of those if the naive implementation causes problems... :-)
+ */
+static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
+					struct ctdb_vnn *vnn)
+{
+	struct ctdb_interface *i, *next;
+
+	/* For each interface, check if there's an IP using it. */
+	for (i = ctdb->ifaces; i != NULL; i = next) {
+		struct ctdb_vnn *tv;
+		bool found;
+		/* save next now - i may be freed below */
+		next = i->next;
+
+		/* Only consider interfaces named in the given VNN. */
+		if (!vnn_has_interface(vnn, i)) {
+			continue;
+		}
+
+		/* Search for a vnn with this interface. */
+		found = false;
+		for (tv=ctdb->vnn; tv; tv=tv->next) {
+			if (vnn_has_interface(tv, i)) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			/* None of the VNNs are using this interface. */
+			DLIST_REMOVE(ctdb->ifaces, i);
+			talloc_free(i);
+		}
+	}
+}
+
+
+/* Look up an interface entry by name; NULL if not known */
+static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
+					      const char *iface)
+{
+	struct ctdb_interface *cur;
+
+	for (cur = ctdb->ifaces; cur != NULL; cur = cur->next) {
+		if (strcmp(cur->name, iface) == 0) {
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/* Pick the best interface for a VNN: among the candidates whose link
+ * is up, the one with the fewest addresses already assigned (first
+ * wins on a tie).  NULL when no candidate has link up.
+ */
+static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
+						  struct ctdb_vnn *vnn)
+{
+	struct vnn_interface *vi;
+	struct ctdb_interface *best = NULL;
+
+	for (vi = vnn->ifaces; vi != NULL; vi = vi->next) {
+		struct ctdb_interface *candidate = vi->iface;
+
+		if (!candidate->link_up) {
+			continue;
+		}
+		if (best == NULL || candidate->references < best->references) {
+			best = candidate;
+		}
+	}
+
+	return best;
+}
+
+/* Assign an interface to a VNN so this node can host the address.
+ * A no-op (success) when an interface is already assigned.  Picks the
+ * best available interface, bumps its reference count and claims the
+ * address for this node.  Returns 0 on success, -1 when no usable
+ * interface exists.
+ */
+static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
+				     struct ctdb_vnn *vnn)
+{
+	struct ctdb_interface *best = NULL;
+
+	if (vnn->iface) {
+		DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+				   "still assigned to iface '%s'\n",
+				   ctdb_addr_to_str(&vnn->public_address),
+				   ctdb_vnn_iface_string(vnn)));
+		return 0;
+	}
+
+	best = ctdb_vnn_best_iface(ctdb, vnn);
+	if (best == NULL) {
+		/* fixed garbled log message ("cannot assign to iface
+		 * any iface") */
+		DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
+				  "cannot be assigned to any iface\n",
+				  ctdb_addr_to_str(&vnn->public_address)));
+		return -1;
+	}
+
+	vnn->iface = best;
+	best->references++;
+	vnn->pnn = ctdb->pnn;
+
+	DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+			   "now assigned to iface '%s' refs[%d]\n",
+			   ctdb_addr_to_str(&vnn->public_address),
+			   ctdb_vnn_iface_string(vnn),
+			   best->references));
+	return 0;
+}
+
+/* Release the VNN's interface assignment: drop the interface's
+ * reference count and, if this node was serving the address, mark it
+ * unserved.  Safe to call when no interface is assigned.
+ */
+static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
+				    struct ctdb_vnn *vnn)
+{
+	DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+			   "now unassigned (old iface '%s' refs[%d])\n",
+			   ctdb_addr_to_str(&vnn->public_address),
+			   ctdb_vnn_iface_string(vnn),
+			   vnn->iface?vnn->iface->references:0));
+	if (vnn->iface) {
+		vnn->iface->references--;
+	}
+	vnn->iface = NULL;
+	/* only clear pnn if we were the one serving the address */
+	if (vnn->pnn == ctdb->pnn) {
+		vnn->pnn = CTDB_UNKNOWN_PNN;
+	}
+}
+
+/* Can this node host the given public address right now?
+ * Requires: node in RUNNING runstate, node neither inactive nor
+ * disabled, address not pending deletion, and at least one candidate
+ * interface with link up.
+ */
+static bool ctdb_vnn_available(struct ctdb_context *ctdb,
+			       struct ctdb_vnn *vnn)
+{
+	uint32_t flags;
+	struct vnn_interface *i;
+
+	/* Nodes that are not RUNNING can not host IPs */
+	if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
+		return false;
+	}
+
+	flags = ctdb->nodes[ctdb->pnn]->flags;
+	if ((flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED)) != 0) {
+		return false;
+	}
+
+	if (vnn->delete_pending) {
+		return false;
+	}
+
+	/* fast path: currently assigned interface has link */
+	if (vnn->iface && vnn->iface->link_up) {
+		return true;
+	}
+
+	/* otherwise any candidate with link up will do */
+	for (i = vnn->ifaces; i != NULL; i = i->next) {
+		if (i->iface->link_up) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+struct ctdb_takeover_arp {
+ struct ctdb_context *ctdb;
+ uint32_t count;
+ ctdb_sock_addr addr;
+ struct ctdb_tcp_array *tcparray;
+ struct ctdb_vnn *vnn;
+};
+
+
+/*
+ lists of tcp endpoints
+ */
+struct ctdb_tcp_list {
+ struct ctdb_tcp_list *prev, *next;
+ struct ctdb_client *client;
+ struct ctdb_connection connection;
+};
+
+/*
+ send a gratuitous arp
+ */
+static void ctdb_control_send_arp(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
+ struct ctdb_takeover_arp);
+ int ret;
+ struct ctdb_tcp_array *tcparray;
+ const char *iface;
+
+ /* IP address might have been released between sends */
+ if (arp->vnn->iface == NULL) {
+ DBG_INFO("Cancelling ARP send for released IP %s\n",
+ ctdb_addr_to_str(&arp->vnn->public_address));
+ talloc_free(arp);
+ return;
+ }
+
+ iface = ctdb_vnn_iface_string(arp->vnn);
+ ret = ctdb_sys_send_arp(&arp->addr, iface);
+ if (ret != 0) {
+ DBG_ERR("Failed to send ARP on interface %s: %s\n",
+ iface, strerror(ret));
+ }
+
+ tcparray = arp->tcparray;
+ if (tcparray) {
+ unsigned int i;
+
+ for (i=0;i<tcparray->num;i++) {
+ struct ctdb_connection *tcon;
+ char buf[128];
+
+ tcon = &tcparray->connections[i];
+ ret = ctdb_connection_to_buf(buf,
+ sizeof(buf),
+ tcon,
+ false,
+ " -> ");
+ if (ret != 0) {
+ strlcpy(buf, "UNKNOWN", sizeof(buf));
+ }
+ D_INFO("Send TCP tickle ACK: %s\n", buf);
+ ret = ctdb_sys_send_tcp(
+ &tcon->src,
+ &tcon->dst,
+ 0, 0, 0);
+ if (ret != 0) {
+ DBG_ERR("Failed to send TCP tickle ACK: %s\n",
+ buf);
+ }
+ }
+ }
+
+ arp->count++;
+
+ if (arp->count == CTDB_ARP_REPEAT) {
+ talloc_free(arp);
+ return;
+ }
+
+ tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
+ timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
+ ctdb_control_send_arp, arp);
+}
+
+/* Start announcing a newly taken-over address: schedules the first
+ * gratuitous ARP immediately and hands the VNN's tickle list to the
+ * ARP state so tickle ACKs are sent alongside the ARPs.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
+				       struct ctdb_vnn *vnn)
+{
+	struct ctdb_takeover_arp *arp;
+	struct ctdb_tcp_array *tcparray;
+
+	if (!vnn->takeover_ctx) {
+		vnn->takeover_ctx = talloc_new(vnn);
+		if (!vnn->takeover_ctx) {
+			return -1;
+		}
+	}
+
+	arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
+	if (!arp) {
+		return -1;
+	}
+
+	arp->ctdb = ctdb;
+	arp->addr = vnn->public_address;
+	arp->vnn  = vnn;
+
+	tcparray = vnn->tcp_array;
+	if (tcparray) {
+		/* add all of the known tcp connections for this IP to the
+		   list of tcp connections to send tickle acks for */
+		arp->tcparray = talloc_steal(arp, tcparray);
+
+		/* the list moved to arp, so peers must be re-informed */
+		vnn->tcp_array = NULL;
+		vnn->tcp_update_needed = true;
+	}
+
+	/* zero timeout: first ARP goes out on the next event loop */
+	tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
+			 timeval_zero(), ctdb_control_send_arp, arp);
+
+	return 0;
+}
+
+struct ctdb_do_takeip_state {
+ struct ctdb_req_control_old *c;
+ struct ctdb_vnn *vnn;
+};
+
+/*
+ called when takeip event finishes
+ */
+static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct ctdb_do_takeip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_takeip_state);
+ int32_t ret;
+ TDB_DATA data;
+
+ if (status != 0) {
+ if (status == -ETIMEDOUT) {
+ ctdb_ban_self(ctdb);
+ }
+ DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ ctdb_addr_to_str(&state->vnn->public_address),
+ ctdb_vnn_iface_string(state->vnn)));
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+
+ talloc_free(state);
+ return;
+ }
+
+ if (ctdb->do_checkpublicip) {
+
+ ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+ if (ret != 0) {
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ }
+
+ data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
+ data.dsize = strlen((char *)data.dptr) + 1;
+ DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
+
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
+
+
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+ return;
+}
+
+/* talloc destructor: clear the VNN's in-flight marker when the
+ * takeip state is freed */
+static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
+{
+	struct ctdb_vnn *vnn = state->vnn;
+
+	vnn->update_in_flight = false;
+	return 0;
+}
+
+/*
+ take over an ip address
+ */
+static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ struct ctdb_vnn *vnn)
+{
+ int ret;
+ struct ctdb_do_takeip_state *state;
+
+ if (vnn->update_in_flight) {
+ DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
+ "update for this IP already in flight\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ ret = ctdb_vnn_assign_iface(ctdb, vnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
+ "assign a usable interface\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ state = talloc(vnn, struct ctdb_do_takeip_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = NULL;
+ state->vnn = vnn;
+
+ vnn->update_in_flight = true;
+ talloc_set_destructor(state, ctdb_takeip_destructor);
+
+ DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+
+ ret = ctdb_event_script_callback(ctdb,
+ state,
+ ctdb_do_takeip_callback,
+ state,
+ CTDB_EVENT_TAKE_IP,
+ "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn)));
+ talloc_free(state);
+ return -1;
+ }
+
+ state->c = talloc_steal(ctdb, c);
+ return 0;
+}
+
+struct ctdb_do_updateip_state {
+ struct ctdb_req_control_old *c;
+ struct ctdb_interface *old;
+ struct ctdb_vnn *vnn;
+};
+
+/*
+ called when updateip event finishes
+ */
+static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct ctdb_do_updateip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_updateip_state);
+
+ if (status != 0) {
+ if (status == -ETIMEDOUT) {
+ ctdb_ban_self(ctdb);
+ }
+ DEBUG(DEBUG_ERR,
+ ("Failed update of IP %s from interface %s to %s\n",
+ ctdb_addr_to_str(&state->vnn->public_address),
+ iface_string(state->old),
+ ctdb_vnn_iface_string(state->vnn)));
+
+ /*
+ * All we can do is reset the old interface
+ * and let the next run fix it
+ */
+ ctdb_vnn_unassign_iface(ctdb, state->vnn);
+ state->vnn->iface = state->old;
+ state->vnn->iface->references++;
+
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+ return;
+}
+
+static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
+{
+ state->vnn->update_in_flight = false;
+ return 0;
+}
+
/*
  update (move) an ip address

  Re-evaluates the best interface for the VNN and, if it changed, runs
  the "updateip" eventscript asynchronously.  The reply to 'c' is sent
  from ctdb_do_updateip_callback() unless the update is a no-op.
  Returns 0 on success/no-op, -1 on error.
 */
static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
				struct ctdb_req_control_old *c,
				struct ctdb_vnn *vnn)
{
	int ret;
	struct ctdb_do_updateip_state *state;
	struct ctdb_interface *old = vnn->iface;
	const char *old_name = iface_string(old);
	const char *new_name;

	/* Only one takeip/updateip/releaseip may be in flight per IP */
	if (vnn->update_in_flight) {
		DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
				    "update for this IP already in flight\n",
				    ctdb_addr_to_str(&vnn->public_address),
				    vnn->public_netmask_bits));
		return -1;
	}

	/* Drop the current interface and pick the best available one */
	ctdb_vnn_unassign_iface(ctdb, vnn);
	ret = ctdb_vnn_assign_iface(ctdb, vnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to "
				 "assign a usable interface (old iface '%s')\n",
				 ctdb_addr_to_str(&vnn->public_address),
				 vnn->public_netmask_bits,
				 old_name));
		return -1;
	}

	if (old == vnn->iface) {
		/* A benign update from one interface onto itself.
		 * no need to run the eventscripts in this case, just return
		 * success.
		 */
		ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
		return 0;
	}

	/* state hangs off the VNN so it is cleaned up with it */
	state = talloc(vnn, struct ctdb_do_updateip_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->c = NULL;
	state->old = old;
	state->vnn = vnn;

	vnn->update_in_flight = true;
	/* Destructor clears update_in_flight on any free path */
	talloc_set_destructor(state, ctdb_updateip_destructor);

	new_name = ctdb_vnn_iface_string(vnn);
	DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
			    "interface %s to %s\n",
			    ctdb_addr_to_str(&vnn->public_address),
			    vnn->public_netmask_bits,
			    old_name,
			    new_name));

	/* Run the "updateip" event: <old-iface> <new-iface> <ip> <mask> */
	ret = ctdb_event_script_callback(ctdb,
					 state,
					 ctdb_do_updateip_callback,
					 state,
					 CTDB_EVENT_UPDATE_IP,
					 "%s %s %s %u",
					 old_name,
					 new_name,
					 ctdb_addr_to_str(&vnn->public_address),
					 vnn->public_netmask_bits);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed update IP %s from interface %s to %s\n",
		       ctdb_addr_to_str(&vnn->public_address),
		       old_name, new_name));
		talloc_free(state);
		return -1;
	}

	/* Only take ownership of the control once the event is running;
	 * the reply will be sent from the callback */
	state->c = talloc_steal(ctdb, c);
	return 0;
}
+
+/*
+ Find the vnn of the node that has a public ip address
+ returns -1 if the address is not known as a public address
+ */
+static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+ struct ctdb_vnn *vnn;
+
+ for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+ if (ctdb_same_ip(&vnn->public_address, addr)) {
+ return vnn;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ take over an ip address
+ */
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata,
+ bool *async_reply)
+{
+ int ret;
+ struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
+ struct ctdb_vnn *vnn;
+ bool have_ip = false;
+ bool do_updateip = false;
+ bool do_takeip = false;
+ struct ctdb_interface *best_iface = NULL;
+
+ if (pip->pnn != ctdb->pnn) {
+ DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
+ "with pnn %d, but we're node %d\n",
+ ctdb_addr_to_str(&pip->addr),
+ pip->pnn, ctdb->pnn));
+ return -1;
+ }
+
+ /* update out vnn list */
+ vnn = find_public_ip_vnn(ctdb, &pip->addr);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
+ ctdb_addr_to_str(&pip->addr)));
+ return 0;
+ }
+
+ if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) {
+ have_ip = ctdb_sys_have_ip(&pip->addr);
+ }
+ best_iface = ctdb_vnn_best_iface(ctdb, vnn);
+ if (best_iface == NULL) {
+ DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
+ "a usable interface (old %s, have_ip %d)\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn),
+ have_ip));
+ return -1;
+ }
+
+ if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != CTDB_UNKNOWN_PNN) {
+ DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+ "and we have it on iface[%s], but it was assigned to node %d"
+ "and we are node %d, banning ourself\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
+ ctdb_ban_self(ctdb);
+ return -1;
+ }
+
+ if (vnn->pnn == CTDB_UNKNOWN_PNN && have_ip) {
+ /* This will cause connections to be reset and
+ * reestablished. However, this is a very unusual
+ * situation and doing this will completely repair the
+ * inconsistency in the VNN.
+ */
+ DEBUG(DEBUG_WARNING,
+ (__location__
+ " Doing updateip for IP %s already on an interface\n",
+ ctdb_addr_to_str(&vnn->public_address)));
+ do_updateip = true;
+ }
+
+ if (vnn->iface) {
+ if (vnn->iface != best_iface) {
+ if (!vnn->iface->link_up) {
+ do_updateip = true;
+ } else if (vnn->iface->references > (best_iface->references + 1)) {
+ /* only move when the rebalance gains something */
+ do_updateip = true;
+ }
+ }
+ }
+
+ if (!have_ip) {
+ if (do_updateip) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ do_updateip = false;
+ }
+ do_takeip = true;
+ }
+
+ if (do_takeip) {
+ ret = ctdb_do_takeip(ctdb, c, vnn);
+ if (ret != 0) {
+ return -1;
+ }
+ } else if (do_updateip) {
+ ret = ctdb_do_updateip(ctdb, c, vnn);
+ if (ret != 0) {
+ return -1;
+ }
+ } else {
+ /*
+ * The interface is up and the kernel known the ip
+ * => do nothing
+ */
+ DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
+ ctdb_addr_to_str(&pip->addr),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+ return 0;
+ }
+
+ /* tell ctdb_control.c that we will be replying asynchronously */
+ *async_reply = true;
+
+ return 0;
+}
+
/*
 * Fully remove a VNN that is pending deletion: unlink it from the
 * global list, drop its interface assignment, prune interfaces no
 * longer referenced by any VNN, then free it.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
	DLIST_REMOVE(ctdb->vnn, vnn);
	ctdb_vnn_unassign_iface(ctdb, vnn);
	ctdb_remove_orphaned_ifaces(ctdb, vnn);
	talloc_free(vnn);
}
+
+static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn,
+ ctdb_sock_addr *addr)
+{
+ TDB_DATA data;
+
+ /* Send a message to all clients of this node telling them
+ * that the cluster has been reconfigured and they should
+ * close any connections on this IP address
+ */
+ data.dptr = (uint8_t *)ctdb_addr_to_str(addr);
+ data.dsize = strlen((char *)data.dptr)+1;
+ DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr));
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
+
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+
+ /* Process the IP if it has been marked for deletion */
+ if (vnn->delete_pending) {
+ do_delete_ip(ctdb, vnn);
+ return NULL;
+ }
+
+ return vnn;
+}
+
/*
 * State for an in-flight "releaseip" request, kept alive until the
 * eventscript callback completes.
 */
struct release_ip_callback_state {
	struct ctdb_req_control_old *c;	/* control to reply to; NULL until event started */
	ctdb_sock_addr *addr;		/* copy of the address being released */
	struct ctdb_vnn *vnn;		/* VNN being released; may become NULL if deleted */
	uint32_t target_pnn;		/* node the address is moving to */
};
+
/*
  called when releaseip event finishes
 */
static void release_ip_callback(struct ctdb_context *ctdb, int status,
				void *private_data)
{
	struct release_ip_callback_state *state =
		talloc_get_type(private_data, struct release_ip_callback_state);

	/* A timeout means the eventscripts are wedged; ban ourself */
	if (status == -ETIMEDOUT) {
		ctdb_ban_self(ctdb);
	}

	if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) {
		/* The event ran but the kernel still has the address:
		 * report failure to the requester */
		if (ctdb_sys_have_ip(state->addr)) {
			DEBUG(DEBUG_ERR,
			      ("IP %s still hosted during release IP callback, failing\n",
			       ctdb_addr_to_str(state->addr)));
			ctdb_request_control_reply(ctdb, state->c,
						   NULL, -1, NULL);
			talloc_free(state);
			return;
		}
	}

	/* Record the new owner, then run common post-release handling;
	 * state->vnn becomes NULL if the VNN was deleted */
	state->vnn->pnn = state->target_pnn;
	state->vnn = release_ip_post(ctdb, state->vnn, state->addr);

	/* the control succeeded */
	ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
	talloc_free(state);
}
+
+static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
+{
+ if (state->vnn != NULL) {
+ state->vnn->update_in_flight = false;
+ }
+ return 0;
+}
+
/*
  release an ip address

  Runs the "releaseip" eventscript asynchronously; the reply to 'c' is
  sent from release_ip_callback() and *async_reply is set.  Redundant
  releases (address not held) just update the recorded owner PNN.
  Returns 0 on success, -1 on error.
 */
int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
				struct ctdb_req_control_old *c,
				TDB_DATA indata,
				bool *async_reply)
{
	int ret;
	struct release_ip_callback_state *state;
	struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
	struct ctdb_vnn *vnn;
	const char *iface;

	/* update our vnn list */
	vnn = find_public_ip_vnn(ctdb, &pip->addr);
	if (vnn == NULL) {
		DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
				  ctdb_addr_to_str(&pip->addr)));
		return 0;
	}

	/* stop any previous arps */
	talloc_free(vnn->takeover_ctx);
	vnn->takeover_ctx = NULL;

	/* RELEASE_IP controls are sent to all nodes that should not
	 * be hosting a particular IP. This serves 2 purposes. The
	 * first is to help resolve any inconsistencies. If a node
	 * does unexpectedly host an IP then it will be released. The
	 * 2nd is to use a "redundant release" to tell non-takeover
	 * nodes where an IP is moving to. This is how "ctdb ip" can
	 * report the (likely) location of an IP by only asking the
	 * local node. Redundant releases need to update the PNN but
	 * are otherwise ignored.
	 */
	if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) {
		if (!ctdb_sys_have_ip(&pip->addr)) {
			DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
					   ctdb_addr_to_str(&pip->addr),
					   vnn->public_netmask_bits,
					   ctdb_vnn_iface_string(vnn)));
			vnn->pnn = pip->pnn;
			ctdb_vnn_unassign_iface(ctdb, vnn);
			return 0;
		}
	} else {
		/* Not checking the kernel: treat "no interface assigned"
		 * as "not held" */
		if (vnn->iface == NULL) {
			DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
					   ctdb_addr_to_str(&pip->addr),
					   vnn->public_netmask_bits));
			vnn->pnn = pip->pnn;
			return 0;
		}
	}

	/* There is a potential race between take_ip and us because we
	 * update the VNN via a callback that run when the
	 * eventscripts have been run. Avoid the race by allowing one
	 * update to be in flight at a time.
	 */
	if (vnn->update_in_flight) {
		DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
				    "update for this IP already in flight\n",
				    ctdb_addr_to_str(&vnn->public_address),
				    vnn->public_netmask_bits));
		return -1;
	}

	iface = ctdb_vnn_iface_string(vnn);

	DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
			    ctdb_addr_to_str(&pip->addr),
			    vnn->public_netmask_bits,
			    iface,
			    pip->pnn));

	state = talloc(ctdb, struct release_ip_callback_state);
	if (state == NULL) {
		ctdb_set_error(ctdb, "Out of memory at %s:%d",
			       __FILE__, __LINE__);
		return -1;
	}

	state->c = NULL;
	/* Take a private copy of the address: the indata buffer does
	 * not survive until the callback runs */
	state->addr = talloc(state, ctdb_sock_addr);
	if (state->addr == NULL) {
		ctdb_set_error(ctdb, "Out of memory at %s:%d",
			       __FILE__, __LINE__);
		talloc_free(state);
		return -1;
	}
	*state->addr = pip->addr;
	state->target_pnn = pip->pnn;
	state->vnn = vnn;

	vnn->update_in_flight = true;
	/* Destructor clears update_in_flight on any free path */
	talloc_set_destructor(state, ctdb_releaseip_destructor);

	/* Run the "releaseip" event: <iface> <ip> <mask> */
	ret = ctdb_event_script_callback(ctdb,
					 state, release_ip_callback, state,
					 CTDB_EVENT_RELEASE_IP,
					 "%s %s %u",
					 iface,
					 ctdb_addr_to_str(&pip->addr),
					 vnn->public_netmask_bits);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
				 ctdb_addr_to_str(&pip->addr),
				 ctdb_vnn_iface_string(vnn)));
		talloc_free(state);
		return -1;
	}

	/* tell the control that we will be reply asynchronously */
	*async_reply = true;
	state->c = talloc_steal(state, c);
	return 0;
}
+
+static int ctdb_add_public_address(struct ctdb_context *ctdb,
+ ctdb_sock_addr *addr,
+ unsigned mask, const char *ifaces,
+ bool check_address)
+{
+ struct ctdb_vnn *vnn;
+ char *tmp;
+ const char *iface;
+
+ /* Verify that we don't have an entry for this IP yet */
+ for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+ if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
+ D_ERR("Duplicate public IP address '%s'\n",
+ ctdb_addr_to_str(addr));
+ return -1;
+ }
+ }
+
+ /* Create a new VNN structure for this IP address */
+ vnn = talloc_zero(ctdb, struct ctdb_vnn);
+ if (vnn == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ return -1;
+ }
+ tmp = talloc_strdup(vnn, ifaces);
+ if (tmp == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ talloc_free(vnn);
+ return -1;
+ }
+ for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
+ struct vnn_interface *vnn_iface;
+ struct ctdb_interface *i;
+
+ if (!ctdb_sys_check_iface_exists(iface)) {
+ D_ERR("Unknown interface %s for public address %s\n",
+ iface,
+ ctdb_addr_to_str(addr));
+ talloc_free(vnn);
+ return -1;
+ }
+
+ i = ctdb_add_local_iface(ctdb, iface);
+ if (i == NULL) {
+ D_ERR("Failed to add interface '%s' "
+ "for public address %s\n",
+ iface,
+ ctdb_addr_to_str(addr));
+ talloc_free(vnn);
+ return -1;
+ }
+
+ vnn_iface = talloc_zero(vnn, struct vnn_interface);
+ if (vnn_iface == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ talloc_free(vnn);
+ return -1;
+ }
+
+ vnn_iface->iface = i;
+ DLIST_ADD_END(vnn->ifaces, vnn_iface);
+ }
+ talloc_free(tmp);
+ vnn->public_address = *addr;
+ vnn->public_netmask_bits = mask;
+ vnn->pnn = -1;
+
+ DLIST_ADD(ctdb->vnn, vnn);
+
+ return 0;
+}
+
+/*
+ setup the public address lists from a file
+*/
+int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
+{
+ bool ok;
+ char **lines;
+ int nlines;
+ int i;
+
+ /* If no public addresses file given then try the default */
+ if (ctdb->public_addresses_file == NULL) {
+ const char *b = getenv("CTDB_BASE");
+ if (b == NULL) {
+ DBG_ERR("CTDB_BASE not set\n");
+ return -1;
+ }
+ ctdb->public_addresses_file = talloc_asprintf(
+ ctdb, "%s/%s", b, "public_addresses");
+ if (ctdb->public_addresses_file == NULL) {
+ DBG_ERR("Out of memory\n");
+ return -1;
+ }
+ }
+
+ /* If the file doesn't exist then warn and do nothing */
+ ok = file_exist(ctdb->public_addresses_file);
+ if (!ok) {
+ D_WARNING("Not loading public addresses, no file %s\n",
+ ctdb->public_addresses_file);
+ return 0;
+ }
+
+ lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
+ if (lines == NULL) {
+ ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
+ return -1;
+ }
+ while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
+ nlines--;
+ }
+
+ for (i=0;i<nlines;i++) {
+ unsigned mask;
+ ctdb_sock_addr addr;
+ const char *addrstr;
+ const char *ifaces;
+ char *tok, *line;
+ int ret;
+
+ line = lines[i];
+ while ((*line == ' ') || (*line == '\t')) {
+ line++;
+ }
+ if (*line == '#') {
+ continue;
+ }
+ if (strcmp(line, "") == 0) {
+ continue;
+ }
+ tok = strtok(line, " \t");
+ addrstr = tok;
+
+ tok = strtok(NULL, " \t");
+ if (tok == NULL) {
+ D_ERR("No interface specified at line %u "
+ "of public addresses file\n", i+1);
+ talloc_free(lines);
+ return -1;
+ }
+ ifaces = tok;
+
+ if (addrstr == NULL) {
+ D_ERR("Badly formed line %u in public address list\n",
+ i+1);
+ talloc_free(lines);
+ return -1;
+ }
+
+ ret = ctdb_sock_addr_mask_from_string(addrstr, &addr, &mask);
+ if (ret != 0) {
+ D_ERR("Badly formed line %u in public address list\n",
+ i+1);
+ talloc_free(lines);
+ return -1;
+ }
+
+ if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
+ DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
+ talloc_free(lines);
+ return -1;
+ }
+ }
+
+
+ D_NOTICE("Loaded public addresses from %s\n",
+ ctdb->public_addresses_file);
+
+ talloc_free(lines);
+ return 0;
+}
+
+/*
+ destroy a ctdb_tcp_list structure
+ */
+static int ctdb_tcp_list_destructor(struct ctdb_tcp_list *tcp)
+{
+ struct ctdb_client *client = tcp->client;
+ struct ctdb_connection *conn = &tcp->connection;
+ char conn_str[132] = { 0, };
+ int ret;
+
+ ret = ctdb_connection_to_buf(conn_str,
+ sizeof(conn_str),
+ conn,
+ false,
+ " -> ");
+ if (ret != 0) {
+ strlcpy(conn_str, "UNKNOWN", sizeof(conn_str));
+ }
+
+ D_DEBUG("removing client TCP connection %s "
+ "(client_id %u pid %d)\n",
+ conn_str, client->client_id, client->pid);
+
+ DLIST_REMOVE(client->tcp_list, tcp);
+
+ /*
+ * We don't call ctdb_remove_connection(vnn, conn) here
+ * as we want the caller to decide if it's called
+ * directly (local only) or indirectly via a
+ * CTDB_CONTROL_TCP_REMOVE broadcast
+ */
+
+ return 0;
+}
+
/*
  called by a client to inform us of a TCP connection that it is managing
  that should be tickled with an ACK when IP takeover is done

  Records the connection on the client, then broadcasts TCP_ADD so all
  nodes learn of it.  Returns 0 on success/ignore, -1 on error (which
  tells the client to exit).
 */
int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
				TDB_DATA indata)
{
	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
	struct ctdb_connection *tcp_sock = NULL;
	struct ctdb_tcp_list *tcp;
	struct ctdb_connection t;
	int ret;
	TDB_DATA data;
	struct ctdb_vnn *vnn;
	char conn_str[132] = { 0, };

	/* If we don't have public IPs, tickles are useless */
	if (ctdb->vnn == NULL) {
		return 0;
	}

	tcp_sock = (struct ctdb_connection *)indata.dptr;

	/* Normalise addresses (e.g. IPv4-mapped IPv6) before lookups */
	ctdb_canonicalize_ip_inplace(&tcp_sock->src);
	ctdb_canonicalize_ip_inplace(&tcp_sock->dst);

	/* Pretty-print the connection once for all log messages below */
	ret = ctdb_connection_to_buf(conn_str,
				     sizeof(conn_str),
				     tcp_sock,
				     false,
				     " -> ");
	if (ret != 0) {
		strlcpy(conn_str, "UNKNOWN", sizeof(conn_str));
	}

	vnn = find_public_ip_vnn(ctdb, &tcp_sock->dst);
	if (vnn == NULL) {
		D_ERR("Could not register TCP connection %s - "
		      "not a public address (client_id %u pid %u)\n",
		      conn_str, client_id, client->pid);
		return 0;
	}

	if (vnn->pnn != ctdb->pnn) {
		D_ERR("Attempt to register tcp client for IP %s we don't hold - "
		      "failing (client_id %u pid %u)\n",
		      ctdb_addr_to_str(&tcp_sock->dst),
		      client_id, client->pid);
		/* failing this call will tell smbd to die */
		return -1;
	}

	/* Track the connection on the client; the destructor unlinks
	 * it from client->tcp_list when freed */
	tcp = talloc(client, struct ctdb_tcp_list);
	CTDB_NO_MEMORY(ctdb, tcp);
	tcp->client = client;

	tcp->connection.src = tcp_sock->src;
	tcp->connection.dst = tcp_sock->dst;

	DLIST_ADD(client->tcp_list, tcp);
	talloc_set_destructor(tcp, ctdb_tcp_list_destructor);

	t.src = tcp_sock->src;
	t.dst = tcp_sock->dst;

	data.dptr = (uint8_t *)&t;
	data.dsize = sizeof(t);

	D_INFO("Registered TCP connection %s (client_id %u pid %u)\n",
	       conn_str, client_id, client->pid);

	/* tell all nodes about this tcp connection */
	ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
				       CTDB_CONTROL_TCP_ADD,
				       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
		return -1;
	}

	return 0;
}
+
+static bool ctdb_client_remove_tcp(struct ctdb_client *client,
+ const struct ctdb_connection *conn)
+{
+ struct ctdb_tcp_list *tcp = NULL;
+ struct ctdb_tcp_list *tcp_next = NULL;
+ bool found = false;
+
+ for (tcp = client->tcp_list; tcp != NULL; tcp = tcp_next) {
+ bool same;
+
+ tcp_next = tcp->next;
+
+ same = ctdb_connection_same(conn, &tcp->connection);
+ if (!same) {
+ continue;
+ }
+
+ TALLOC_FREE(tcp);
+ found = true;
+ }
+
+ return found;
+}
+
/*
  called by a client to inform us of a TCP connection that was disconnected

  Drops the local tracking entry and broadcasts TCP_REMOVE so all
  nodes drop it from their tickle databases.  Returns 0 on
  success/not-found, -1 on broadcast failure.
 */
int32_t ctdb_control_tcp_client_disconnected(struct ctdb_context *ctdb,
					     uint32_t client_id,
					     TDB_DATA indata)
{
	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
	struct ctdb_connection *tcp_sock = NULL;
	int ret;
	TDB_DATA data;
	char conn_str[132] = { 0, };
	bool found = false;

	tcp_sock = (struct ctdb_connection *)indata.dptr;

	/* Normalise addresses before comparing against tracked entries */
	ctdb_canonicalize_ip_inplace(&tcp_sock->src);
	ctdb_canonicalize_ip_inplace(&tcp_sock->dst);

	ret = ctdb_connection_to_buf(conn_str,
				     sizeof(conn_str),
				     tcp_sock,
				     false,
				     " -> ");
	if (ret != 0) {
		strlcpy(conn_str, "UNKNOWN", sizeof(conn_str));
	}

	found = ctdb_client_remove_tcp(client, tcp_sock);
	if (!found) {
		DBG_DEBUG("TCP connection %s not found "
			  "(client_id %u pid %u).\n",
			  conn_str, client_id, client->pid);
		return 0;
	}

	D_INFO("deregistered TCP connection %s "
	       "(client_id %u pid %u)\n",
	       conn_str, client_id, client->pid);

	data.dptr = (uint8_t *)tcp_sock;
	data.dsize = sizeof(*tcp_sock);

	/* tell all nodes about this tcp connection is gone */
	ret = ctdb_daemon_send_control(ctdb,
				       CTDB_BROADCAST_CONNECTED,
				       0,
				       CTDB_CONTROL_TCP_REMOVE,
				       0,
				       CTDB_CTRL_FLAG_NOREPLY,
				       data,
				       NULL,
				       NULL);
	if (ret != 0) {
		DBG_ERR("Failed to send CTDB_CONTROL_TCP_REMOVE: %s\n",
			conn_str);
		return -1;
	}

	return 0;
}
+
/*
  called by a client to inform us of a TCP connection was passed to a different
  "client" (typically with multichannel to another smbd process).

  Only the local tracking entry is dropped; the connection itself stays
  alive under the new owner, so no TCP_REMOVE broadcast is sent.
 */
int32_t ctdb_control_tcp_client_passed(struct ctdb_context *ctdb,
				       uint32_t client_id,
				       TDB_DATA indata)
{
	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
	struct ctdb_connection *tcp_sock = NULL;
	int ret;
	char conn_str[132] = { 0, };
	bool found = false;

	tcp_sock = (struct ctdb_connection *)indata.dptr;

	/* Normalise addresses before comparing against tracked entries */
	ctdb_canonicalize_ip_inplace(&tcp_sock->src);
	ctdb_canonicalize_ip_inplace(&tcp_sock->dst);

	ret = ctdb_connection_to_buf(conn_str,
				     sizeof(conn_str),
				     tcp_sock,
				     false,
				     " -> ");
	if (ret != 0) {
		strlcpy(conn_str, "UNKNOWN", sizeof(conn_str));
	}

	found = ctdb_client_remove_tcp(client, tcp_sock);
	if (!found) {
		DBG_DEBUG("TCP connection from %s not found "
			  "(client_id %u pid %u).\n",
			  conn_str, client_id, client->pid);
		return 0;
	}

	D_INFO("TCP connection from %s "
	       "(client_id %u pid %u) passed to another client\n",
	       conn_str, client_id, client->pid);

	/*
	 * We don't call CTDB_CONTROL_TCP_REMOVE
	 * nor ctdb_remove_connection() as the connection
	 * is still alive, but handled by another client
	 */

	return 0;
}
+
+/*
+ find a tcp address on a list
+ */
+static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
+ struct ctdb_connection *tcp)
+{
+ unsigned int i;
+
+ if (array == NULL) {
+ return NULL;
+ }
+
+ for (i=0;i<array->num;i++) {
+ if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
+ ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
+ return &array->connections[i];
+ }
+ }
+ return NULL;
+}
+
+
+
/*
  called by a daemon to inform us of a TCP connection that one of its
  clients managing that should tickled with an ACK when IP takeover is
  done

  Adds the connection to the tickle array of the VNN owning its
  destination address (ignoring duplicates).  tcp_update_needed marks
  the VNN so the tickle list is pushed to other nodes.
 */
int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
{
	struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
	struct ctdb_tcp_array *tcparray;
	struct ctdb_connection tcp;
	struct ctdb_vnn *vnn;

	/* If we don't have public IPs, tickles are useless */
	if (ctdb->vnn == NULL) {
		return 0;
	}

	vnn = find_public_ip_vnn(ctdb, &p->dst);
	if (vnn == NULL) {
		DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
			ctdb_addr_to_str(&p->dst)));

		return -1;
	}


	tcparray = vnn->tcp_array;

	/* If this is the first tickle */
	if (tcparray == NULL) {
		/* Allocate array under the VNN so it is freed with it */
		tcparray = talloc(vnn, struct ctdb_tcp_array);
		CTDB_NO_MEMORY(ctdb, tcparray);
		vnn->tcp_array = tcparray;

		tcparray->num = 0;
		tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
		CTDB_NO_MEMORY(ctdb, tcparray->connections);

		tcparray->connections[tcparray->num].src = p->src;
		tcparray->connections[tcparray->num].dst = p->dst;
		tcparray->num++;

		if (tcp_update_needed) {
			vnn->tcp_update_needed = true;
		}
		return 0;
	}


	/* Do we already have this tickle ?*/
	tcp.src = p->src;
	tcp.dst = p->dst;
	if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
		DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
			ctdb_addr_to_str(&tcp.dst),
			ntohs(tcp.dst.ip.sin_port),
			vnn->pnn));
		return 0;
	}

	/* A new tickle, we must add it to the array */
	tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
					struct ctdb_connection,
					tcparray->num+1);
	CTDB_NO_MEMORY(ctdb, tcparray->connections);

	tcparray->connections[tcparray->num].src = p->src;
	tcparray->connections[tcparray->num].dst = p->dst;
	tcparray->num++;

	DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
		ctdb_addr_to_str(&tcp.dst),
		ntohs(tcp.dst.ip.sin_port),
		vnn->pnn));

	if (tcp_update_needed) {
		vnn->tcp_update_needed = true;
	}

	return 0;
}
+
+
/*
 * Remove a single connection from a VNN's tickle array (no-op if the
 * VNN, the array or the connection is absent).  Marks the VNN so the
 * updated tickle list is pushed to the other nodes.
 */
static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
{
	struct ctdb_connection *tcpp;

	if (vnn == NULL) {
		return;
	}

	/* if the array is empty we can't remove it
	   and we don't need to do anything
	 */
	if (vnn->tcp_array == NULL) {
		DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist (array is empty) %s:%u\n",
			ctdb_addr_to_str(&conn->dst),
			ntohs(conn->dst.ip.sin_port)));
		return;
	}


	/* See if we know this connection
	   if we don't know this connection then we don't need to do anything
	 */
	tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
	if (tcpp == NULL) {
		DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist %s:%u\n",
			ctdb_addr_to_str(&conn->dst),
			ntohs(conn->dst.ip.sin_port)));
		return;
	}


	/* We need to remove this entry from the array.
	   Instead of allocating a new array and copying data to it
	   we cheat and just copy the last entry in the existing array
	   over the entry that is to be removed and just shrink the
	   ->num field
	 */
	*tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
	vnn->tcp_array->num--;

	/* If we deleted the last entry we also need to remove the entire array
	 */
	if (vnn->tcp_array->num == 0) {
		talloc_free(vnn->tcp_array);
		vnn->tcp_array = NULL;
	}

	vnn->tcp_update_needed = true;

	DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
		ctdb_addr_to_str(&conn->src),
		ntohs(conn->src.ip.sin_port)));
}
+
+
+/*
+ called by a daemon to inform us of a TCP connection that one of its
+ clients used are no longer needed in the tickle database
+ */
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_vnn *vnn;
+ struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
+
+ /* If we don't have public IPs, tickles are useless */
+ if (ctdb->vnn == NULL) {
+ return 0;
+ }
+
+ vnn = find_public_ip_vnn(ctdb, &conn->dst);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " unable to find public address %s\n",
+ ctdb_addr_to_str(&conn->dst)));
+ return 0;
+ }
+
+ ctdb_remove_connection(vnn, conn);
+
+ return 0;
+}
+
+
+static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb,
+ bool force);
+
/*
  Called when another daemon starts - causes all tickles for all
  public addresses we are serving to be sent to the new node on the
  next check.  This actually causes the tickles to be sent to the
  other node immediately.  In case there is an error, the periodic
  timer will send the updates on timer event.  This is simple and
  doesn't require careful error handling.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
{
	DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
			   (unsigned long) pnn));

	/* force=true: resend even if nothing changed locally */
	ctdb_send_set_tcp_tickles_for_all(ctdb, true);
	return 0;
}
+
+
/*
  called when a client structure goes away - hook to remove
  elements from the tcp_list in all daemons
 */
void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
{
	/* Each TALLOC_FREE() below unlinks the head entry via its
	 * destructor, so loop until the list is empty */
	while (client->tcp_list) {
		struct ctdb_vnn *vnn;
		struct ctdb_tcp_list *tcp = client->tcp_list;
		struct ctdb_connection *conn = &tcp->connection;

		vnn = find_public_ip_vnn(client->ctdb,
					 &conn->dst);

		/* If the IP address is hosted on this node then
		 * remove the connection. */
		if (vnn != NULL && vnn->pnn == client->ctdb->pnn) {
			ctdb_remove_connection(vnn, conn);
		}

		/* Otherwise this function has been called because the
		 * server IP address has been released to another node
		 * and the client has exited.  This means that we
		 * should not delete the connection information.  The
		 * takeover node processes connections too. */

		/*
		 * The destructor removes from the list
		 */
		TALLOC_FREE(tcp);
	}
}
+
+
/*
 * Release every public IP address this node currently holds in the
 * kernel, running the "releaseip" event synchronously for each.  Used
 * e.g. on shutdown/ban.  No-op when failover is disabled.
 */
void ctdb_release_all_ips(struct ctdb_context *ctdb)
{
	struct ctdb_vnn *vnn, *next;
	int count = 0;

	if (ctdb_config.failover_disabled == 1) {
		return;
	}

	for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
		/* vnn can be freed below in release_ip_post() */
		next = vnn->next;

		if (!ctdb_sys_have_ip(&vnn->public_address)) {
			ctdb_vnn_unassign_iface(ctdb, vnn);
			continue;
		}

		/* Don't allow multiple releases at once.  Some code,
		 * particularly ctdb_tickle_sentenced_connections() is
		 * not re-entrant */
		if (vnn->update_in_flight) {
			DEBUG(DEBUG_WARNING,
			      (__location__
			       " Not releasing IP %s/%u on interface %s, an update is already in progress\n",
			       ctdb_addr_to_str(&vnn->public_address),
			       vnn->public_netmask_bits,
			       ctdb_vnn_iface_string(vnn)));
			continue;
		}
		vnn->update_in_flight = true;

		DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
				  ctdb_addr_to_str(&vnn->public_address),
				  vnn->public_netmask_bits,
				  ctdb_vnn_iface_string(vnn)));

		/* Synchronous "releaseip" event: <iface> <ip> <mask> */
		ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
				       ctdb_vnn_iface_string(vnn),
				       ctdb_addr_to_str(&vnn->public_address),
				       vnn->public_netmask_bits);
		/* releaseip timeouts are converted to success, so to
		 * detect failures just check if the IP address is
		 * still there...
		 */
		if (ctdb_sys_have_ip(&vnn->public_address)) {
			DEBUG(DEBUG_ERR,
			      (__location__
			       " IP address %s not released\n",
			       ctdb_addr_to_str(&vnn->public_address)));
			vnn->update_in_flight = false;
			continue;
		}

		/* release_ip_post() returns NULL if the VNN was deleted */
		vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
		if (vnn != NULL) {
			vnn->update_in_flight = false;
		}
		count++;
	}

	DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
}
+
+
/*
  get list of public IPs

  Marshals all known public addresses (optionally only those currently
  available on this node) into outdata for the requesting client.
 */
int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
				    struct ctdb_req_control_old *c, TDB_DATA *outdata)
{
	int i, num, len;
	struct ctdb_public_ip_list_old *ips;
	struct ctdb_vnn *vnn;
	bool only_available = false;

	if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
		only_available = true;
	}

	/* count how many public ip structures we have */
	num = 0;
	for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
		num++;
	}

	/* Allocate for the worst case; the length is corrected below
	 * once filtering is applied */
	len = offsetof(struct ctdb_public_ip_list_old, ips) +
		num*sizeof(struct ctdb_public_ip);
	ips = talloc_zero_size(outdata, len);
	CTDB_NO_MEMORY(ctdb, ips);

	i = 0;
	for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
		if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
			continue;
		}
		ips->ips[i].pnn  = vnn->pnn;
		ips->ips[i].addr = vnn->public_address;
		i++;
	}
	ips->num = i;
	len = offsetof(struct ctdb_public_ip_list_old, ips) +
		i*sizeof(struct ctdb_public_ip);

	outdata->dsize = len;
	outdata->dptr  = (uint8_t *)ips;

	return 0;
}
+
+
/*
 * Control handler: return details for one public IP - its owner PNN,
 * the list of candidate interfaces with their link state and
 * reference counts, and which interface is currently active
 * (active_idx is 0xFFFFFFFF when none is assigned).
 */
int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
					struct ctdb_req_control_old *c,
					TDB_DATA indata,
					TDB_DATA *outdata)
{
	int i, num, len;
	ctdb_sock_addr *addr;
	struct ctdb_public_ip_info_old *info;
	struct ctdb_vnn *vnn;
	struct vnn_interface *iface;

	addr = (ctdb_sock_addr *)indata.dptr;

	vnn = find_public_ip_vnn(ctdb, addr);
	if (vnn == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
				 "'%s'not a public address\n",
				 ctdb_addr_to_str(addr)));
		return -1;
	}

	/* count how many public ip structures we have */
	num = 0;
	for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
		num++;
	}

	len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
		num*sizeof(struct ctdb_iface);
	info = talloc_zero_size(outdata, len);
	CTDB_NO_MEMORY(ctdb, info);

	info->ip.addr = vnn->public_address;
	info->ip.pnn = vnn->pnn;
	info->active_idx = 0xFFFFFFFF;	/* sentinel: no interface assigned */

	i = 0;
	for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
		struct ctdb_interface *cur;

		cur = iface->iface;
		if (vnn->iface == cur) {
			info->active_idx = i;
		}
		/* Manual NUL-termination after strncpy (which does not
		 * guarantee it) */
		strncpy(info->ifaces[i].name, cur->name,
			sizeof(info->ifaces[i].name));
		info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0';
		info->ifaces[i].link_state = cur->link_up;
		info->ifaces[i].references = cur->references;

		i++;
	}
	info->num = i;
	len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
		i*sizeof(struct ctdb_iface);

	outdata->dsize = len;
	outdata->dptr  = (uint8_t *)info;

	return 0;
}
+
+/*
+ * CTDB_CONTROL_GET_IFACES: return the list of interfaces known to this
+ * node as a ctdb_iface_list_old in outdata, including each interface's
+ * link state and reference count.  Returns 0 on success.
+ */
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA *outdata)
+{
+ int i, num, len;
+ struct ctdb_iface_list_old *ifaces;
+ struct ctdb_interface *cur;
+
+ /* count how many interface structures we have */
+ num = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ num++;
+ }
+
+ len = offsetof(struct ctdb_iface_list_old, ifaces) +
+ num*sizeof(struct ctdb_iface);
+ ifaces = talloc_zero_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, ifaces);
+
+ i = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ /* strncpy may leave the buffer unterminated; force
+ * NUL-termination below */
+ strncpy(ifaces->ifaces[i].name, cur->name,
+ sizeof(ifaces->ifaces[i].name));
+ ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0';
+ ifaces->ifaces[i].link_state = cur->link_up;
+ ifaces->ifaces[i].references = cur->references;
+ i++;
+ }
+ ifaces->num = i;
+ /* recompute the size for the entries actually filled in */
+ len = offsetof(struct ctdb_iface_list_old, ifaces) +
+ i*sizeof(struct ctdb_iface);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)ifaces;
+
+ return 0;
+}
+
+/*
+ * CTDB_CONTROL_SET_IFACE_LINK_STATE: update the cached link state of a
+ * named interface.  indata holds a ctdb_iface; link_state must be 0
+ * (down) or 1 (up) and references must be 0.  Returns 0 on success
+ * (including the no-change case), -1 on malformed input or unknown
+ * interface.
+ */
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata)
+{
+ struct ctdb_iface *info;
+ struct ctdb_interface *iface;
+ bool link_up = false;
+
+ info = (struct ctdb_iface *)indata.dptr;
+
+ /* Reject an unterminated name rather than reading past it.
+ * NOTE(review): assumes name[] has at least CTDB_IFACE_SIZE+1
+ * bytes - confirm against the protocol definition. */
+ if (info->name[CTDB_IFACE_SIZE] != '\0') {
+ int len = strnlen(info->name, CTDB_IFACE_SIZE);
+ DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
+ len, len, info->name));
+ return -1;
+ }
+
+ switch (info->link_state) {
+ case 0:
+ link_up = false;
+ break;
+ case 1:
+ link_up = true;
+ break;
+ default:
+ DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
+ (unsigned int)info->link_state));
+ return -1;
+ }
+
+ /* references is output-only in this structure; a non-zero value
+ * indicates a malformed request */
+ if (info->references != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
+ (unsigned int)info->references));
+ return -1;
+ }
+
+ iface = ctdb_find_iface(ctdb, info->name);
+ if (iface == NULL) {
+ return -1;
+ }
+
+ /* no change - nothing to do or log */
+ if (link_up == iface->link_up) {
+ return 0;
+ }
+
+ DEBUG(DEBUG_ERR,
+ ("iface[%s] has changed it's link status %s => %s\n",
+ iface->name,
+ iface->link_up?"up":"down",
+ link_up?"up":"down"));
+
+ iface->link_up = link_up;
+ return 0;
+}
+
+
+/*
+ called by a daemon to inform us of the entire list of TCP tickles for
+ a particular public address.
+ this control should only be sent by the node that is currently serving
+ that public address.
+ Returns 0 on success, 1 if the address is not a public address on this
+ node, -1 on malformed input.
+ */
+int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
+ struct ctdb_tcp_array *tcparray;
+ struct ctdb_vnn *vnn;
+
+ /* We must at least have tickles.num or else we can't verify the size
+ of the received data blob
+ */
+ if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
+ DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
+ return -1;
+ }
+
+ /* verify that the size of data matches what we expect */
+ if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
+ + sizeof(struct ctdb_connection) * list->num) {
+ DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
+ return -1;
+ }
+
+ DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
+ ctdb_addr_to_str(&list->addr)));
+
+ vnn = find_public_ip_vnn(ctdb, &list->addr);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
+ ctdb_addr_to_str(&list->addr)));
+
+ return 1;
+ }
+
+ /* the hosting node is the source of truth; ignore updates
+ * about addresses we host ourselves */
+ if (vnn->pnn == ctdb->pnn) {
+ DEBUG(DEBUG_INFO,
+ ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
+ ctdb_addr_to_str(&list->addr)));
+ return 0;
+ }
+
+ /* remove any old ticklelist we might have */
+ talloc_free(vnn->tcp_array);
+ vnn->tcp_array = NULL;
+
+ tcparray = talloc(vnn, struct ctdb_tcp_array);
+ CTDB_NO_MEMORY(ctdb, tcparray);
+
+ tcparray->num = list->num;
+
+ tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
+ CTDB_NO_MEMORY(ctdb, tcparray->connections);
+
+ memcpy(tcparray->connections, &list->connections[0],
+ sizeof(struct ctdb_connection)*tcparray->num);
+
+ /* We now have a new fresh tickle list array for this vnn */
+ vnn->tcp_array = tcparray;
+
+ return 0;
+}
+
+/*
+ called to return the full list of tickles for the public address
+ associated with the provided vnn.
+ indata holds the address; the port part selects a single destination
+ port, or 0 for all connections.  Returns 0 on success, 1 if the
+ address is not a public address.
+ */
+int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+ ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
+ struct ctdb_tickle_list_old *list;
+ struct ctdb_tcp_array *tcparray;
+ unsigned int num, i;
+ struct ctdb_vnn *vnn;
+ unsigned port;
+
+ vnn = find_public_ip_vnn(ctdb, addr);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
+ ctdb_addr_to_str(addr)));
+
+ return 1;
+ }
+
+ port = ctdb_addr_to_port(addr);
+
+ tcparray = vnn->tcp_array;
+ num = 0;
+ if (tcparray != NULL) {
+ if (port == 0) {
+ /* All connections */
+ num = tcparray->num;
+ } else {
+ /* Count connections for port */
+ for (i = 0; i < tcparray->num; i++) {
+ if (port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
+ num++;
+ }
+ }
+ }
+ }
+
+ outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
+ + sizeof(struct ctdb_connection) * num;
+
+ outdata->dptr = talloc_size(outdata, outdata->dsize);
+ CTDB_NO_MEMORY(ctdb, outdata->dptr);
+ list = (struct ctdb_tickle_list_old *)outdata->dptr;
+
+ list->addr = *addr;
+ list->num = num;
+
+ if (num == 0) {
+ return 0;
+ }
+
+ /* second pass: copy the matching connections counted above */
+ num = 0;
+ for (i = 0; i < tcparray->num; i++) {
+ if (port == 0 || \
+ port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
+ list->connections[num] = tcparray->connections[i];
+ num++;
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ set the list of all tcp tickles for a public address.
+ Broadcasts a SET_TCP_TICKLE_LIST control (no reply expected) carrying
+ the current tickle array (possibly empty) for addr to all nodes.
+ Returns 0 on success, -1 if the broadcast could not be sent.
+ */
+static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
+ ctdb_sock_addr *addr,
+ struct ctdb_tcp_array *tcparray)
+{
+ int ret, num;
+ TDB_DATA data;
+ struct ctdb_tickle_list_old *list;
+
+ /* tcparray may be NULL, meaning an empty tickle list */
+ if (tcparray) {
+ num = tcparray->num;
+ } else {
+ num = 0;
+ }
+
+ data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
+ sizeof(struct ctdb_connection) * num;
+ data.dptr = talloc_size(ctdb, data.dsize);
+ CTDB_NO_MEMORY(ctdb, data.dptr);
+
+ list = (struct ctdb_tickle_list_old *)data.dptr;
+ list->addr = *addr;
+ list->num = num;
+ if (tcparray) {
+ memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
+ }
+
+ ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
+ CTDB_CONTROL_SET_TCP_TICKLE_LIST,
+ 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
+ return -1;
+ }
+
+ talloc_free(data.dptr);
+
+ return ret;
+}
+
+/*
+ * Send tickle list updates for every public address this node hosts.
+ * If force is false only addresses flagged tcp_update_needed are sent;
+ * on failure the flag is left set so the send is retried on the next
+ * periodic update.
+ */
+static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb,
+ bool force)
+{
+ struct ctdb_vnn *vnn;
+ int ret;
+
+ for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+ /* we only send out updates for public addresses that
+ we have taken over
+ */
+ if (ctdb->pnn != vnn->pnn) {
+ continue;
+ }
+
+ /* We only send out the updates if we need to */
+ if (!force && !vnn->tcp_update_needed) {
+ continue;
+ }
+
+ ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
+ &vnn->public_address,
+ vnn->tcp_array);
+ if (ret != 0) {
+ D_ERR("Failed to send the tickle update for ip %s\n",
+ ctdb_addr_to_str(&vnn->public_address));
+ vnn->tcp_update_needed = true;
+ } else {
+ D_INFO("Sent tickle update for ip %s\n",
+ ctdb_addr_to_str(&vnn->public_address));
+ vnn->tcp_update_needed = false;
+ }
+ }
+
+}
+
+/*
+ perform tickle updates if required.
+ Timer callback: sends any pending tickle updates and reschedules
+ itself at tunable.tickle_update_interval seconds.
+ */
+static void ctdb_update_tcp_tickles(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(
+ private_data, struct ctdb_context);
+
+ /* not forced - only addresses with tcp_update_needed set */
+ ctdb_send_set_tcp_tickles_for_all(ctdb, false);
+
+ tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
+ timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
+ ctdb_update_tcp_tickles, ctdb);
+}
+
+/*
+ start periodic update of tcp tickles.
+ The tickle_update_context owns the timer so the whole update cycle
+ can be cancelled by freeing it.
+ */
+void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
+{
+ /* NOTE(review): talloc_new() result is not checked here - a NULL
+ * context would make tevent_add_timer allocate off NULL */
+ ctdb->tickle_update_context = talloc_new(ctdb);
+
+ tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
+ timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
+ ctdb_update_tcp_tickles, ctdb);
+}
+
+
+
+
+/* State for one repeating gratuitous ARP send, owned by the timer
+ * callback and freed after CTDB_ARP_REPEAT sends. */
+struct control_gratious_arp {
+ struct ctdb_context *ctdb;
+ ctdb_sock_addr addr; /* address to advertise */
+ const char *iface; /* interface to send on */
+ int count; /* sends completed so far */
+};
+
+/*
+ timer callback: send one gratuitous ARP and reschedule itself every
+ CTDB_ARP_INTERVAL seconds until CTDB_ARP_REPEAT sends have been done,
+ then free the state.
+ */
+static void send_gratious_arp(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ int ret;
+ struct control_gratious_arp *arp = talloc_get_type(private_data,
+ struct control_gratious_arp);
+
+ /* failure is logged but does not stop the retry cycle */
+ ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
+ if (ret != 0) {
+ DBG_ERR("Failed to send gratuitous ARP on iface %s: %s\n",
+ arp->iface, strerror(ret));
+ }
+
+
+ arp->count++;
+ if (arp->count == CTDB_ARP_REPEAT) {
+ talloc_free(arp);
+ return;
+ }
+
+ tevent_add_timer(arp->ctdb->ev, arp,
+ timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
+ send_gratious_arp, arp);
+}
+
+
+/*
+ CTDB_CONTROL_SEND_GRATIOUS_ARP: start a repeating gratuitous ARP for
+ the address/interface carried in indata (a ctdb_addr_info_old with a
+ length-prefixed interface name).  Returns 0 on success, -1 on
+ malformed input.
+ */
+int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
+ struct control_gratious_arp *arp;
+
+ /* verify the size of indata before touching gratious_arp->len */
+ if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
+ DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
+ (unsigned)indata.dsize,
+ (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
+ return -1;
+ }
+ /* the blob must be exactly header + declared iface name length */
+ if (indata.dsize !=
+ ( offsetof(struct ctdb_addr_info_old, iface)
+ + gratious_arp->len ) ){
+
+ DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+ "but should be %u bytes\n",
+ (unsigned)indata.dsize,
+ (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
+ return -1;
+ }
+
+
+ arp = talloc(ctdb, struct control_gratious_arp);
+ CTDB_NO_MEMORY(ctdb, arp);
+
+ arp->ctdb = ctdb;
+ arp->addr = gratious_arp->addr;
+ arp->iface = talloc_strdup(arp, gratious_arp->iface);
+ CTDB_NO_MEMORY(ctdb, arp->iface);
+ arp->count = 0;
+
+ /* first send happens immediately; repeats are scheduled by the
+ * callback itself */
+ tevent_add_timer(arp->ctdb->ev, arp,
+ timeval_zero(), send_gratious_arp, arp);
+
+ return 0;
+}
+
+/*
+ * CTDB_CONTROL_ADD_PUBLIC_IP: add a public address (with netmask and
+ * interface list) to this node's configuration.  indata holds a
+ * ctdb_addr_info_old with a length-prefixed interface string.
+ * Returns 0 on success, -1 on malformed input or add failure.
+ */
+int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
+ int ret;
+
+ /* verify the size of indata before touching pub->len */
+ if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
+ DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
+ return -1;
+ }
+ if (indata.dsize !=
+ ( offsetof(struct ctdb_addr_info_old, iface)
+ + pub->len ) ){
+
+ DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+ "but should be %u bytes\n",
+ (unsigned)indata.dsize,
+ (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
+ return -1;
+ }
+
+ DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+ ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * CTDB_CONTROL_DEL_PUBLIC_IP: remove a public address from this node's
+ * configuration.  If the address is currently hosted here the deletion
+ * is deferred until the next takeover run; otherwise it is removed
+ * immediately.  Returns 0 on success, -1 on malformed input or if the
+ * address is unknown.
+ */
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
+ struct ctdb_vnn *vnn;
+
+ /* verify the size of indata before touching pub->len */
+ if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
+ DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
+ return -1;
+ }
+ if (indata.dsize !=
+ ( offsetof(struct ctdb_addr_info_old, iface)
+ + pub->len ) ){
+
+ DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+ "but should be %u bytes\n",
+ (unsigned)indata.dsize,
+ (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
+ return -1;
+ }
+
+ DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+ /* walk over all public addresses until we find a match */
+ for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+ if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
+ if (vnn->pnn == ctdb->pnn) {
+ /* This IP is currently being hosted.
+ * Defer the deletion until the next
+ * takeover run. "ctdb reloadips" will
+ * always cause a takeover run. "ctdb
+ * delip" will now need an explicit
+ * "ctdb ipreallocated" afterwards. */
+ vnn->delete_pending = true;
+ } else {
+ /* This IP is not hosted on the
+ * current node so just delete it
+ * now. */
+ do_delete_ip(ctdb, vnn);
+ }
+
+ return 0;
+ }
+ }
+
+ DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
+ ctdb_addr_to_str(&pub->addr)));
+ return -1;
+}
+
+
+/* Holds the deferred control request so the "ipreallocated" event
+ * callback can send the reply once the event script completes. */
+struct ipreallocated_callback_state {
+ struct ctdb_req_control_old *c;
+};
+
+/*
+ * Completion callback for the "ipreallocated" event script: ban
+ * ourselves on timeout, broadcast CTDB_SRVID_IPREALLOCATED locally and
+ * send the deferred control reply with the script's status.
+ */
+static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
+ int status, void *p)
+{
+ struct ipreallocated_callback_state *state =
+ talloc_get_type(p, struct ipreallocated_callback_state);
+ TDB_DATA data = { .dsize = 0, };
+
+ if (status != 0) {
+ DEBUG(DEBUG_ERR,
+ (" \"ipreallocated\" event script failed (status %d)\n",
+ status));
+ /* a hung event script is grounds for self-banning */
+ if (status == -ETIMEDOUT) {
+ ctdb_ban_self(ctdb);
+ }
+ }
+
+ D_INFO("Sending IPREALLOCATED message\n");
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_IPREALLOCATED, data);
+
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+}
+
+/* A control to run the "ipreallocated" event script.  The reply is
+ * deferred until the script completes (see the callback above).
+ * Returns 0 with *async_reply set on success, -1 if the script could
+ * not be started. */
+int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ bool *async_reply)
+{
+ int ret;
+ struct ipreallocated_callback_state *state;
+
+ state = talloc(ctdb, struct ipreallocated_callback_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
+
+ ret = ctdb_event_script_callback(ctdb, state,
+ ctdb_ipreallocated_callback, state,
+ CTDB_EVENT_IPREALLOCATED,
+ "%s", "");
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
+ talloc_free(state);
+ return -1;
+ }
+
+ /* tell the control that we will reply asynchronously */
+ state->c = talloc_steal(state, c);
+ *async_reply = true;
+
+ return 0;
+}
+
+
+/* Tracks one in-flight "reload public IPs" operation: the forked child
+ * doing the work, the pipe it reports its result on, and the deferred
+ * control request to reply to. */
+struct ctdb_reloadips_handle {
+ struct ctdb_context *ctdb;
+ struct ctdb_req_control_old *c; /* deferred request, NULL once replied */
+ int status; /* result reported by the child */
+ int fd[2]; /* pipe: child writes fd[1], parent reads fd[0] */
+ pid_t child;
+ struct tevent_fd *fde;
+};
+
+/*
+ * Destructor for a reloadips handle: detach it from the ctdb context,
+ * send the (possibly timed-out) control reply exactly once and kill
+ * the worker child.
+ */
+static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
+{
+ if (h == h->ctdb->reload_ips) {
+ h->ctdb->reload_ips = NULL;
+ }
+ if (h->c != NULL) {
+ ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
+ h->c = NULL;
+ }
+ ctdb_kill(h->ctdb, h->child, SIGKILL);
+ return 0;
+}
+
+/*
+ * Timeout handler: freeing the handle triggers the destructor, which
+ * replies to the control and kills the child.
+ */
+static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+
+ talloc_free(h);
+}
+
+/*
+ * Pipe read handler: collect the child's one-byte status (0 = success)
+ * and free the handle, which replies via the destructor.
+ */
+static void ctdb_reloadips_child_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags, void *private_data)
+{
+ struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+
+ char res;
+ int ret;
+
+ /* short read or non-zero byte both count as failure */
+ ret = sys_read(h->fd[0], &res, 1);
+ if (ret < 1 || res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
+ res = 1;
+ }
+ h->status = res;
+
+ talloc_free(h);
+}
+
+/*
+ * Worker run in the forked child (in client mode): fetch the daemon's
+ * current public IP list, re-read the public addresses file, then send
+ * DEL_PUBLIC_IP for addresses no longer configured and ADD_PUBLIC_IP
+ * for newly configured ones, waiting for all controls to complete.
+ * Returns 0 on success, -1 on failure.
+ */
+static int ctdb_reloadips_child(struct ctdb_context *ctdb)
+{
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ struct ctdb_public_ip_list_old *ips;
+ struct ctdb_vnn *vnn;
+ struct client_async_data *async_data;
+ struct timeval timeout;
+ TDB_DATA data;
+ struct ctdb_client_control_state *state;
+ bool first_add;
+ unsigned int i;
+ int ret;
+
+ CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+ /* Read IPs from local node */
+ ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
+ CTDB_CURRENT_NODE, mem_ctx, &ips);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Unable to fetch public IPs from local node\n"));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ /* Read IPs file - this is safe since this is a child process */
+ ctdb->vnn = NULL;
+ if (ctdb_set_public_addresses(ctdb, false) != 0) {
+ DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ async_data = talloc_zero(mem_ctx, struct client_async_data);
+ CTDB_NO_MEMORY(ctdb, async_data);
+
+ /* Compare IPs between node and file for IPs to be deleted */
+ for (i = 0; i < ips->num; i++) {
+ /* Is the daemon's IP still listed in the file? */
+ for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+ if (ctdb_same_ip(&vnn->public_address,
+ &ips->ips[i].addr)) {
+ /* IP is still in file */
+ break;
+ }
+ }
+
+ if (vnn == NULL) {
+ /* Delete IP ips->ips[i] */
+ struct ctdb_addr_info_old *pub;
+
+ DEBUG(DEBUG_NOTICE,
+ ("IP %s no longer configured, deleting it\n",
+ ctdb_addr_to_str(&ips->ips[i].addr)));
+
+ pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
+ CTDB_NO_MEMORY(ctdb, pub);
+
+ /* mask/iface are irrelevant for deletion */
+ pub->addr = ips->ips[i].addr;
+ pub->mask = 0;
+ pub->len = 0;
+
+ timeout = TAKEOVER_TIMEOUT();
+
+ data.dsize = offsetof(struct ctdb_addr_info_old,
+ iface) + pub->len;
+ data.dptr = (uint8_t *)pub;
+
+ state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+ CTDB_CONTROL_DEL_PUBLIC_IP,
+ 0, data, async_data,
+ &timeout, NULL);
+ if (state == NULL) {
+ DEBUG(DEBUG_ERR,
+ (__location__
+ " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
+ goto failed;
+ }
+
+ ctdb_client_async_add(async_data, state);
+ }
+ }
+
+ /* Compare IPs between node and file for IPs to be added */
+ first_add = true;
+ for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+ for (i = 0; i < ips->num; i++) {
+ if (ctdb_same_ip(&vnn->public_address,
+ &ips->ips[i].addr)) {
+ /* IP already on node */
+ break;
+ }
+ }
+ if (i == ips->num) {
+ /* vnn->public_address is new - add it */
+ struct ctdb_addr_info_old *pub;
+ const char *ifaces = NULL;
+ uint32_t len;
+ struct vnn_interface *iface = NULL;
+
+ DEBUG(DEBUG_NOTICE,
+ ("New IP %s configured, adding it\n",
+ ctdb_addr_to_str(&vnn->public_address)));
+ /* request a rebalance once, before the first add */
+ if (first_add) {
+ uint32_t pnn = ctdb_get_pnn(ctdb);
+
+ data.dsize = sizeof(pnn);
+ data.dptr = (uint8_t *)&pnn;
+
+ ret = ctdb_client_send_message(
+ ctdb,
+ CTDB_BROADCAST_CONNECTED,
+ CTDB_SRVID_REBALANCE_NODE,
+ data);
+ if (ret != 0) {
+ DEBUG(DEBUG_WARNING,
+ ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
+ }
+
+ first_add = false;
+ }
+
+ /* build a comma-separated interface list string */
+ ifaces = vnn->ifaces->iface->name;
+ iface = vnn->ifaces->next;
+ while (iface != NULL) {
+ ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
+ iface->iface->name);
+ iface = iface->next;
+ }
+
+ len = strlen(ifaces) + 1;
+ pub = talloc_zero_size(mem_ctx,
+ offsetof(struct ctdb_addr_info_old, iface) + len);
+ CTDB_NO_MEMORY(ctdb, pub);
+
+ pub->addr = vnn->public_address;
+ pub->mask = vnn->public_netmask_bits;
+ pub->len = len;
+ memcpy(&pub->iface[0], ifaces, pub->len);
+
+ timeout = TAKEOVER_TIMEOUT();
+
+ data.dsize = offsetof(struct ctdb_addr_info_old,
+ iface) + pub->len;
+ data.dptr = (uint8_t *)pub;
+
+ state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+ CTDB_CONTROL_ADD_PUBLIC_IP,
+ 0, data, async_data,
+ &timeout, NULL);
+ if (state == NULL) {
+ DEBUG(DEBUG_ERR,
+ (__location__
+ " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
+ goto failed;
+ }
+
+ ctdb_client_async_add(async_data, state);
+ }
+ }
+
+ /* wait for all queued add/delete controls to complete */
+ if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
+ goto failed;
+ }
+
+ talloc_free(mem_ctx);
+ return 0;
+
+failed:
+ talloc_free(mem_ctx);
+ return -1;
+}
+
+/* This control is sent to force the node to re-read the public addresses file
+ and drop any addresses we should no longer host, and add new addresses
+ that we are now able to host.  The work runs in a forked child; the
+ reply is deferred until the child reports via a pipe or a 120s timeout
+ fires.
+*/
+int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
+{
+ struct ctdb_reloadips_handle *h;
+ pid_t parent = getpid();
+
+ /* only one reload may be in flight; cancel any previous one */
+ if (ctdb->reload_ips != NULL) {
+ talloc_free(ctdb->reload_ips);
+ ctdb->reload_ips = NULL;
+ }
+
+ h = talloc(ctdb, struct ctdb_reloadips_handle);
+ CTDB_NO_MEMORY(ctdb, h);
+ h->ctdb = ctdb;
+ h->c = NULL;
+ h->status = -1;
+
+ if (pipe(h->fd) == -1) {
+ /* NOTE(review): message mentions ctdb_freeze_lock - looks
+ * copied from the freeze code */
+ DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
+ talloc_free(h);
+ return -1;
+ }
+
+ h->child = ctdb_fork(ctdb);
+ if (h->child == (pid_t)-1) {
+ DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
+ close(h->fd[0]);
+ close(h->fd[1]);
+ talloc_free(h);
+ return -1;
+ }
+
+ /* child process */
+ if (h->child == 0) {
+ signed char res = 0;
+
+ close(h->fd[0]);
+
+ prctl_set_comment("ctdb_reloadips");
+ if (switch_from_server_to_client(ctdb) != 0) {
+ DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
+ res = -1;
+ } else {
+ res = ctdb_reloadips_child(ctdb);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
+ }
+ }
+
+ /* report result, then linger until the parent exits */
+ sys_write(h->fd[1], &res, 1);
+ ctdb_wait_for_process_to_exit(parent);
+ _exit(0);
+ }
+
+ h->c = talloc_steal(h, c);
+
+ close(h->fd[1]);
+ set_close_on_exec(h->fd[0]);
+
+ talloc_set_destructor(h, ctdb_reloadips_destructor);
+
+
+ h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+ ctdb_reloadips_child_handler, (void *)h);
+ tevent_fd_set_auto_close(h->fde);
+
+ tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
+ ctdb_reloadips_timeout_event, h);
+
+ /* we reply later */
+ *async_reply = true;
+ return 0;
+}
diff --git a/ctdb/server/ctdb_takeover_helper.c b/ctdb/server/ctdb_takeover_helper.c
new file mode 100644
index 0000000..c088970
--- /dev/null
+++ b/ctdb/server/ctdb_takeover_helper.c
@@ -0,0 +1,1276 @@
+/*
+ CTDB IP takeover helper
+
+ Copyright (C) Martin Schwenke 2016
+
+ Based on ctdb_recovery_helper.c
+ Copyright (C) Amitay Isaacs 2015
+
+ and ctdb_takeover.c
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <popt.h>
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/strv.h"
+#include "lib/util/strv_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+#include "lib/util/tevent_unix.h"
+
+#include "protocol/protocol.h"
+#include "protocol/protocol_api.h"
+#include "protocol/protocol_util.h"
+#include "client/client.h"
+
+#include "common/logging.h"
+
+#include "server/ipalloc.h"
+
+/* Per-control timeout in seconds for takeover helper requests. */
+static int takeover_timeout = 9;
+
+/* Absolute deadline for the next control, computed from now. */
+#define TIMEOUT() timeval_current_ofs(takeover_timeout, 0)
+
+/*
+ * Utility functions
+ */
+
+/*
+ * Generic tevent_req receive helper for requests with no result data:
+ * returns true on success; on failure returns false and stores the
+ * unix error in *perr (if non-NULL).
+ */
+static bool generic_recv(struct tevent_req *req, int *perr)
+{
+ int err;
+
+ if (tevent_req_is_unix_error(req, &err)) {
+ if (perr != NULL) {
+ *perr = err;
+ }
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Map the IPAllocAlgorithm tunable to an ipalloc_algorithm value.
+ * Unknown values fall back to LCP2, the default algorithm.
+ */
+static enum ipalloc_algorithm
+determine_algorithm(const struct ctdb_tunable_list *tunables)
+{
+ switch (tunables->ip_alloc_algorithm) {
+ case 0:
+ return IPALLOC_DETERMINISTIC;
+ case 1:
+ return IPALLOC_NONDETERMINISTIC;
+ case 2:
+ return IPALLOC_LCP2;
+ default:
+ return IPALLOC_LCP2;
+ };
+}
+
+/**********************************************************************/
+
+/* State for fetching public IP lists from a set of nodes. */
+struct get_public_ips_state {
+ uint32_t *pnns; /* nodes to query */
+ int count; /* number of entries in pnns */
+ struct ctdb_public_ip_list *ips; /* per-pnn results, indexed by pnn */
+ uint32_t *ban_credits; /* incremented for failing nodes */
+};
+
+static void get_public_ips_done(struct tevent_req *subreq);
+
+/*
+ * Async request: fetch public IP lists from the given nodes via
+ * GET_PUBLIC_IPS.  count is the number of nodes queried; num_nodes
+ * sizes the pnn-indexed result array.  available_only restricts the
+ * result to IPs the node can actually host.  Failing nodes get ban
+ * credits.
+ */
+static struct tevent_req *get_public_ips_send(
+ TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ uint32_t *pnns,
+ int count, int num_nodes,
+ uint32_t *ban_credits,
+ bool available_only)
+{
+ struct tevent_req *req, *subreq;
+ struct get_public_ips_state *state;
+ struct ctdb_req_control request;
+
+ req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->pnns = pnns;
+ state->count = count;
+ state->ban_credits = ban_credits;
+
+ state->ips = talloc_zero_array(state,
+ struct ctdb_public_ip_list,
+ num_nodes);
+ if (tevent_req_nomem(state->ips, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ /* Short circuit if no nodes being asked for IPs */
+ if (state->count == 0) {
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ ctdb_req_control_get_public_ips(&request, available_only);
+ /* NOTE(review): subreq is parented on mem_ctx rather than state -
+ * confirm the intended lifetime */
+ subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
+ state->pnns,
+ state->count,
+ TIMEOUT(), &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, get_public_ips_done, req);
+
+ return req;
+}
+
+/*
+ * Completion handler: collect each node's reply into the pnn-indexed
+ * ips array.  Any per-node failure earns that node a ban credit; the
+ * request fails with the multi-send error, or EIO if parsing any
+ * reply failed.
+ */
+static void get_public_ips_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct get_public_ips_state *state = tevent_req_data(
+ req, struct get_public_ips_state);
+ struct ctdb_reply_control **reply;
+ int *err_list;
+ int ret, i;
+ bool status, found_errors;
+
+ status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
+ &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ for (i = 0; i < state->count; i++) {
+ if (err_list[i] != 0) {
+ uint32_t pnn = state->pnns[i];
+
+ D_ERR("control GET_PUBLIC_IPS failed on "
+ "node %u, ret=%d\n", pnn, err_list[i]);
+
+ state->ban_credits[pnn]++;
+ }
+ }
+
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ found_errors = false;
+ for (i = 0; i < state->count; i++) {
+ uint32_t pnn;
+ struct ctdb_public_ip_list *ips;
+
+ pnn = state->pnns[i];
+ ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
+ &ips);
+ if (ret != 0) {
+ D_ERR("control GET_PUBLIC_IPS failed on "
+ "node %u\n", pnn);
+ state->ban_credits[pnn]++;
+ found_errors = true;
+ continue;
+ }
+
+ D_INFO("Fetched public IPs from node %u\n", pnn);
+ state->ips[pnn] = *ips;
+ }
+
+ if (found_errors) {
+ tevent_req_error(req, EIO);
+ return;
+ }
+
+ talloc_free(reply);
+
+ tevent_req_done(req);
+}
+
+/*
+ * Receive the result of get_public_ips_send: on success hands
+ * ownership of the pnn-indexed IP list array to mem_ctx via *ips and
+ * returns true; on failure stores the error in *perr (if non-NULL)
+ * and returns false.
+ */
+static bool get_public_ips_recv(struct tevent_req *req, int *perr,
+ TALLOC_CTX *mem_ctx,
+ struct ctdb_public_ip_list **ips)
+{
+ struct get_public_ips_state *state = tevent_req_data(
+ req, struct get_public_ips_state);
+ int err;
+
+ if (tevent_req_is_unix_error(req, &err)) {
+ if (perr != NULL) {
+ *perr = err;
+ }
+ return false;
+ }
+
+ *ips = talloc_steal(mem_ctx, state->ips);
+
+ return true;
+}
+
+/**********************************************************************/
+
+/* Aggregate state across all RELEASE_IP fan-outs (one per address). */
+struct release_ip_state {
+ int num_sent; /* fan-outs started */
+ int num_replies; /* fan-outs completed */
+ int num_fails; /* fan-outs with at least one failure */
+ int err_any; /* one representative error code */
+ uint32_t *ban_credits; /* incremented for failing nodes */
+};
+
+/* Per-address state: the nodes told to release this one IP. */
+struct release_ip_one_state {
+ struct tevent_req *req; /* parent request */
+ uint32_t *pnns; /* nodes to send RELEASE_IP to */
+ int count; /* number of entries in pnns */
+ const char *ip_str; /* printable address, for logging */
+};
+
+static void release_ip_done(struct tevent_req *subreq);
+
+/*
+ * Async request: for every public address, send RELEASE_IP to each
+ * node (from pnns/count) that knows the address but should not be
+ * hosting it.  Failing nodes get ban credits.  Completes immediately
+ * if there is nothing to release.
+ */
+static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ uint32_t *pnns,
+ int count,
+ struct timeval timeout,
+ struct public_ip_list *all_ips,
+ uint32_t *ban_credits)
+{
+ struct tevent_req *req, *subreq;
+ struct release_ip_state *state;
+ struct ctdb_req_control request;
+ struct public_ip_list *tmp_ip;
+
+ req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->num_sent = 0;
+ state->num_replies = 0;
+ state->num_fails = 0;
+ state->ban_credits = ban_credits;
+
+ /* Send a RELEASE_IP to all nodes that should not be hosting
+ * each IP. For each IP, all but one of these will be
+ * redundant. However, the redundant ones are used to tell
+ * nodes which node should be hosting the IP so that commands
+ * like "ctdb ip" can display a particular nodes idea of who
+ * is hosting what. */
+ for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
+ struct release_ip_one_state *substate;
+ struct ctdb_public_ip ip;
+ int i;
+
+ substate = talloc_zero(state, struct release_ip_one_state);
+ if (tevent_req_nomem(substate, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ substate->pnns = talloc_zero_array(substate, uint32_t, count);
+ if (tevent_req_nomem(substate->pnns, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ substate->count = 0;
+ substate->req = req;
+
+ substate->ip_str = ctdb_sock_addr_to_string(substate,
+ &tmp_ip->addr,
+ false);
+ if (tevent_req_nomem(substate->ip_str, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ for (i = 0; i < count; i++) {
+ uint32_t pnn = pnns[i];
+
+ /* Skip this node if IP is not known */
+ if (! bitmap_query(tmp_ip->known_on, pnn)) {
+ continue;
+ }
+
+ /* If pnn is not the node that should be
+ * hosting the IP then add it to the list of
+ * nodes that need to do a release. */
+ if (tmp_ip->pnn != pnn) {
+ substate->pnns[substate->count] = pnn;
+ substate->count++;
+ }
+ }
+
+ if (substate->count == 0) {
+ /* No releases to send for this address... */
+ TALLOC_FREE(substate);
+ continue;
+ }
+
+ ip.pnn = tmp_ip->pnn;
+ ip.addr = tmp_ip->addr;
+ ctdb_req_control_release_ip(&request, &ip);
+ subreq = ctdb_client_control_multi_send(state, ev, client,
+ substate->pnns,
+ substate->count,
+ timeout,/* cumulative */
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, release_ip_done, substate);
+
+ state->num_sent++;
+ }
+
+ /* None sent, finished... */
+ if (state->num_sent == 0) {
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ return req;
+}
+
+/*
+ * Completion handler for one per-address RELEASE_IP fan-out: award ban
+ * credits for per-node failures, then finish the parent request once
+ * all fan-outs have replied (failing with a representative error if
+ * any fan-out failed).
+ */
+static void release_ip_done(struct tevent_req *subreq)
+{
+ struct release_ip_one_state *substate = tevent_req_callback_data(
+ subreq, struct release_ip_one_state);
+ struct tevent_req *req = substate->req;
+ struct release_ip_state *state = tevent_req_data(
+ req, struct release_ip_state);
+ int ret, i;
+ int *err_list;
+ bool status, found_errors;
+
+ status = ctdb_client_control_multi_recv(subreq, &ret, state,
+ &err_list, NULL);
+ TALLOC_FREE(subreq);
+
+ if (status) {
+ D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
+ substate->ip_str, substate->count);
+ goto done;
+ }
+
+ /* Get some clear error messages out of err_list and count
+ * banning credits
+ */
+ found_errors = false;
+ for (i = 0; i < substate->count; i++) {
+ int err = err_list[i];
+ if (err != 0) {
+ uint32_t pnn = substate->pnns[i];
+
+ D_ERR("RELEASE_IP %s failed on node %u, "
+ "ret=%d\n", substate->ip_str, pnn, err);
+
+ state->ban_credits[pnn]++;
+ state->err_any = err;
+ found_errors = true;
+ }
+ }
+ /* multi_recv failed but no per-node error was recorded */
+ if (! found_errors) {
+ D_ERR("RELEASE_IP %s internal error, ret=%d\n",
+ substate->ip_str, ret);
+ state->err_any = EIO;
+ }
+
+ state->num_fails++;
+
+done:
+ talloc_free(substate);
+
+ state->num_replies++;
+
+ if (state->num_replies < state->num_sent) {
+ /* Not all replies received, don't go further */
+ return;
+ }
+
+ if (state->num_fails > 0) {
+ tevent_req_error(req, state->err_any);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+
+/* Receive the result of release_ip_send; see generic_recv(). */
+static bool release_ip_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/**********************************************************************/
+
+/* Shared state for the TAKEOVER_IP stage: one control is sent per IP
+ * and these counters track how many are outstanding/failed. */
+struct take_ip_state {
+	int num_sent;		/* controls sent, one per assigned IP */
+	int num_replies;	/* replies processed so far */
+	int num_fails;		/* replies that reported failure */
+	int err_any;		/* one of the errors seen, for the caller */
+	uint32_t *ban_credits;	/* per-PNN failure counters (caller-owned) */
+};
+
+/* Per-IP context linking one TAKEOVER_IP control back to the parent */
+struct take_ip_one_state {
+	struct tevent_req *req;	/* parent take_ip request */
+	uint32_t pnn;		/* node that should host this IP */
+	const char *ip_str;	/* IP as string, for logging */
+};
+
+static void take_ip_done(struct tevent_req *subreq);
+
+/* Start the TAKEOVER_IP stage: send one control per assigned public IP
+ * to the node chosen to host it.  IPs with no assigned node
+ * (CTDB_UNKNOWN_PNN) are skipped.  Completes immediately if there is
+ * nothing to send.
+ */
+static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
+				       struct tevent_context *ev,
+				       struct ctdb_client_context *client,
+				       struct timeval timeout,
+				       struct public_ip_list *all_ips,
+				       uint32_t *ban_credits)
+{
+	struct tevent_req *req, *subreq;
+	struct take_ip_state *state;
+	struct ctdb_req_control request;
+	struct public_ip_list *tmp_ip;
+
+	req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->num_sent = 0;
+	state->num_replies = 0;
+	state->num_fails = 0;
+	state->ban_credits = ban_credits;
+
+	/* For each IP, send a TAKEOVER_IP to the node that should be
+	 * hosting it.  Many of these will often be redundant (since
+	 * the allocation won't have changed) but they can be useful
+	 * to recover from inconsistencies. */
+	for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
+		struct take_ip_one_state *substate;
+		struct ctdb_public_ip ip;
+
+		if (tmp_ip->pnn == CTDB_UNKNOWN_PNN) {
+			/* IP will be unassigned */
+			continue;
+		}
+
+		substate = talloc_zero(state, struct take_ip_one_state);
+		if (tevent_req_nomem(substate, req)) {
+			return tevent_req_post(req, ev);
+		}
+
+		substate->req = req;
+		substate->pnn = tmp_ip->pnn;
+
+		substate->ip_str = ctdb_sock_addr_to_string(substate,
+							    &tmp_ip->addr,
+							    false);
+		if (tevent_req_nomem(substate->ip_str, req)) {
+			return tevent_req_post(req, ev);
+		}
+
+		ip.pnn = tmp_ip->pnn;
+		ip.addr = tmp_ip->addr;
+		ctdb_req_control_takeover_ip(&request, &ip);
+		subreq = ctdb_client_control_send(
+			state, ev, client, tmp_ip->pnn,
+			timeout, /* cumulative */
+			&request);
+		if (tevent_req_nomem(subreq, req)) {
+			return tevent_req_post(req, ev);
+		}
+		tevent_req_set_callback(subreq, take_ip_done, substate);
+
+		state->num_sent++;
+	}
+
+	/* None sent, finished... */
+	if (state->num_sent == 0) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	return req;
+}
+
+/* Completion handler for one TAKEOVER_IP control.  Failures assign a
+ * banning credit to the target node.  The parent request completes
+ * once every outstanding control has been answered.
+ */
+static void take_ip_done(struct tevent_req *subreq)
+{
+	struct take_ip_one_state *substate = tevent_req_callback_data(
+		subreq, struct take_ip_one_state);
+	struct tevent_req *req = substate->req;
+	struct ctdb_reply_control *reply;
+	struct take_ip_state *state = tevent_req_data(
+		req, struct take_ip_state);
+	int ret = 0;
+	bool status;
+
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
+		      substate->ip_str, substate->pnn, ret);
+		goto fail;
+	}
+
+	ret = ctdb_reply_control_takeover_ip(reply);
+	if (ret != 0) {
+		D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
+		      substate->ip_str, substate->pnn, ret);
+		goto fail;
+	}
+
+	D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
+	       substate->ip_str, substate->pnn);
+	goto done;
+
+fail:
+	state->ban_credits[substate->pnn]++;
+	state->num_fails++;
+	state->err_any = ret;
+
+done:
+	talloc_free(substate);
+
+	state->num_replies++;
+
+	if (state->num_replies < state->num_sent) {
+		/* Not all replies received, don't go further */
+		return;
+	}
+
+	if (state->num_fails > 0) {
+		tevent_req_error(req, state->err_any);
+		return;
+	}
+
+	tevent_req_done(req);
+}
+
+/* Report the overall result of the TAKEOVER_IP stage via *perr */
+static bool take_ip_recv(struct tevent_req *req, int *perr)
+{
+	bool ok;
+
+	ok = generic_recv(req, perr);
+	return ok;
+}
+
+/**********************************************************************/
+
+/* State for broadcasting the IPREALLOCATED event to a set of nodes */
+struct ipreallocated_state {
+	uint32_t *pnns;		/* target nodes */
+	int count;		/* number of entries in pnns */
+	uint32_t *ban_credits;	/* per-PNN failure counters (caller-owned) */
+};
+
+static void ipreallocated_done(struct tevent_req *subreq);
+
+/* Start the IPREALLOCATED stage: send the IPREALLOCATED control to all
+ * of the given nodes in a single multi-node control.
+ */
+static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
+					     struct tevent_context *ev,
+					     struct ctdb_client_context *client,
+					     uint32_t *pnns,
+					     int count,
+					     struct timeval timeout,
+					     uint32_t *ban_credits)
+{
+	struct tevent_req *req, *subreq;
+	struct ipreallocated_state *state;
+	struct ctdb_req_control request;
+
+	req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->pnns = pnns;
+	state->count = count;
+	state->ban_credits = ban_credits;
+
+	ctdb_req_control_ipreallocated(&request);
+	subreq = ctdb_client_control_multi_send(state, ev, client,
+						pnns, count,
+						timeout, /* cumulative */
+						&request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, ipreallocated_done, req);
+
+	return req;
+}
+
+/* Completion handler for the IPREALLOCATED broadcast.  Per-node errors
+ * are logged and converted into banning credits.
+ */
+static void ipreallocated_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct ipreallocated_state *state = tevent_req_data(
+		req, struct ipreallocated_state);
+	int *err_list = NULL;
+	int ret, i;
+	bool status, found_errors;
+
+	status = ctdb_client_control_multi_recv(subreq, &ret, state,
+						&err_list, NULL);
+	TALLOC_FREE(subreq);
+
+	if (status) {
+		D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
+		tevent_req_done(req);
+		return;
+	}
+
+	/* Get some clear error messages out of err_list and count
+	 * banning credits
+	 */
+	found_errors = false;
+	for (i = 0; i < state->count; i++) {
+		int err = err_list[i];
+		if (err != 0) {
+			uint32_t pnn = state->pnns[i];
+
+			D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
+			      pnn, err);
+
+			state->ban_credits[pnn]++;
+			found_errors = true;
+		}
+	}
+
+	if (! found_errors) {
+		D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
+	}
+
+	/* NOTE(review): assumes multi_recv sets ret non-zero on failure,
+	 * since tevent_req_error() with 0 would not complete the request
+	 * - confirm against ctdb_client_control_multi_recv() */
+	tevent_req_error(req, ret);
+}
+
+/* Report the overall result of the IPREALLOCATED stage via *perr */
+static bool ipreallocated_recv(struct tevent_req *req, int *perr)
+{
+	bool ok;
+
+	ok = generic_recv(req, perr);
+	return ok;
+}
+
+/**********************************************************************/
+
+/*
+ * Recalculate the allocation of public IPs to nodes and have the
+ * nodes host their allocated addresses.
+ *
+ * - Get tunables
+ * - Get nodemap
+ * - Initialise IP allocation state. Pass:
+ * + algorithm to be used;
+ * + various tunables (NoIPTakeover, NoIPFailback)
+ * + list of nodes to force rebalance (internal structure, currently
+ * no way to fetch, only used by LCP2 for nodes that have had new
+ * IP addresses added).
+ * - Set IP flags for IP allocation based on node map
+ * - Retrieve known and available IP addresses (done separately so
+ * values can be faked in unit testing)
+ * - Use ipalloc_set_public_ips() to set known and available IP
+ * addresses for allocation
+ * - If cluster can't host IP addresses then jump to IPREALLOCATED
+ * - Run IP allocation algorithm
+ * - Send RELEASE_IP to all nodes for IPs they should not host
+ * - Send TAKE_IP to all nodes for IPs they should host
+ * - Send IPREALLOCATED to all nodes
+ */
+
+/* State for the whole takeover run, carried across all async stages */
+struct takeover_state {
+	struct tevent_context *ev;
+	struct ctdb_client_context *client;
+	struct timeval timeout;		/* cumulative, shared by 3 stages */
+	unsigned int num_nodes;		/* total nodes in the nodemap */
+	uint32_t *pnns_connected;	/* connected nodes (get events/IPs) */
+	int num_connected;
+	uint32_t *pnns_active;		/* active nodes (can host IPs) */
+	int num_active;
+	uint32_t destnode;		/* local node, for single controls */
+	uint32_t *force_rebalance_nodes;
+	struct ctdb_tunable_list *tun_list;
+	struct ipalloc_state *ipalloc_state;
+	struct ctdb_public_ip_list *known_ips;	/* indexed by PNN */
+	struct public_ip_list *all_ips;		/* result of ipalloc() */
+	uint32_t *ban_credits;		/* per-PNN failure counters */
+};
+
+static void takeover_tunables_done(struct tevent_req *subreq);
+static void takeover_nodemap_done(struct tevent_req *subreq);
+static void takeover_known_ips_done(struct tevent_req *subreq);
+static void takeover_avail_ips_done(struct tevent_req *subreq);
+static void takeover_release_ip_done(struct tevent_req *subreq);
+static void takeover_take_ip_done(struct tevent_req *subreq);
+static void takeover_ipreallocated(struct tevent_req *req);
+static void takeover_ipreallocated_done(struct tevent_req *subreq);
+static void takeover_failed(struct tevent_req *subreq, int ret);
+static void takeover_failed_done(struct tevent_req *subreq);
+
+/* Start a takeover run.  The first step is fetching the tunables from
+ * the local node; subsequent stages are chained via callbacks.
+ */
+static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
+					struct tevent_context *ev,
+					struct ctdb_client_context *client,
+					uint32_t *force_rebalance_nodes)
+{
+	struct tevent_req *req, *subreq;
+	struct takeover_state *state;
+	struct ctdb_req_control request;
+
+	req = tevent_req_create(mem_ctx, &state, struct takeover_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->force_rebalance_nodes = force_rebalance_nodes;
+	state->destnode = ctdb_client_pnn(client);
+
+	ctdb_req_control_get_all_tunables(&request);
+	subreq = ctdb_client_control_send(state, state->ev, state->client,
+					  state->destnode, TIMEOUT(),
+					  &request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, takeover_tunables_done, req);
+
+	return req;
+}
+
+/* Tunables received: remember the takeover timeout and move on to
+ * fetching the nodemap.
+ */
+static void takeover_tunables_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct ctdb_reply_control *reply;
+	struct ctdb_req_control request;
+	int ret;
+	bool status;
+
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	ret = ctdb_reply_control_get_all_tunables(reply, state,
+						  &state->tun_list);
+	if (ret != 0) {
+		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	talloc_free(reply);
+
+	/* Used by TIMEOUT() and the cumulative stage timeouts below */
+	takeover_timeout = state->tun_list->takeover_timeout;
+
+	ctdb_req_control_get_nodemap(&request);
+	subreq = ctdb_client_control_send(state, state->ev, state->client,
+					  state->destnode, TIMEOUT(),
+					  &request);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, takeover_nodemap_done, req);
+}
+
+/* Nodemap received: compute connected/active node lists, set up the
+ * IP allocation state and kick off fetching known public IPs.  If IP
+ * failover is disabled via environment, skip straight to the
+ * IPREALLOCATED stage.
+ */
+static void takeover_nodemap_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct ctdb_reply_control *reply;
+	bool status;
+	int ret;
+	struct ctdb_node_map *nodemap;
+	const char *ptr;
+
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
+		      state->destnode, ret);
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
+	if (ret != 0) {
+		D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	state->num_nodes = nodemap->num;
+
+	state->num_connected = list_of_connected_nodes(nodemap,
+						       CTDB_UNKNOWN_PNN, state,
+						       &state->pnns_connected);
+	if (state->num_connected <= 0) {
+		tevent_req_error(req, ENOMEM);
+		return;
+	}
+
+	state->num_active = list_of_active_nodes(nodemap,
+						 CTDB_UNKNOWN_PNN, state,
+						 &state->pnns_active);
+	if (state->num_active <= 0) {
+		tevent_req_error(req, ENOMEM);
+		return;
+	}
+
+	/* Default timeout for early jump to IPREALLOCATED.  See below
+	 * for explanation of 3 times...
+	 */
+	state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+	state->ban_credits = talloc_zero_array(state, uint32_t,
+					       state->num_nodes);
+	if (tevent_req_nomem(state->ban_credits, req)) {
+		return;
+	}
+
+	ptr = getenv("CTDB_DISABLE_IP_FAILOVER");
+	if (ptr != NULL) {
+		/* IP failover is completely disabled so just send out
+		 * ipreallocated event.
+		 */
+		takeover_ipreallocated(req);
+		return;
+	}
+
+	state->ipalloc_state =
+		ipalloc_state_init(
+			state, state->num_nodes,
+			determine_algorithm(state->tun_list),
+			(state->tun_list->no_ip_takeover != 0),
+			(state->tun_list->no_ip_failback != 0),
+			state->force_rebalance_nodes);
+	if (tevent_req_nomem(state->ipalloc_state, req)) {
+		return;
+	}
+
+	subreq = get_public_ips_send(state, state->ev, state->client,
+				     state->pnns_connected, state->num_connected,
+				     state->num_nodes, state->ban_credits,
+				     false);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+
+	tevent_req_set_callback(subreq, takeover_known_ips_done, req);
+}
+
+/* Known IPs received: narrow the node list down to active nodes that
+ * actually have known IPs and fetch the available IPs from those.
+ */
+static void takeover_known_ips_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	int ret;
+	bool status;
+	uint32_t *pnns = NULL;
+	int count, i;
+
+	status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		D_ERR("Failed to fetch known public IPs\n");
+		takeover_failed(req, ret);
+		return;
+	}
+
+	/* Get available IPs from active nodes that actually have known IPs */
+
+	pnns = talloc_zero_array(state, uint32_t, state->num_active);
+	if (tevent_req_nomem(pnns, req)) {
+		return;
+	}
+
+	count = 0;
+	for (i = 0; i < state->num_active; i++) {
+		uint32_t pnn = state->pnns_active[i];
+
+		/* If pnn has IPs then fetch available IPs from it */
+		if (state->known_ips[pnn].num > 0) {
+			pnns[count] = pnn;
+			count++;
+		}
+	}
+
+	subreq = get_public_ips_send(state, state->ev, state->client,
+				     pnns, count,
+				     state->num_nodes, state->ban_credits,
+				     true);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+
+	tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
+}
+
+/* Available IPs received: run the IP allocation algorithm and start
+ * the RELEASE_IP stage.  If no node can host IPs, skip straight to
+ * IPREALLOCATED.
+ */
+static void takeover_avail_ips_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	bool status;
+	int ret;
+	struct ctdb_public_ip_list *available_ips;
+
+	status = get_public_ips_recv(subreq, &ret, state, &available_ips);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		D_ERR("Failed to fetch available public IPs\n");
+		takeover_failed(req, ret);
+		return;
+	}
+
+	ipalloc_set_public_ips(state->ipalloc_state,
+			       state->known_ips, available_ips);
+
+	if (! ipalloc_can_host_ips(state->ipalloc_state)) {
+		D_NOTICE("No nodes available to host public IPs yet\n");
+		takeover_ipreallocated(req);
+		return;
+	}
+
+	/* Do the IP reassignment calculations */
+	state->all_ips = ipalloc(state->ipalloc_state);
+	if (tevent_req_nomem(state->all_ips, req)) {
+		return;
+	}
+
+	/* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
+	 * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
+	 * seconds.  However, RELEASE_IP can take longer due to TCP
+	 * connection killing, so sometimes needs more time.
+	 * Therefore, use a cumulative timeout of TakeoverTimeout * 3
+	 * seconds across all 3 stages.  No explicit expiry checks are
+	 * needed before each stage because tevent is smart enough to
+	 * fire the timeouts even if they are in the past.  Initialise
+	 * this here so it explicitly covers the stages we're
+	 * interested in but, in particular, not the time taken by the
+	 * ipalloc().
+	 */
+	state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+	subreq = release_ip_send(state, state->ev, state->client,
+				 state->pnns_connected, state->num_connected,
+				 state->timeout, state->all_ips,
+				 state->ban_credits);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, takeover_release_ip_done, req);
+}
+
+/* RELEASE_IP stage finished: on success, start the TAKEOVER_IP stage */
+static void takeover_release_ip_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	int ret;
+	bool status;
+
+	status = release_ip_recv(subreq, &ret);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		takeover_failed(req, ret);
+		return;
+	}
+
+	/* All released, now for takeovers */
+
+	subreq = take_ip_send(state, state->ev, state->client,
+			      state->timeout, state->all_ips,
+			      state->ban_credits);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, takeover_take_ip_done, req);
+}
+
+/* TAKEOVER_IP stage finished: on success, start IPREALLOCATED stage */
+static void takeover_take_ip_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int ret = 0;
+	bool status;
+
+	status = take_ip_recv(subreq, &ret);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		takeover_failed(req, ret);
+		return;
+	}
+
+	takeover_ipreallocated(req);
+}
+
+/* Start the final stage: broadcast IPREALLOCATED to connected nodes.
+ * Also used as an early shortcut when failover is disabled or no node
+ * can host IPs.
+ */
+static void takeover_ipreallocated(struct tevent_req *req)
+{
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct tevent_req *subreq;
+
+	subreq = ipreallocated_send(state, state->ev, state->client,
+				    state->pnns_connected,
+				    state->num_connected,
+				    state->timeout,
+				    state->ban_credits);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
+}
+
+/* IPREALLOCATED stage finished: the takeover run is complete */
+static void takeover_ipreallocated_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int ret;
+	bool status;
+
+	status = ipreallocated_recv(subreq, &ret);
+	TALLOC_FREE(subreq);
+
+	if (! status) {
+		takeover_failed(req, ret);
+		return;
+	}
+
+	tevent_req_done(req);
+}
+
+struct takeover_failed_state {
+ struct tevent_req *req;
+ int ret;
+};
+
+void takeover_failed(struct tevent_req *req, int ret)
+{
+ struct takeover_state *state = tevent_req_data(
+ req, struct takeover_state);
+ struct tevent_req *subreq;
+ uint32_t max_pnn = CTDB_UNKNOWN_PNN;
+ unsigned int max_credits = 0;
+ uint32_t pnn;
+
+ /* Check that bans are enabled */
+ if (state->tun_list->enable_bans == 0) {
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ for (pnn = 0; pnn < state->num_nodes; pnn++) {
+ if (state->ban_credits[pnn] > max_credits) {
+ max_pnn = pnn;
+ max_credits = state->ban_credits[pnn];
+ }
+ }
+
+ if (max_credits > 0) {
+ struct ctdb_req_message message;
+ struct takeover_failed_state *substate;
+
+ D_WARNING("Assigning banning credits to node %u\n", max_pnn);
+
+ substate = talloc_zero(state, struct takeover_failed_state);
+ if (tevent_req_nomem(substate, req)) {
+ return;
+ }
+ substate->req = req;
+ substate->ret = ret;
+
+ message.srvid = CTDB_SRVID_BANNING;
+ message.data.pnn = max_pnn;
+
+ subreq = ctdb_client_message_send(
+ state, state->ev, state->client,
+ ctdb_client_pnn(state->client),
+ &message);
+ if (subreq == NULL) {
+ D_ERR("failed to assign banning credits\n");
+ tevent_req_error(req, ret);
+ return;
+ }
+ tevent_req_set_callback(subreq, takeover_failed_done, substate);
+ } else {
+ tevent_req_error(req, ret);
+ }
+}
+
+/* Banning-credit message sent (or failed); fail the takeover request
+ * with the original error, not the messaging error.
+ */
+static void takeover_failed_done(struct tevent_req *subreq)
+{
+	struct takeover_failed_state *substate = tevent_req_callback_data(
+		subreq, struct takeover_failed_state);
+	struct tevent_req *req = substate->req;
+	int ret;
+	bool status;
+
+	status = ctdb_client_message_recv(subreq, &ret);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		D_ERR("failed to assign banning credits, ret=%d\n", ret);
+	}
+
+	/* Preserve the original failure for the caller */
+	ret = substate->ret;
+	talloc_free(substate);
+	tevent_req_error(req, ret);
+}
+
+/* Report the overall result of the takeover run via *perr */
+static void takeover_recv(struct tevent_req *req, int *perr)
+{
+	(void) generic_recv(req, perr);
+}
+
+/* Parse a comma-separated list of node numbers (e.g. "0,2,3") into a
+ * talloc'ed uint32_t array of length strv_count(list).  Returns NULL
+ * on allocation failure or if any token is not a valid non-negative
+ * integer - atoi() was previously used here, which silently mapped
+ * garbage tokens to node 0.
+ */
+static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
+{
+	char *strv = NULL;
+	int num, i, ret;
+	char *t;
+	uint32_t *nodes;
+
+	ret = strv_split(mem_ctx, &strv, s, ",");
+	if (ret != 0) {
+		D_ERR("out of memory\n");
+		return NULL;
+	}
+
+	num = strv_count(strv);
+
+	nodes = talloc_array(mem_ctx, uint32_t, num);
+	if (nodes == NULL) {
+		D_ERR("out of memory\n");
+		talloc_free(strv);
+		return NULL;
+	}
+
+	t = NULL;
+	for (i = 0; i < num; i++) {
+		char *endp = NULL;
+		unsigned long pnn;
+
+		t = strv_next(strv, t);
+
+		/* Validate the token: reject empty, trailing junk,
+		 * out-of-range values */
+		errno = 0;
+		pnn = strtoul(t, &endp, 10);
+		if (errno != 0 || endp == t || *endp != '\0' ||
+		    pnn > UINT32_MAX) {
+			D_ERR("invalid node number \"%s\"\n", t);
+			talloc_free(nodes);
+			talloc_free(strv);
+			return NULL;
+		}
+		nodes[i] = (uint32_t)pnn;
+	}
+
+	/* The string scratch space is no longer needed */
+	talloc_free(strv);
+
+	return nodes;
+}
+
+/* Print command-line usage to stderr */
+static void usage(const char *progname)
+{
+	static const char msg[] =
+		"\nUsage: %s <output-fd> <ctdb-socket-path> "
+		"[<force-rebalance-nodes>]\n";
+
+	fprintf(stderr, msg, progname);
+}
+
+/*
+ * Arguments - write fd, socket path, optional comma-separated list of
+ * nodes to force-rebalance.  The final status is written as a raw int
+ * to the write fd, presumably read by the daemon that spawned this
+ * helper - confirm against the launcher code.
+ */
+int main(int argc, const char *argv[])
+{
+	int write_fd;
+	const char *sockpath;
+	TALLOC_CTX *mem_ctx;
+	struct tevent_context *ev;
+	struct ctdb_client_context *client;
+	bool status;
+	int ret;
+	struct tevent_req *req;
+	uint32_t *force_rebalance_nodes = NULL;
+
+	if (argc < 3 || argc > 4) {
+		usage(argv[0]);
+		exit(1);
+	}
+
+	write_fd = atoi(argv[1]);
+	sockpath = argv[2];
+
+	mem_ctx = talloc_new(NULL);
+	if (mem_ctx == NULL) {
+		fprintf(stderr, "talloc_new() failed\n");
+		ret = ENOMEM;
+		goto done;
+	}
+
+	if (argc == 4) {
+		force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
+		if (force_rebalance_nodes == NULL) {
+			usage(argv[0]);
+			ret = EINVAL;
+			goto done;
+		}
+	}
+
+	ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
+	if (ret != 0) {
+		fprintf(stderr,
+			"ctdb-takeover: Unable to initialize logging\n");
+		goto done;
+	}
+
+	ev = tevent_context_init(mem_ctx);
+	if (ev == NULL) {
+		D_ERR("tevent_context_init() failed\n");
+		ret = ENOMEM;
+		goto done;
+	}
+
+	status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
+	if (!status) {
+		D_ERR("logging_setup_sighup_handler() failed\n");
+		ret = ENOMEM;
+		goto done;
+	}
+
+	ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
+	if (ret != 0) {
+		D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
+		goto done;
+	}
+
+	req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
+	if (req == NULL) {
+		D_ERR("takeover_send() failed\n");
+		ret = 1;
+		goto done;
+	}
+
+	/* Run the event loop to completion of the takeover request */
+	if (! tevent_req_poll(req, ev)) {
+		D_ERR("tevent_req_poll() failed\n");
+		ret = 1;
+		goto done;
+	}
+
+	takeover_recv(req, &ret);
+	TALLOC_FREE(req);
+	if (ret != 0) {
+		D_ERR("takeover run failed, ret=%d\n", ret);
+	}
+
+done:
+	/* Report final status to the parent via the pipe */
+	sys_write_v(write_fd, &ret, sizeof(ret));
+
+	talloc_free(mem_ctx);
+	return ret;
+}
diff --git a/ctdb/server/ctdb_traverse.c b/ctdb/server/ctdb_traverse.c
new file mode 100644
index 0000000..4865dcc
--- /dev/null
+++ b/ctdb/server/ctdb_traverse.c
@@ -0,0 +1,781 @@
+/*
+ efficient async ctdb traverse
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/* Callback invoked per record; tdb_null key+data signals completion */
+typedef void (*ctdb_traverse_fn_t)(void *private_data, TDB_DATA key, TDB_DATA data);
+
+/*
+  handle returned to caller - freeing this handler will kill the child and
+  terminate the traverse
+ */
+struct ctdb_traverse_local_handle {
+	struct ctdb_traverse_local_handle *next, *prev;
+	struct ctdb_db_context *ctdb_db;
+	int fd[2];		/* pipe: child writes record count to [1] */
+	pid_t child;		/* traverse child process */
+	uint64_t srvid;
+	uint32_t client_reqid;	/* reqid as known by the client */
+	uint32_t reqid;		/* daemon-side reqid for this traverse */
+	int srcnode;		/* node that receives TRAVERSE_DATA */
+	void *private_data;
+	ctdb_traverse_fn_t callback;
+	bool withemptyrecords;	/* include zero-length records */
+	struct tevent_fd *fde;	/* watches the read end of the pipe */
+	int records_failed;	/* only meaningful inside the child */
+	int records_sent;	/* only meaningful inside the child */
+};
+
+/*
+ * called when traverse is completed by child or on error
+ */
+static void ctdb_traverse_child_handler(struct tevent_context *ev, struct tevent_fd *fde,
+					uint16_t flags, void *private_data)
+{
+	struct ctdb_traverse_local_handle *h = talloc_get_type(private_data,
+							struct ctdb_traverse_local_handle);
+	ctdb_traverse_fn_t callback = h->callback;
+	void *p = h->private_data;
+	int res;
+	ssize_t n;
+
+	/* Read the number of records sent by traverse child */
+	n = sys_read(h->fd[0], &res, sizeof(res));
+	if (n < 0 || n != sizeof(res)) {
+		/* Traverse child failed */
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d\n",
+				  h->ctdb_db->db_name, h->reqid));
+	} else if (res < 0) {
+		/* Traverse failed; child encodes failure as the
+		 * negated number of records sent before the error */
+		res = -res;
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d records:%d\n",
+				  h->ctdb_db->db_name, h->reqid, res));
+	} else {
+		DEBUG(DEBUG_INFO, ("Local traverse end db:%s reqid:%d records:%d\n",
+				   h->ctdb_db->db_name, h->reqid, res));
+	}
+
+	/* tdb_null/tdb_null marks end-of-traverse for the caller */
+	callback(p, tdb_null, tdb_null);
+}
+
+/*
+  destroy an in-flight traverse operation: unlink it from the per-db
+  list and kill the traverse child
+ */
+static int traverse_local_destructor(struct ctdb_traverse_local_handle *h)
+{
+	DLIST_REMOVE(h->ctdb_db->traverse, h);
+	ctdb_kill(h->ctdb_db->ctdb, h->child, SIGKILL);
+	return 0;
+}
+
+/*
+  callback from tdb_traverse_read() - runs in the traverse child.
+  Marshalls each eligible record and ships it to srcnode via a
+  TRAVERSE_DATA control; returns -1 to abort the traverse on error.
+ */
+static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
+{
+	struct ctdb_traverse_local_handle *h = talloc_get_type(p,
+					struct ctdb_traverse_local_handle);
+	struct ctdb_rec_data_old *d;
+	struct ctdb_ltdb_header *hdr;
+	int res, status;
+	TDB_DATA outdata;
+
+	hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+	if (ctdb_db_volatile(h->ctdb_db)) {
+		/* filter out zero-length records */
+		if (!h->withemptyrecords &&
+		    data.dsize <= sizeof(struct ctdb_ltdb_header))
+		{
+			return 0;
+		}
+
+		/* filter out non-authoritative records */
+		if (hdr->dmaster != h->ctdb_db->ctdb->pnn) {
+			return 0;
+		}
+	}
+
+	d = ctdb_marshall_record(h, h->reqid, key, NULL, data);
+	if (d == NULL) {
+		/* error handling is tricky in this child code .... */
+		h->records_failed++;
+		return -1;
+	}
+
+	outdata.dptr = (uint8_t *)d;
+	outdata.dsize = d->length;
+
+	res = ctdb_control(h->ctdb_db->ctdb, h->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
+			   CTDB_CTRL_FLAG_NOREPLY, outdata, NULL, NULL, &status, NULL, NULL);
+	if (res != 0 || status != 0) {
+		h->records_failed++;
+		return -1;
+	}
+
+	h->records_sent++;
+	return 0;
+}
+
+/* Per-node state for one leg of a cluster-wide traverse; owns the
+ * local traverse handle */
+struct traverse_all_state {
+	struct ctdb_context *ctdb;
+	struct ctdb_traverse_local_handle *h;	/* local child traverse */
+	uint32_t reqid;		/* daemon reqid for this traverse */
+	uint32_t srcnode;	/* node to send records back to */
+	uint32_t client_reqid;	/* originating client's reqid */
+	uint64_t srvid;
+	bool withemptyrecords;	/* include zero-length records */
+};
+
+/*
+  setup a non-blocking traverse of a local ltdb. The callback function
+  will be called on every record in the local ltdb. To stop the
+  traverse, talloc_free() the traverse_handle.
+
+  The traverse is finished when the callback is called with tdb_null for key and data
+ */
+static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_context *ctdb_db,
+							      ctdb_traverse_fn_t callback,
+							      struct traverse_all_state *all_state)
+{
+	struct ctdb_traverse_local_handle *h;
+	int ret;
+
+	h = talloc_zero(all_state, struct ctdb_traverse_local_handle);
+	if (h == NULL) {
+		return NULL;
+	}
+
+	/* Pipe used by the child to report the record count when done */
+	ret = pipe(h->fd);
+
+	if (ret != 0) {
+		talloc_free(h);
+		return NULL;
+	}
+
+	h->child = ctdb_fork(ctdb_db->ctdb);
+
+	if (h->child == (pid_t)-1) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		return NULL;
+	}
+
+	h->callback = callback;
+	h->private_data = all_state;
+	h->ctdb_db = ctdb_db;
+	h->client_reqid = all_state->client_reqid;
+	h->reqid = all_state->reqid;
+	h->srvid = all_state->srvid;
+	h->srcnode = all_state->srcnode;
+	h->withemptyrecords = all_state->withemptyrecords;
+
+	if (h->child == 0) {
+		/* start the traverse in the child */
+		int res, status;
+		pid_t parent = getpid();
+		struct ctdb_context *ctdb = ctdb_db->ctdb;
+		struct ctdb_rec_data_old *d;
+		TDB_DATA outdata;
+
+		close(h->fd[0]);
+
+		prctl_set_comment("ctdb_traverse");
+		if (switch_from_server_to_client(ctdb) != 0) {
+			DEBUG(DEBUG_CRIT, ("Failed to switch traverse child into client mode\n"));
+			_exit(0);
+		}
+
+		/* Pre-build the empty end-of-traverse record before
+		 * starting, so completion can always be signalled */
+		d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null);
+		if (d == NULL) {
+			res = 0;
+			sys_write(h->fd[1], &res, sizeof(int));
+			_exit(0);
+		}
+
+		res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h);
+		if (res == -1 || h->records_failed > 0) {
+			/* traverse failed */
+			res = -(h->records_sent);
+		} else {
+			res = h->records_sent;
+		}
+
+		/* Wait till all the data is flushed from output queue */
+		while (ctdb_queue_length(ctdb->daemon.queue) > 0) {
+			tevent_loop_once(ctdb->ev);
+		}
+
+		/* End traverse by sending empty record */
+		outdata.dptr = (uint8_t *)d;
+		outdata.dsize = d->length;
+		ret = ctdb_control(ctdb, h->srcnode, 0,
+				   CTDB_CONTROL_TRAVERSE_DATA,
+				   CTDB_CTRL_FLAG_NOREPLY, outdata,
+				   NULL, NULL, &status, NULL, NULL);
+		if (ret == -1 || status == -1) {
+			if (res > 0) {
+				res = -res;
+			}
+		}
+
+		/* Report the (possibly negated) record count to parent */
+		sys_write(h->fd[1], &res, sizeof(res));
+
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* Parent: keep only the read end and watch it for the result */
+	close(h->fd[1]);
+	set_close_on_exec(h->fd[0]);
+
+	talloc_set_destructor(h, traverse_local_destructor);
+
+	DLIST_ADD(ctdb_db->traverse, h);
+
+	h->fde = tevent_add_fd(ctdb_db->ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+			       ctdb_traverse_child_handler, h);
+	if (h->fde == NULL) {
+		close(h->fd[0]);
+		talloc_free(h);
+		return NULL;
+	}
+	tevent_fd_set_auto_close(h->fde);
+
+	return h;
+}
+
+
+/* State for a cluster-wide traverse initiated from this node */
+struct ctdb_traverse_all_handle {
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	uint32_t reqid;		/* registered in ctdb->idr */
+	ctdb_traverse_fn_t callback;
+	void *private_data;
+	uint32_t null_count;	/* end-markers received so far */
+	bool timedout;		/* set by ctdb_traverse_all_timeout */
+};
+
+/*
+  destroy a traverse_all op: drop the reqid registration so late
+  TRAVERSE_DATA packets can no longer be routed to this state
+ */
+static int ctdb_traverse_all_destructor(struct ctdb_traverse_all_handle *state)
+{
+	reqid_remove(state->ctdb->idr, state->reqid);
+	return 0;
+}
+
+/* called when a traverse times out: record the timeout, bump the
+ * statistics and signal end-of-traverse to the callback */
+static void ctdb_traverse_all_timeout(struct tevent_context *ev,
+				      struct tevent_timer *te,
+				      struct timeval t, void *private_data)
+{
+	struct ctdb_traverse_all_handle *state = talloc_get_type(private_data, struct ctdb_traverse_all_handle);
+
+	DEBUG(DEBUG_ERR,(__location__ " Traverse all timeout on database:%s\n", state->ctdb_db->db_name));
+	CTDB_INCREMENT_STAT(state->ctdb, timeouts.traverse);
+
+	state->timedout = true;
+	state->callback(state->private_data, tdb_null, tdb_null);
+}
+
+
+/* State for a client-initiated TRAVERSE_START request */
+struct traverse_start_state {
+	struct ctdb_context *ctdb;
+	struct ctdb_traverse_all_handle *h;	/* cluster-wide traverse */
+	uint32_t srcnode;	/* node collecting the records */
+	uint32_t reqid;		/* client's reqid */
+	uint32_t db_id;
+	uint64_t srvid;		/* srvid records are delivered to */
+	bool withemptyrecords;	/* include zero-length records */
+	int num_records;
+};
+
+
+/*
+  setup a cluster-wide non-blocking traverse of a ctdb. The
+  callback function will be called on every record in the local
+  ltdb. To stop the traverse, talloc_free() the traverse_handle.
+
+  The traverse is finished when the callback is called with tdb_null
+  for key and data
+ */
+static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_context *ctdb_db,
+								 ctdb_traverse_fn_t callback,
+								 struct traverse_start_state *start_state)
+{
+	struct ctdb_traverse_all_handle *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int ret;
+	TDB_DATA data;
+	struct ctdb_traverse_all r;
+	struct ctdb_traverse_all_ext r_ext;
+	uint32_t destination;
+
+	state = talloc(start_state, struct ctdb_traverse_all_handle);
+	if (state == NULL) {
+		return NULL;
+	}
+
+	state->ctdb = ctdb;
+	state->ctdb_db = ctdb_db;
+	/* NOTE(review): reqid_new() result is not checked for
+	 * REQID_INVALID here - confirm whether that can happen */
+	state->reqid = reqid_new(ctdb_db->ctdb->idr, state);
+	state->callback = callback;
+	state->private_data = start_state;
+	state->null_count = 0;
+	state->timedout = false;
+
+	talloc_set_destructor(state, ctdb_traverse_all_destructor);
+
+	/* The _ext control carries the extra withemptyrecords flag */
+	if (start_state->withemptyrecords) {
+		r_ext.db_id = ctdb_db->db_id;
+		r_ext.reqid = state->reqid;
+		r_ext.pnn = ctdb->pnn;
+		r_ext.client_reqid = start_state->reqid;
+		r_ext.srvid = start_state->srvid;
+		r_ext.withemptyrecords = start_state->withemptyrecords;
+
+		data.dptr = (uint8_t *)&r_ext;
+		data.dsize = sizeof(r_ext);
+	} else {
+		r.db_id = ctdb_db->db_id;
+		r.reqid = state->reqid;
+		r.pnn = ctdb->pnn;
+		r.client_reqid = start_state->reqid;
+		r.srvid = start_state->srvid;
+
+		data.dptr = (uint8_t *)&r;
+		data.dsize = sizeof(r);
+	}
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		/* volatile database, traverse all active nodes */
+		destination = CTDB_BROADCAST_ACTIVE;
+	} else {
+		unsigned int i;
+		/* persistent database, traverse one node, preferably
+		 * the local one
+		 */
+		destination = ctdb->pnn;
+		/* check we are in the vnnmap */
+		for (i=0; i < ctdb->vnn_map->size; i++) {
+			if (ctdb->vnn_map->map[i] == ctdb->pnn) {
+				break;
+			}
+		}
+		/* if we are not in the vnn map we just pick the first
+		 * node instead
+		 */
+		if (i == ctdb->vnn_map->size) {
+			destination = ctdb->vnn_map->map[0];
+		}
+	}
+
+	/* tell all the nodes in the cluster to start sending records to this
+	 * node, or if it is a persistent database, just tell the local
+	 * node
+	 */
+
+	if (start_state->withemptyrecords) {
+		ret = ctdb_daemon_send_control(ctdb, destination, 0,
+				       CTDB_CONTROL_TRAVERSE_ALL_EXT,
+				       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+	} else {
+		ret = ctdb_daemon_send_control(ctdb, destination, 0,
+				       CTDB_CONTROL_TRAVERSE_ALL,
+				       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+	}
+
+	if (ret != 0) {
+		talloc_free(state);
+		return NULL;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Starting traverse on DB %s (id %d)\n",
+			    ctdb_db->db_name, state->reqid));
+
+	/* timeout the traverse */
+	tevent_add_timer(ctdb->ev, state,
+			 timeval_current_ofs(ctdb->tunable.traverse_timeout, 0),
+			 ctdb_traverse_all_timeout, state);
+
+	return state;
+}
+
+/*
+ called when local traverse ends
+ */
+static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+ struct traverse_all_state *state = talloc_get_type(p, struct traverse_all_state);
+
+ /* we're done */
+ talloc_free(state);
+}
+
/*
 * Handle CTDB_CONTROL_TRAVERSE_ALL_EXT: start a traverse of the local
 * ltdb on behalf of the originating node.  Extended version of
 * ctdb_control_traverse_all() that also takes the "withemptyrecords"
 * parameter.  Returns 0 on success, -1 on error.
 */
int32_t ctdb_control_traverse_all_ext(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
{
	struct ctdb_traverse_all_ext *c = (struct ctdb_traverse_all_ext *)data.dptr;
	struct traverse_all_state *state;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(struct ctdb_traverse_all_ext)) {
		DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all_ext\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, c->db_id);
	if (ctdb_db == NULL) {
		return -1;
	}

	/* refuse to read an unhealthy DB unless the
	 * allow_unhealthy_db_read tunable permits it */
	if (ctdb_db->unhealthy_reason) {
		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
					 ctdb_db->db_name, ctdb_db->unhealthy_reason));
			return -1;
		}
		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	/* freed by traverse_all_callback when the local scan ends */
	state = talloc(ctdb_db, struct traverse_all_state);
	if (state == NULL) {
		return -1;
	}

	state->reqid = c->reqid;
	state->srcnode = c->pnn;
	state->ctdb = ctdb;
	state->client_reqid = c->client_reqid;
	state->srvid = c->srvid;
	state->withemptyrecords = c->withemptyrecords;

	state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
	if (state->h == NULL) {
		talloc_free(state);
		return -1;
	}

	return 0;
}
+
/*
  called when a CTDB_CONTROL_TRAVERSE_ALL control comes in. We then
  setup a traverse of our local ltdb, sending the records as
  CTDB_CONTROL_TRAVERSE_DATA records back to the originator
 */
int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
{
	struct ctdb_traverse_all *c = (struct ctdb_traverse_all *)data.dptr;
	struct traverse_all_state *state;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(struct ctdb_traverse_all)) {
		DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, c->db_id);
	if (ctdb_db == NULL) {
		return -1;
	}

	/* refuse to read an unhealthy DB unless the
	 * allow_unhealthy_db_read tunable permits it */
	if (ctdb_db->unhealthy_reason) {
		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
					 ctdb_db->db_name, ctdb_db->unhealthy_reason));
			return -1;
		}
		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	/* freed by traverse_all_callback when the local scan ends */
	state = talloc(ctdb_db, struct traverse_all_state);
	if (state == NULL) {
		return -1;
	}

	state->reqid = c->reqid;
	state->srcnode = c->pnn;
	state->ctdb = ctdb;
	state->client_reqid = c->client_reqid;
	state->srvid = c->srvid;
	/* legacy control: empty records are never delivered */
	state->withemptyrecords = false;

	state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
	if (state->h == NULL) {
		talloc_free(state);
		return -1;
	}

	return 0;
}
+
+
+/*
+ called when a CTDB_CONTROL_TRAVERSE_DATA control comes in. We then
+ call the traverse_all callback with the record
+ */
+int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+ struct ctdb_rec_data_old *d = (struct ctdb_rec_data_old *)data.dptr;
+ struct ctdb_traverse_all_handle *state;
+ TDB_DATA key;
+ ctdb_traverse_fn_t callback;
+ void *private_data;
+
+ if (data.dsize < sizeof(uint32_t) || data.dsize != d->length) {
+ DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_data\n"));
+ return -1;
+ }
+
+ state = reqid_find(ctdb->idr, d->reqid, struct ctdb_traverse_all_handle);
+ if (state == NULL || d->reqid != state->reqid) {
+ /* traverse might have been terminated already */
+ return -1;
+ }
+
+ key.dsize = d->keylen;
+ key.dptr = &d->data[0];
+ data.dsize = d->datalen;
+ data.dptr = &d->data[d->keylen];
+
+ if (key.dsize == 0 && data.dsize == 0) {
+ state->null_count++;
+ /* Persistent databases are only scanned on one node (the local
+ * node)
+ */
+ if (ctdb_db_volatile(state->ctdb_db)) {
+ if (state->null_count != ctdb_get_num_active_nodes(ctdb)) {
+ return 0;
+ }
+ }
+ }
+
+ callback = state->callback;
+ private_data = state->private_data;
+
+ callback(private_data, key, data);
+ return 0;
+}
+
+/*
+ kill a in-progress traverse, used when a client disconnects
+ */
+int32_t ctdb_control_traverse_kill(struct ctdb_context *ctdb, TDB_DATA data,
+ TDB_DATA *outdata, uint32_t srcnode)
+{
+ struct ctdb_traverse_start *d = (struct ctdb_traverse_start *)data.dptr;
+ struct ctdb_db_context *ctdb_db;
+ struct ctdb_traverse_local_handle *t;
+
+ ctdb_db = find_ctdb_db(ctdb, d->db_id);
+ if (ctdb_db == NULL) {
+ return -1;
+ }
+
+ for (t=ctdb_db->traverse; t; t=t->next) {
+ if (t->client_reqid == d->reqid &&
+ t->srvid == d->srvid) {
+ talloc_free(t);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
/*
  this is called when a client disconnects during a traverse
  we need to notify all the nodes taking part in the search that they
  should kill their traverse children
 */
static int ctdb_traverse_start_destructor(struct traverse_start_state *state)
{
	struct ctdb_traverse_start r;
	TDB_DATA data;

	DEBUG(DEBUG_ERR,(__location__ " Traverse cancelled by client disconnect for database:0x%08x\n", state->db_id));
	r.db_id = state->db_id;
	r.reqid = state->reqid;
	r.srvid = state->srvid;

	data.dptr = (uint8_t *)&r;
	data.dsize = sizeof(r);

	/* best-effort broadcast; the return value is deliberately
	 * ignored since nothing useful can be done about a send
	 * failure from a destructor */
	ctdb_daemon_send_control(state->ctdb, CTDB_BROADCAST_CONNECTED, 0,
				 CTDB_CONTROL_TRAVERSE_KILL,
				 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
	return 0;
}
+
+/*
+ callback which sends records as messages to the client
+ */
+static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+ struct traverse_start_state *state;
+ struct ctdb_rec_data_old *d;
+ TDB_DATA cdata;
+
+ state = talloc_get_type(p, struct traverse_start_state);
+
+ d = ctdb_marshall_record(state, state->reqid, key, NULL, data);
+ if (d == NULL) {
+ return;
+ }
+
+ cdata.dptr = (uint8_t *)d;
+ cdata.dsize = d->length;
+
+ srvid_dispatch(state->ctdb->srv, state->srvid, 0, cdata);
+ if (key.dsize == 0 && data.dsize == 0) {
+ DEBUG(DEBUG_NOTICE, ("Ending traverse on DB %s (id %d), records %d\n",
+ state->h->ctdb_db->db_name, state->h->reqid,
+ state->num_records));
+
+ if (state->h->timedout) {
+ /* timed out, send TRAVERSE_KILL control */
+ talloc_free(state);
+ } else {
+ /* end of traverse */
+ talloc_set_destructor(state, NULL);
+ talloc_free(state);
+ }
+ } else {
+ state->num_records++;
+ }
+}
+
+
/**
 * start a traverse_all - called as a control from a client.
 * extended version to take the "withemptyrecords" parameter.
 *
 * Returns 0 on success (records are then streamed to the client on
 * d->srvid), -1 on error.
 */
int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb,
					TDB_DATA data,
					TDB_DATA *outdata,
					uint32_t srcnode,
					uint32_t client_id)
{
	struct ctdb_traverse_start_ext *d = (struct ctdb_traverse_start_ext *)data.dptr;
	struct traverse_start_state *state;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);

	if (client == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " No client found\n"));
		return -1;
	}

	if (data.dsize != sizeof(*d)) {
		DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_start\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, d->db_id);
	if (ctdb_db == NULL) {
		return -1;
	}

	/* refuse to read an unhealthy DB unless the
	 * allow_unhealthy_db_read tunable permits it */
	if (ctdb_db->unhealthy_reason) {
		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_start: %s\n",
					 ctdb_db->db_name, ctdb_db->unhealthy_reason));
			return -1;
		}
		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_start: %s\n",
				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	/* parented on the client: a disconnect frees this state and
	 * fires ctdb_traverse_start_destructor */
	state = talloc(client, struct traverse_start_state);
	if (state == NULL) {
		return -1;
	}

	state->srcnode = srcnode;
	state->reqid = d->reqid;
	state->srvid = d->srvid;
	state->db_id = d->db_id;
	state->ctdb = ctdb;
	state->withemptyrecords = d->withemptyrecords;
	state->num_records = 0;

	state->h = ctdb_daemon_traverse_all(ctdb_db, traverse_start_callback, state);
	if (state->h == NULL) {
		talloc_free(state);
		return -1;
	}

	talloc_set_destructor(state, ctdb_traverse_start_destructor);

	return 0;
}
+
+/**
+ * start a traverse_all - called as a control from a client.
+ */
+int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb,
+ TDB_DATA data,
+ TDB_DATA *outdata,
+ uint32_t srcnode,
+ uint32_t client_id)
+{
+ struct ctdb_traverse_start *d = (struct ctdb_traverse_start *)data.dptr;
+ struct ctdb_traverse_start_ext d2;
+ TDB_DATA data2;
+
+ ZERO_STRUCT(d2);
+ d2.db_id = d->db_id;
+ d2.reqid = d->reqid;
+ d2.srvid = d->srvid;
+ d2.withemptyrecords = false;
+
+ data2.dsize = sizeof(d2);
+ data2.dptr = (uint8_t *)&d2;
+
+ return ctdb_control_traverse_start_ext(ctdb, data2, outdata, srcnode, client_id);
+}
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
new file mode 100644
index 0000000..0dce656
--- /dev/null
+++ b/ctdb/server/ctdb_tunables.c
@@ -0,0 +1,170 @@
+/*
+ ctdb tunables code
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tdb.h>
+
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/path.h"
+#include "common/tunable.h"
+
/*
  set all tunables to defaults

  Thin wrapper around the common-layer helper operating on this
  context's tunable table.
 */
void ctdb_tunables_set_defaults(struct ctdb_context *ctdb)
{
	ctdb_tunable_set_defaults(&ctdb->tunable);
}
+
+
/*
  get a tunable

  Looks up a tunable by name and returns its current value as a single
  uint32_t in outdata.  Returns 0 on success, -1 on a malformed request
  or allocation failure, and -EINVAL when the named tunable does not
  exist (lets callers distinguish "unknown tunable" from other errors).
 */
int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata,
				 TDB_DATA *outdata)
{
	struct ctdb_control_get_tunable *t =
		(struct ctdb_control_get_tunable *)indata.dptr;
	char *name;
	uint32_t val;
	bool ret;

	/* t->length describes the name blob following the fixed header;
	 * reject requests where it does not fit inside indata */
	if (indata.dsize < sizeof(*t) ||
	    t->length > indata.dsize - offsetof(struct ctdb_control_get_tunable, name)) {
		DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_get_tunable\n"));
		return -1;
	}

	/* the wire name is not NUL-terminated; make a bounded copy */
	name = talloc_strndup(ctdb, (char*)t->name, t->length);
	CTDB_NO_MEMORY(ctdb, name);

	ret = ctdb_tunable_get_value(&ctdb->tunable, name, &val);
	talloc_free(name);
	if (! ret) {
		return -EINVAL;
	}

	/* reply payload is a single uint32_t allocated on outdata */
	outdata->dptr = (uint8_t *)talloc(outdata, uint32_t);
	CTDB_NO_MEMORY(ctdb, outdata->dptr);

	*(uint32_t *)outdata->dptr = val;
	outdata->dsize = sizeof(uint32_t);

	return 0;
}
+
+
+/*
+ set a tunable
+ */
+int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_tunable_old *t =
+ (struct ctdb_tunable_old *)indata.dptr;
+ char *name;
+ int ret;
+ bool obsolete;
+
+ if (indata.dsize < sizeof(*t) ||
+ t->length > indata.dsize - offsetof(struct ctdb_tunable_old, name)) {
+ DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tunable\n"));
+ return -1;
+ }
+
+ name = talloc_strndup(ctdb, (char *)t->name, t->length);
+ CTDB_NO_MEMORY(ctdb, name);
+
+ ret = ctdb_tunable_set_value(&ctdb->tunable, name, t->value,
+ &obsolete);
+ if (! ret) {
+ talloc_free(name);
+ return -1;
+ }
+
+ if (obsolete) {
+ DEBUG(DEBUG_WARNING,
+ ("Setting obsolete tunable \"%s\"\n", name));
+ talloc_free(name);
+ return 1;
+ }
+
+ talloc_free(name);
+ return 0;
+}
+
+/*
+ list tunables
+ */
+int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+ char *list = NULL;
+ struct ctdb_control_list_tunable *t;
+
+ list = ctdb_tunable_names_to_string(outdata);
+ CTDB_NO_MEMORY(ctdb, list);
+
+ outdata->dsize = offsetof(struct ctdb_control_list_tunable, data) +
+ strlen(list) + 1;
+ outdata->dptr = talloc_size(outdata, outdata->dsize);
+ CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+ t = (struct ctdb_control_list_tunable *)outdata->dptr;
+ t->length = strlen(list)+1;
+
+ memcpy(t->data, list, t->length);
+ talloc_free(list);
+
+ return 0;
+}
+
+bool ctdb_tunables_load(struct ctdb_context *ctdb)
+{
+ bool status;
+ TALLOC_CTX *tmp_ctx;
+ char *file = NULL;
+
+ /* Fail by default */
+ status = false;
+
+ tmp_ctx = talloc_new(ctdb);
+ if (tmp_ctx == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ goto done;
+ }
+
+ file = path_etcdir_append(tmp_ctx, "ctdb.tunables");
+ if (file == NULL) {
+ D_ERR("Failed to construct path for ctdb.tunables\n");
+ goto done;
+ }
+
+ status = ctdb_tunable_load_file(tmp_ctx, &ctdb->tunable, file);
+ /* No need to log error, already logged above */
+
+done:
+ talloc_free(tmp_ctx);
+ return status;
+}
diff --git a/ctdb/server/ctdb_tunnel.c b/ctdb/server/ctdb_tunnel.c
new file mode 100644
index 0000000..2df9474
--- /dev/null
+++ b/ctdb/server/ctdb_tunnel.c
@@ -0,0 +1,141 @@
+/*
+ ctdb_tunnel protocol code
+
+ Copyright (C) Amitay Isaacs 2017
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+#include "common/reqid.h"
+#include "common/srvid.h"
+
+#include "ctdb_private.h"
+
/*
 * Handle CTDB_CONTROL_TUNNEL_REGISTER: associate tunnel_id with the
 * given client so incoming CTDB_REQ_TUNNEL packets for that id are
 * delivered to it.  Fails if the id is already registered.
 * Returns 0 on success, -1 on error.
 */
int32_t ctdb_control_tunnel_register(struct ctdb_context *ctdb,
				     uint32_t client_id, uint64_t tunnel_id)
{
	struct ctdb_client *client;
	int ret;

	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
	if (client == NULL) {
		DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_register\n"));
		return -1;
	}

	/* srvid_exists() returns 0 when the id is already taken */
	ret = srvid_exists(ctdb->tunnels, tunnel_id, NULL);
	if (ret == 0) {
		DEBUG(DEBUG_ERR,
		      ("Tunnel id 0x%"PRIx64" already registered\n",
		       tunnel_id));
		return -1;
	}

	/* NOTE(review): the registration appears to be parented on the
	 * client so a disconnect deregisters it automatically — confirm
	 * against srvid_register() semantics */
	ret = srvid_register(ctdb->tunnels, client, tunnel_id,
			     daemon_tunnel_handler, client);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to register tunnel id 0x%"PRIx64"\n",
		       tunnel_id));
		return -1;
	}

	DEBUG(DEBUG_INFO, ("Registered tunnel for id 0x%"PRIx64"\n",
			   tunnel_id));
	return 0;
}
+
+int32_t ctdb_control_tunnel_deregister(struct ctdb_context *ctdb,
+ uint32_t client_id, uint64_t tunnel_id)
+{
+ struct ctdb_client *client;
+ int ret;
+
+ client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_deregister\n"));
+ return -1;
+ }
+
+ ret = srvid_deregister(ctdb->tunnels, tunnel_id, client);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Failed to deregister tunnel id 0x%"PRIx64"\n",
+ tunnel_id));
+ return -1;
+ }
+
+ return 0;
+}
+
/*
 * Build and queue a CTDB_REQ_TUNNEL packet carrying 'data' to
 * destnode.  Returns 0 on success, -1 if the transport is down or
 * allocation fails.
 */
int ctdb_daemon_send_tunnel(struct ctdb_context *ctdb, uint32_t destnode,
			    uint64_t tunnel_id, uint32_t flags, TDB_DATA data)
{
	struct ctdb_req_tunnel_old *c;
	size_t len;

	if (ctdb->methods == NULL) {
		/* transport not initialised or already shut down */
		DEBUG(DEBUG_INFO,
		      ("Failed to send tunnel. Transport is DOWN\n"));
		return -1;
	}

	/* wire packet = fixed header + payload */
	len = offsetof(struct ctdb_req_tunnel_old, data) + data.dsize;
	c = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_TUNNEL, len,
				    struct ctdb_req_tunnel_old);
	if (c == NULL) {
		DEBUG(DEBUG_ERR,
		      ("Memory error in ctdb_daemon_send_tunnel()\n"));
		return -1;
	}

	c->hdr.destnode = destnode;
	c->tunnel_id = tunnel_id;
	c->flags = flags;
	c->datalen = data.dsize;
	memcpy(c->data, data.dptr, data.dsize);

	/* c is freed immediately after queueing; presumably
	 * ctdb_queue_packet copies the packet — confirm if changing
	 * this lifetime */
	ctdb_queue_packet(ctdb, &c->hdr);

	talloc_free(c);
	return 0;
}
+
/*
 * Dispatch an incoming CTDB_REQ_TUNNEL packet to the handler
 * registered for its tunnel_id.  Unregistered ids are logged and
 * dropped.
 */
void ctdb_request_tunnel(struct ctdb_context *ctdb,
			 struct ctdb_req_header *hdr)
{
	struct ctdb_req_tunnel_old *c =
		(struct ctdb_req_tunnel_old *)hdr;
	TDB_DATA data;
	int ret;

	/* the dispatched blob is the ENTIRE packet, header included;
	 * the registered handler parses the ctdb_req_tunnel_old
	 * framing itself */
	data.dsize = hdr->length;
	data.dptr = (uint8_t *)c;

	ret = srvid_dispatch(ctdb->tunnels, c->tunnel_id, 0, data);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Tunnel id 0x%"PRIx64" not registered\n",
				  c->tunnel_id));
	}
}
diff --git a/ctdb/server/ctdb_update_record.c b/ctdb/server/ctdb_update_record.c
new file mode 100644
index 0000000..405499c
--- /dev/null
+++ b/ctdb/server/ctdb_update_record.c
@@ -0,0 +1,372 @@
+/*
+ implementation of the update record control
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
/*
 * Context for one CTDB_CONTROL_UPDATE_RECORD request while a child
 * process performs the actual transactional write.
 */
struct ctdb_persistent_write_state {
	struct ctdb_db_context *ctdb_db;	/* database being written */
	struct ctdb_marshall_buffer *m;		/* marshalled records to store */
	struct ctdb_req_control_old *c;		/* control to reply to when done */
	uint32_t flags;				/* UPDATE_FLAGS_* below */
};

/* don't create/update records that do not exist locally */
#define UPDATE_FLAGS_REPLACE_ONLY 1
+
/*
  called from a child process to write the data

  Applies every record in state->m inside a single tdb transaction,
  enforcing the RSN ordering rule per record.  Returns 0 on success,
  -1 on any failure (the transaction is then cancelled/not committed).
 */
static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
{
	unsigned int i;
	int ret;
	struct ctdb_rec_data_old *rec = NULL;
	struct ctdb_marshall_buffer *m = state->m;

	/* all records are applied atomically in one transaction */
	ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb);
	if (ret == -1) {
		DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n",
				 state->ctdb_db->db_id));
		return -1;
	}

	for (i=0;i<m->count;i++) {
		struct ctdb_ltdb_header oldheader;
		struct ctdb_ltdb_header header;
		TDB_DATA key, data, olddata;
		TALLOC_CTX *tmp_ctx = talloc_new(state);

		/* rec is the iteration cursor; header/key/data receive
		 * the unpacked record */
		rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data);

		if (rec == NULL) {
			D_ERR("Failed to get next record %u for db_id 0x%08x "
			      "in ctdb_persistent_store\n",
			      i,
			      state->ctdb_db->db_id);
			talloc_free(tmp_ctx);
			goto failed;
		}

		/* we must check if the record exists or not because
		   ctdb_ltdb_fetch will unconditionally create a record
		 */
		if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) {
			TDB_DATA trec;
			trec = tdb_fetch(state->ctdb_db->ltdb->tdb, key);
			if (trec.dsize == 0) {
				/* record absent locally: skip, do not create */
				talloc_free(tmp_ctx);
				continue;
			}
			free(trec.dptr);
		}

		/* fetch the old header and ensure the rsn is less than the new rsn */
		ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n",
					 state->ctdb_db->db_id));
			talloc_free(tmp_ctx);
			goto failed;
		}

		/* only reject a stale update when the payload actually
		 * differs; re-storing identical data with an equal or
		 * larger old RSN is allowed through */
		if (oldheader.rsn >= header.rsn &&
		    (olddata.dsize != data.dsize ||
		     memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) {
			DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n",
					  state->ctdb_db->db_id,
					  (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn));
			talloc_free(tmp_ctx);
			goto failed;
		}

		talloc_free(tmp_ctx);

		ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data);
		if (ret != 0) {
			DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n",
					  state->ctdb_db->db_id));
			goto failed;
		}
	}

	ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb);
	if (ret == -1) {
		DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n",
				 state->ctdb_db->db_id));
		return -1;
	}

	return 0;

failed:
	tdb_transaction_cancel(state->ctdb_db->ltdb->tdb);
	return -1;
}
+
+
+/*
+ called when we the child has completed the persistent write
+ on our behalf
+ */
+static void ctdb_persistent_write_callback(int status, void *private_data)
+{
+ struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
+ struct ctdb_persistent_write_state);
+
+
+ ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL);
+
+ talloc_free(state);
+}
+
/*
  called if our lockwait child times out

  Replies with an error; freeing the state also frees the childwrite
  handle parented on it, whose destructor kills the child process.
 */
static void ctdb_persistent_lock_timeout(struct tevent_context *ev,
					 struct tevent_timer *te,
					 struct timeval t, void *private_data)
{
	struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
								    struct ctdb_persistent_write_state);
	ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock");
	talloc_free(state);
}
+
/*
 * Tracks one forked child performing a persistent write, plus the
 * pipe used to read its single-byte completion status.
 */
struct childwrite_handle {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	struct tevent_fd *fde;		/* watches fd[0] for the status byte */
	int fd[2];			/* pipe: child writes fd[1], parent reads fd[0] */
	pid_t child;
	void *private_data;		/* passed through to callback */
	void (*callback)(int, void *);	/* invoked with the child's status */
	struct timeval start_time;	/* for latency statistics */
};
+
/*
 * Destructor: if the handle is freed while the child is still running
 * (timeout/cancel path), fix up the pending-call statistic and make
 * sure the child does not linger.
 */
static int childwrite_destructor(struct childwrite_handle *h)
{
	CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
	ctdb_kill(h->ctdb, h->child, SIGKILL);
	return 0;
}
+
/* called when the child process has finished writing the record to the
   database
*/
static void childwrite_handler(struct tevent_context *ev,
			       struct tevent_fd *fde,
			       uint16_t flags, void *private_data)
{
	struct childwrite_handle *h = talloc_get_type(private_data,
						      struct childwrite_handle);
	void *p = h->private_data;
	void (*callback)(int, void *) = h->callback;
	pid_t child = h->child;
	TALLOC_CTX *tmp_ctx = talloc_new(ev);
	int ret;
	char c;

	CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time);
	CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);

	/* the handle needs to go away when the context is gone - when
	   the handle goes away this implicitly closes the pipe, which
	   kills the child */
	talloc_steal(tmp_ctx, h);

	/* disable the destructor: it would double-decrement the stat
	 * above, and we kill the child explicitly below */
	talloc_set_destructor(h, NULL);

	/* the child reports a single status byte: 0 = success,
	 * non-zero = failure; a short read also counts as failure */
	ret = sys_read(h->fd[0], &c, 1);
	if (ret < 1) {
		DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
		c = 1;
	}

	callback(c, p);

	ctdb_kill(h->ctdb, child, SIGKILL);
	talloc_free(tmp_ctx);
}
+
/* this creates a child process which will take out a tdb transaction
   and write the record to the database.

   Returns a handle parented on 'state', or NULL on error.  The
   callback is invoked with the child's status byte once it completes.
*/
static struct childwrite_handle *ctdb_childwrite(
	struct ctdb_db_context *ctdb_db,
	void (*callback)(int, void *private_data),
	struct ctdb_persistent_write_state *state)
{
	struct childwrite_handle *result;
	int ret;
	pid_t parent = getpid();

	CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls);
	CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);

	if (!(result = talloc_zero(state, struct childwrite_handle))) {
		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
		return NULL;
	}

	/* status pipe: the child reports success/failure with one byte */
	ret = pipe(result->fd);

	if (ret != 0) {
		talloc_free(result);
		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
		return NULL;
	}

	result->child = ctdb_fork(ctdb_db->ctdb);

	if (result->child == (pid_t)-1) {
		close(result->fd[0]);
		close(result->fd[1]);
		talloc_free(result);
		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
		return NULL;
	}

	result->callback = callback;
	result->private_data = state;
	result->ctdb = ctdb_db->ctdb;
	result->ctdb_db = ctdb_db;

	if (result->child == 0) {
		/* child: perform the transactional write, report the
		 * status byte, then linger until the parent is gone
		 * (the parent normally SIGKILLs us from
		 * childwrite_handler) */
		char c = 0;

		close(result->fd[0]);
		prctl_set_comment("ctdb_write_persistent");
		ret = ctdb_persistent_store(state);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
			c = 1;
		}

		sys_write(result->fd[1], &c, 1);

		ctdb_wait_for_process_to_exit(parent);
		_exit(0);
	}

	/* parent: keep only the read end and watch it for the status byte */
	close(result->fd[1]);
	set_close_on_exec(result->fd[0]);

	talloc_set_destructor(result, childwrite_destructor);

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0]));

	result->fde = tevent_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
				    TEVENT_FD_READ, childwrite_handler,
				    (void *)result);
	if (result->fde == NULL) {
		talloc_free(result);
		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
		return NULL;
	}
	tevent_fd_set_auto_close(result->fde);

	result->start_time = timeval_current();

	return result;
}
+
/*
  update a record on this node if the new record has a higher rsn than the
  current record

  The write happens asynchronously in a child process; on success
  *async_reply is set and the reply is sent later from
  ctdb_persistent_write_callback (or the timeout handler).
 */
int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
				   struct ctdb_req_control_old *c, TDB_DATA recdata,
				   bool *async_reply)
{
	struct ctdb_db_context *ctdb_db;
	struct ctdb_persistent_write_state *state;
	struct childwrite_handle *handle;
	struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;

	/* no writes while a recovery is in progress */
	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, m->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id));
		return -1;
	}

	if (ctdb_db->unhealthy_reason) {
		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
		return -1;
	}

	state = talloc(ctdb, struct ctdb_persistent_write_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->ctdb_db = ctdb_db;
	state->c = c;
	state->m = m;
	state->flags = 0;
	/* for volatile databases only replace records that already
	 * exist locally, never create new ones */
	if (ctdb_db_volatile(ctdb_db)) {
		state->flags = UPDATE_FLAGS_REPLACE_ONLY;
	}

	/* create a child process to take out a transaction and
	   write the data.
	 */
	handle = ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state);
	if (handle == NULL) {
		DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
		talloc_free(state);
		return -1;
	}

	/* we need to wait for the replies */
	*async_reply = true;

	/* need to keep the control structure around */
	talloc_steal(state, c);

	/* but we won't wait forever */
	tevent_add_timer(ctdb->ev, state,
			 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
			 ctdb_persistent_lock_timeout, state);

	return 0;
}
+
diff --git a/ctdb/server/ctdb_uptime.c b/ctdb/server/ctdb_uptime.c
new file mode 100644
index 0000000..53025f5
--- /dev/null
+++ b/ctdb/server/ctdb_uptime.c
@@ -0,0 +1,55 @@
+/*
+ ctdb uptime code
+
+ Copyright (C) Ronnie Sahlberg 2008
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/syslog.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/network.h"
+
+#include <talloc.h>
+
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ returns the ctdb uptime
+*/
+int32_t ctdb_control_uptime(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+ struct ctdb_uptime *uptime;
+
+ uptime = talloc_zero(outdata, struct ctdb_uptime);
+ CTDB_NO_MEMORY(ctdb, uptime);
+
+ gettimeofday(&uptime->current_time, NULL);
+ uptime->ctdbd_start_time = ctdb->ctdbd_start_time;
+ uptime->last_recovery_started = ctdb->last_recovery_started;
+ uptime->last_recovery_finished = ctdb->last_recovery_finished;
+
+ outdata->dsize = sizeof(struct ctdb_uptime);
+ outdata->dptr = (uint8_t *)uptime;
+
+ return 0;
+}
diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c
new file mode 100644
index 0000000..7ff79ac
--- /dev/null
+++ b/ctdb/server/ctdb_vacuum.c
@@ -0,0 +1,1990 @@
+/*
+ ctdb vacuuming events
+
+ Copyright (C) Ronnie Sahlberg 2009
+ Copyright (C) Michael Adam 2010-2013
+ Copyright (C) Stefan Metzmacher 2010-2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_private.h"
+
+#include "common/rb_tree.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_api.h"
+
+/* default timeout for the local ctdb controls issued while vacuuming */
+#define TIMELIMIT() timeval_current_ofs(10, 0)
+
+/* status a vacuum child process reports back to the main daemon */
+enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT};
+
+/* per-run state for one forked vacuum child */
+struct ctdb_vacuum_child_context {
+	struct ctdb_vacuum_handle *vacuum_handle;
+	/* fd child writes status to */
+	int fd[2];
+	pid_t child_pid;
+	enum vacuum_child_status status;
+	struct timeval start_time;
+	bool scheduled;
+};
+
+/* long-lived per-database vacuuming handle */
+struct ctdb_vacuum_handle {
+	struct ctdb_db_context *ctdb_db;
+	/* counts runs since the last full (traversing) vacuum run */
+	uint32_t fast_path_count;
+	uint32_t vacuum_interval;
+};
+
+
+/* a list of records to possibly delete */
+struct vacuum_data {
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	struct tdb_context *dest_db;
+	/* records we are lmaster+dmaster for, keyed by key hash */
+	trbt_tree_t *delete_list;
+	/* one marshall buffer per node: records to send to their lmaster */
+	struct ctdb_marshall_buffer **vacuum_fetch_list;
+	struct timeval start;
+	bool traverse_error;
+	bool vacuum;
+	/* statistics gathered during the run, grouped per processing stage */
+	struct {
+		struct {
+			uint32_t added_to_vacuum_fetch_list;
+			uint32_t added_to_delete_list;
+			uint32_t deleted;
+			uint32_t skipped;
+			uint32_t error;
+			uint32_t total;
+		} delete_queue;
+		struct {
+			uint32_t scheduled;
+			uint32_t skipped;
+			uint32_t error;
+			uint32_t total;
+		} db_traverse;
+		struct {
+			uint32_t total;
+			uint32_t remote_error;
+			uint32_t local_error;
+			uint32_t deleted;
+			uint32_t skipped;
+			uint32_t left;
+		} delete_list;
+		struct {
+			uint32_t vacuumed;
+			uint32_t copied;
+		} repack;
+	} count;
+};
+
+/* this structure contains the information for one record to be deleted */
+struct delete_record_data {
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_ltdb_header hdr;
+	/* number of remote nodes that failed to delete this record */
+	uint32_t remote_fail_count;
+	/* key.dptr points at keydata below; the struct is allocated with
+	 * the key bytes appended (see insert_delete_record_data_into_tree) */
+	TDB_DATA key;
+	uint8_t keydata[1];
+};
+
+/* wraps a marshall buffer so tree-traverse callbacks can grow it */
+struct delete_records_list {
+	struct ctdb_marshall_buffer *records;
+	struct vacuum_data *vdata;
+};
+
+/* one entry of the fetch queue: key bytes appended to the allocation */
+struct fetch_record_data {
+	TDB_DATA key;
+	uint8_t keydata[1];
+};
+
+static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ TDB_DATA key);
+
+/**
+ * Store key and header in a tree, indexed by the key hash.
+ *
+ * Allocates a delete_record_data as a child of the tree, with the key
+ * bytes copied into the trailing keydata[] area, and inserts it under
+ * the 32-bit ctdb hash of the key.  Returns 0 on success, -1 on OOM.
+ *
+ * NOTE(review): callers are expected to check for hash collisions
+ * before calling (see add_record_to_delete_list) — verify what
+ * trbt_insert32() does with a duplicate hash.
+ */
+static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb,
+					       struct ctdb_db_context *ctdb_db,
+					       trbt_tree_t *tree,
+					       const struct ctdb_ltdb_header *hdr,
+					       TDB_DATA key)
+{
+	struct delete_record_data *dd;
+	uint32_t hash;
+	size_t len;
+
+	/* single allocation: struct plus the variable-length key bytes */
+	len = offsetof(struct delete_record_data, keydata) + key.dsize;
+
+	dd = (struct delete_record_data *)talloc_size(tree, len);
+	if (dd == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return -1;
+	}
+	talloc_set_name_const(dd, "struct delete_record_data");
+
+	dd->ctdb = ctdb;
+	dd->ctdb_db = ctdb_db;
+	dd->key.dsize = key.dsize;
+	dd->key.dptr = dd->keydata;
+	memcpy(dd->keydata, key.dptr, key.dsize);
+
+	dd->hdr = *hdr;
+	dd->remote_fail_count = 0;
+
+	hash = ctdb_hash(&key);
+
+	trbt_insert32(tree, hash, dd);
+
+	return 0;
+}
+
+/**
+ * Add a record to vdata->delete_list, the list of records this node is
+ * lmaster and dmaster for and that are to be deleted cluster-wide.
+ *
+ * Records whose key hash collides with one already in the list are
+ * silently skipped (returns 0); they will be picked up by a later
+ * vacuum run.  Returns -1 only on allocation failure.
+ */
+static int add_record_to_delete_list(struct vacuum_data *vdata, TDB_DATA key,
+				     struct ctdb_ltdb_header *hdr)
+{
+	struct ctdb_context *ctdb = vdata->ctdb;
+	struct ctdb_db_context *ctdb_db = vdata->ctdb_db;
+	uint32_t hash;
+	int ret;
+
+	hash = ctdb_hash(&key);
+
+	if (trbt_lookup32(vdata->delete_list, hash)) {
+		DEBUG(DEBUG_INFO, (__location__ " Hash collision when vacuuming, skipping this record.\n"));
+		return 0;
+	}
+
+	ret = insert_delete_record_data_into_tree(ctdb, ctdb_db,
+						  vdata->delete_list,
+						  hdr, key);
+	if (ret != 0) {
+		return -1;
+	}
+
+	vdata->count.delete_list.total++;
+
+	return 0;
+}
+
+/**
+ * Add a record to the list of records to be sent
+ * to their lmaster with VACUUM_FETCH.
+ *
+ * The record key (no data) is marshalled into the per-lmaster buffer
+ * in vdata->vacuum_fetch_list.  On OOM the whole traverse is aborted
+ * by setting vdata->traverse_error and returning -1.
+ */
+static int add_record_to_vacuum_fetch_list(struct vacuum_data *vdata,
+					   TDB_DATA key)
+{
+	struct ctdb_context *ctdb = vdata->ctdb;
+	uint32_t lmaster;
+	struct ctdb_marshall_buffer *vfl;
+
+	lmaster = ctdb_lmaster(ctdb, &key);
+
+	vfl = vdata->vacuum_fetch_list[lmaster];
+
+	/* key only, empty data: the lmaster just needs to know what to fetch */
+	vfl = ctdb_marshall_add(ctdb, vfl, vfl->db_id, ctdb->pnn,
+				key, NULL, tdb_null);
+	if (vfl == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		vdata->traverse_error = true;
+		return -1;
+	}
+
+	vdata->vacuum_fetch_list[lmaster] = vfl;
+
+	return 0;
+}
+
+
+static void ctdb_vacuum_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+
+/*
+ * tdb_parse_record() callback used by the vacuuming code: copy out the
+ * ltdb header of a record that consists of a header and nothing else.
+ * Any record whose data is not exactly one header (i.e. a record that
+ * still carries payload) is rejected with -1.
+ */
+static int vacuum_record_parser(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	struct ctdb_ltdb_header *hdr = private_data;
+
+	if (data.dsize != sizeof(*hdr)) {
+		return -1;
+	}
+
+	memcpy(hdr, data.dptr, sizeof(*hdr));
+	return 0;
+}
+
+/*
+ * traverse function for gathering the records that can be deleted
+ *
+ * Called for every record during a full vacuum run.  A record is a
+ * deletion candidate if its data consists of nothing but an ltdb
+ * header (empty payload) and this node is its dmaster.  Candidates
+ * are pushed onto the per-db in-memory delete_queue, which the fast
+ * vacuum path processes afterwards.
+ */
+static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+			   void *private_data)
+{
+	struct vacuum_data *vdata = talloc_get_type(private_data,
+						    struct vacuum_data);
+	struct ctdb_context *ctdb = vdata->ctdb;
+	struct ctdb_db_context *ctdb_db = vdata->ctdb_db;
+	uint32_t lmaster;
+	struct ctdb_ltdb_header *hdr;
+	int res = 0;
+
+	vdata->count.db_traverse.total++;
+
+	lmaster = ctdb_lmaster(ctdb, &key);
+	if (lmaster >= ctdb->num_nodes) {
+		/* corrupt vnnmap/key: abort the whole traverse */
+		vdata->count.db_traverse.error++;
+		DEBUG(DEBUG_CRIT, (__location__
+				   " lmaster[%u] >= ctdb->num_nodes[%u] for key"
+				   " with hash[%u]!\n",
+				   (unsigned)lmaster,
+				   (unsigned)ctdb->num_nodes,
+				   (unsigned)ctdb_hash(&key)));
+		return -1;
+	}
+
+	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+		/* it is not a deleted record */
+		vdata->count.db_traverse.skipped++;
+		return 0;
+	}
+
+	hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+	if (hdr->dmaster != ctdb->pnn) {
+		/* only the dmaster may queue a record for deletion */
+		vdata->count.db_traverse.skipped++;
+		return 0;
+	}
+
+	/*
+	 * Add the record to this process's delete_queue for processing
+	 * in the subsequent traverse in the fast vacuum run.
+	 */
+	res = insert_record_into_delete_queue(ctdb_db, hdr, key);
+	if (res != 0) {
+		vdata->count.db_traverse.error++;
+	} else {
+		vdata->count.db_traverse.scheduled++;
+	}
+
+	return 0;
+}
+
+/*
+ * traverse the tree of records to delete and marshall them into
+ * a blob
+ *
+ * rb-tree traverse callback: appends one delete_record_data (key plus
+ * ltdb header, no payload) to recs->records.  ctdb_marshall_add() may
+ * reallocate, so the buffer pointer is stored back on success.
+ */
+static int delete_marshall_traverse(void *param, void *data)
+{
+	struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
+	struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
+	struct ctdb_marshall_buffer *m;
+
+	m = ctdb_marshall_add(recs, recs->records, recs->records->db_id,
+			      recs->records->db_id,
+			      dd->key, &dd->hdr, tdb_null);
+	if (m == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n"));
+		return -1;
+	}
+
+	recs->records = m;
+	return 0;
+}
+
+/* shared state for one pass over the fetch queue; count tracks the
+ * number of in-flight migration calls still awaiting a callback */
+struct fetch_queue_state {
+	struct ctdb_db_context *ctdb_db;
+	int count;
+};
+
+/* per-record state carried through an async record-migration call */
+struct fetch_record_migrate_state {
+	struct fetch_queue_state *fetch_queue;
+	/* deep copy of the record key (dptr allocated on this struct) */
+	TDB_DATA key;
+};
+
+/*
+ * Completion callback for the migration call sent by
+ * fetch_queue_traverse().
+ *
+ * Always decrements the in-flight counter (ctdb_process_fetch_queue
+ * loops until it reaches zero).  If the migration succeeded and the
+ * local copy of the record is now empty (header only), the record is
+ * scheduled for deletion; on any failure along the way the record is
+ * simply dropped and left for a later vacuum run.
+ */
+static void fetch_record_migrate_callback(struct ctdb_client_call_state *state)
+{
+	struct fetch_record_migrate_state *fetch = talloc_get_type_abort(
+		state->async.private_data, struct fetch_record_migrate_state);
+	struct fetch_queue_state *fetch_queue = fetch->fetch_queue;
+	struct ctdb_ltdb_header hdr;
+	struct ctdb_call call = { 0 };
+	int ret;
+
+	ret = ctdb_call_recv(state, &call);
+	fetch_queue->count--;
+	if (ret != 0) {
+		D_ERR("Failed to migrate record for vacuuming\n");
+		goto done;
+	}
+
+	/* non-blocking: if someone else holds the chain, skip the record */
+	ret = tdb_chainlock_nonblock(fetch_queue->ctdb_db->ltdb->tdb,
+				     fetch->key);
+	if (ret != 0) {
+		goto done;
+	}
+
+	/* parser fails (-1) unless the record is header-only, i.e. empty */
+	ret = tdb_parse_record(fetch_queue->ctdb_db->ltdb->tdb,
+			       fetch->key,
+			       vacuum_record_parser,
+			       &hdr);
+
+	tdb_chainunlock(fetch_queue->ctdb_db->ltdb->tdb, fetch->key);
+
+	if (ret != 0) {
+		goto done;
+	}
+
+	D_INFO("Vacuum Fetch record, key=%.*s\n",
+	       (int)fetch->key.dsize,
+	       fetch->key.dptr);
+
+	/* best effort: failure just defers deletion to a later run */
+	(void) ctdb_local_schedule_for_deletion(fetch_queue->ctdb_db,
+						&hdr,
+						fetch->key);
+
+done:
+	talloc_free(fetch);
+}
+
+/*
+ * tdb_parse_record() callback: copy out the leading ltdb header of a
+ * record.  Unlike vacuum_record_parser() this accepts records that
+ * still carry payload after the header; only truncated records fail.
+ */
+static int fetch_record_parser(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	struct ctdb_ltdb_header *header =
+		(struct ctdb_ltdb_header *)private_data;
+
+	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+		return -1;
+	}
+
+	memcpy(header, data.dptr, sizeof(*header));
+	return 0;
+}
+
+/**
+ * traverse function for the traversal of the fetch_queue.
+ *
+ * Send a record migration request.
+ *
+ * Skips records that cannot be chain-locked without blocking, cannot
+ * be parsed, or are already dmastered locally.  For the rest it fires
+ * an async CTDB_NULL_FUNC call flagged as a vacuum migration; the
+ * completion is handled by fetch_record_migrate_callback(), which
+ * also drops the in-flight count incremented here.  All failures
+ * return 0 so the traverse continues with the next record.
+ */
+static int fetch_queue_traverse(void *param, void *data)
+{
+	struct fetch_record_data *rd = talloc_get_type_abort(
+		data, struct fetch_record_data);
+	struct fetch_queue_state *fetch_queue =
+		(struct fetch_queue_state *)param;
+	struct ctdb_db_context *ctdb_db = fetch_queue->ctdb_db;
+	struct ctdb_client_call_state *state;
+	struct fetch_record_migrate_state *fetch;
+	struct ctdb_call call = { 0 };
+	struct ctdb_ltdb_header header;
+	int ret;
+
+	ret = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, rd->key);
+	if (ret != 0) {
+		/* record is busy; leave it for a later run */
+		return 0;
+	}
+
+	ret = tdb_parse_record(ctdb_db->ltdb->tdb,
+			       rd->key,
+			       fetch_record_parser,
+			       &header);
+
+	tdb_chainunlock(ctdb_db->ltdb->tdb, rd->key);
+
+	if (ret != 0) {
+		goto skipped;
+	}
+
+	if (header.dmaster == ctdb_db->ctdb->pnn) {
+		/* If the record is already migrated, skip */
+		goto skipped;
+	}
+
+	fetch = talloc_zero(ctdb_db, struct fetch_record_migrate_state);
+	if (fetch == NULL) {
+		D_ERR("Failed to setup fetch record migrate state\n");
+		return 0;
+	}
+
+	fetch->fetch_queue = fetch_queue;
+
+	/* deep-copy the key: rd may be freed before the callback runs */
+	fetch->key.dsize = rd->key.dsize;
+	fetch->key.dptr = talloc_memdup(fetch, rd->key.dptr, rd->key.dsize);
+	if (fetch->key.dptr == NULL) {
+		D_ERR("Memory error in fetch_queue_traverse\n");
+		talloc_free(fetch);
+		return 0;
+	}
+
+	call.call_id = CTDB_NULL_FUNC;
+	call.flags = CTDB_IMMEDIATE_MIGRATION |
+		     CTDB_CALL_FLAG_VACUUM_MIGRATION;
+	call.key = fetch->key;
+
+	state = ctdb_call_send(ctdb_db, &call);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to setup vacuum fetch call\n"));
+		talloc_free(fetch);
+		return 0;
+	}
+
+	state->async.fn = fetch_record_migrate_callback;
+	state->async.private_data = fetch;
+
+	/* balanced by the decrement in fetch_record_migrate_callback() */
+	fetch_queue->count++;
+
+	return 0;
+
+skipped:
+	D_INFO("Skipped Fetch record, key=%.*s\n",
+	       (int)rd->key.dsize,
+	       rd->key.dptr);
+	return 0;
+}
+
+/**
+ * Traverse the fetch.
+ * Records are migrated to the local node and
+ * added to delete queue for further processing.
+ *
+ * Blocks until every migration call issued by fetch_queue_traverse()
+ * has completed, by spinning a nested tevent loop on the in-flight
+ * counter (decremented in fetch_record_migrate_callback()).
+ */
+static void ctdb_process_fetch_queue(struct ctdb_db_context *ctdb_db)
+{
+	struct fetch_queue_state state;
+	int ret;
+
+	state.ctdb_db = ctdb_db;
+	state.count = 0;
+
+	ret = trbt_traversearray32(ctdb_db->fetch_queue, 1,
+				   fetch_queue_traverse, &state);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing "
+				  "the fetch queue.\n"));
+	}
+
+	/* Wait for all migrations to complete */
+	while (state.count > 0) {
+		tevent_loop_once(ctdb_db->ctdb->ev);
+	}
+}
+
+/**
+ * traverse function for the traversal of the delete_queue,
+ * the fast-path vacuuming list.
+ *
+ * - If the record has been migrated off the node
+ *   or has been revived (filled with data) on the node,
+ *   then skip the record.
+ *
+ * - If the current node is the record's lmaster and it is
+ *   a record that has never been migrated with data, then
+ *   delete the record from the local tdb.
+ *
+ * - If the current node is the record's lmaster and it has
+ *   been migrated with data, then schedule it for the normal
+ *   vacuuming procedure (i.e. add it to the delete_list).
+ *
+ * - If the current node is NOT the record's lmaster then
+ *   add it to the list of records that are to be sent to
+ *   the lmaster with the VACUUM_FETCH message.
+ */
+static int delete_queue_traverse(void *param, void *data)
+{
+	struct delete_record_data *dd =
+		talloc_get_type(data, struct delete_record_data);
+	struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+	struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+	struct ctdb_context *ctdb = ctdb_db->ctdb; /* or dd->ctdb ??? */
+	int res;
+	struct ctdb_ltdb_header header;
+	uint32_t lmaster;
+	uint32_t hash = ctdb_hash(&(dd->key));
+
+	vdata->count.delete_queue.total++;
+
+	/* non-blocking: a busy record is counted as an error and retried
+	 * on a later vacuum run */
+	res = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, dd->key);
+	if (res != 0) {
+		vdata->count.delete_queue.error++;
+		return 0;
+	}
+
+	/* parser fails unless the record is still empty (header only) */
+	res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key,
+			       vacuum_record_parser, &header);
+	if (res != 0) {
+		goto skipped;
+	}
+
+	if (header.dmaster != ctdb->pnn) {
+		/* The record has been migrated off the node. Skip. */
+		goto skipped;
+	}
+
+	if (header.rsn != dd->hdr.rsn) {
+		/*
+		 * The record has been migrated off the node and back again.
+		 * But not requeued for deletion. Skip it.
+		 */
+		goto skipped;
+	}
+
+	/*
+	 * We are dmaster, and the record has no data, and it has
+	 * not been migrated after it has been queued for deletion.
+	 *
+	 * At this stage, the record could still have been revived locally
+	 * and last been written with empty data. This can only be
+	 * fixed with the addition of an active or delete flag. (TODO)
+	 */
+
+	lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+	if (lmaster != ctdb->pnn) {
+		res = add_record_to_vacuum_fetch_list(vdata, dd->key);
+
+		if (res != 0) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " Error adding record to list "
+			       "of records to send to lmaster.\n"));
+			vdata->count.delete_queue.error++;
+		} else {
+			vdata->count.delete_queue.added_to_vacuum_fetch_list++;
+		}
+		goto done;
+	}
+
+	/* use header->flags or dd->hdr.flags ?? */
+	if (dd->hdr.flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+		res = add_record_to_delete_list(vdata, dd->key, &dd->hdr);
+
+		if (res != 0) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " Error adding record to list "
+			       "of records for deletion on lmaster.\n"));
+			vdata->count.delete_queue.error++;
+		} else {
+			vdata->count.delete_queue.added_to_delete_list++;
+		}
+	} else {
+		/* never migrated with data: no other node can hold a copy,
+		 * so it is safe to delete locally right away */
+		res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+		if (res != 0) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " Error deleting record with key "
+			       "hash [0x%08x] from local data base db[%s].\n",
+			       hash, ctdb_db->db_name));
+			vdata->count.delete_queue.error++;
+			goto done;
+		}
+
+		DEBUG(DEBUG_DEBUG,
+		      (__location__ " Deleted record with key hash "
+		       "[0x%08x] from local data base db[%s].\n",
+		       hash, ctdb_db->db_name));
+		vdata->count.delete_queue.deleted++;
+	}
+
+	goto done;
+
+skipped:
+	vdata->count.delete_queue.skipped++;
+
+done:
+	tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+	return 0;
+}
+
+/**
+ * Delete the records that we are lmaster and dmaster for and
+ * that could be deleted on all other nodes via the TRY_DELETE_RECORDS
+ * control.
+ *
+ * Runs after the TRY_DELETE_RECORDS round in ctdb_process_delete_list():
+ * any record a remote node failed to delete (remote_fail_count > 0) is
+ * dropped here, everything else is re-validated under the chainlock
+ * (still empty, same RSN, still lmaster+dmaster, no read-only flags)
+ * and then deleted locally.  Each entry is freed and the "left"
+ * counter decremented exactly once, on every path.
+ */
+static int delete_record_traverse(void *param, void *data)
+{
+	struct delete_record_data *dd =
+		talloc_get_type(data, struct delete_record_data);
+	struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+	struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int res;
+	struct ctdb_ltdb_header header;
+	uint32_t lmaster;
+	uint32_t hash = ctdb_hash(&(dd->key));
+
+	if (dd->remote_fail_count > 0) {
+		/* some remote node still holds a copy: must not delete */
+		vdata->count.delete_list.remote_error++;
+		vdata->count.delete_list.left--;
+		talloc_free(dd);
+		return 0;
+	}
+
+	/* blocking lock here, unlike the fast-path traverse */
+	res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Error getting chainlock on record with "
+		       "key hash [0x%08x] on database db[%s].\n",
+		       hash, ctdb_db->db_name));
+		vdata->count.delete_list.local_error++;
+		vdata->count.delete_list.left--;
+		talloc_free(dd);
+		return 0;
+	}
+
+	/*
+	 * Verify that the record is still empty, its RSN has not
+	 * changed and that we are still its lmaster and dmaster.
+	 */
+
+	res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key,
+			       vacuum_record_parser, &header);
+	if (res != 0) {
+		goto skip;
+	}
+
+	if (header.flags & CTDB_REC_RO_FLAGS) {
+		DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+				   "on database db[%s] has read-only flags. "
+				   "skipping.\n",
+				   hash, ctdb_db->db_name));
+		goto skip;
+	}
+
+	if (header.dmaster != ctdb->pnn) {
+		DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+				   "on database db[%s] has been migrated away. "
+				   "skipping.\n",
+				   hash, ctdb_db->db_name));
+		goto skip;
+	}
+
+	if (header.rsn != dd->hdr.rsn) {
+		/*
+		 * The record has been migrated off the node and back again.
+		 * But not requeued for deletion. Skip it.
+		 */
+		DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+				   "on database db[%s] seems to have been "
+				   "migrated away and back again (with empty "
+				   "data). skipping.\n",
+				   hash, ctdb_db->db_name));
+		goto skip;
+	}
+
+	lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+	if (lmaster != ctdb->pnn) {
+		DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in "
+				   "delete list (key hash [0x%08x], db[%s]). "
+				   "Strange! skipping.\n",
+				   hash, ctdb_db->db_name));
+		goto skip;
+	}
+
+	res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Error deleting record with key hash "
+		       "[0x%08x] from local data base db[%s].\n",
+		       hash, ctdb_db->db_name));
+		vdata->count.delete_list.local_error++;
+		goto done;
+	}
+
+	DEBUG(DEBUG_DEBUG,
+	      (__location__ " Deleted record with key hash [0x%08x] from "
+	       "local data base db[%s].\n", hash, ctdb_db->db_name));
+
+	vdata->count.delete_list.deleted++;
+	goto done;
+
+skip:
+	vdata->count.delete_list.skipped++;
+
+done:
+	tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+	talloc_free(dd);
+	vdata->count.delete_list.left--;
+
+	return 0;
+}
+
+/**
+ * Traverse the delete_queue.
+ * Records are either deleted directly or filled
+ * into the delete list or the vacuum fetch lists
+ * for further processing.
+ *
+ * Afterwards the per-outcome counters are cross-checked against the
+ * total; a mismatch is logged as an internal accounting bug but does
+ * not abort the run.
+ */
+static void ctdb_process_delete_queue(struct ctdb_db_context *ctdb_db,
+				      struct vacuum_data *vdata)
+{
+	uint32_t sum;
+	int ret;
+
+	ret = trbt_traversearray32(ctdb_db->delete_queue, 1,
+				   delete_queue_traverse, vdata);
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing "
+				  "the delete queue.\n"));
+	}
+
+	/* every queue entry must land in exactly one outcome bucket */
+	sum = vdata->count.delete_queue.deleted
+	    + vdata->count.delete_queue.skipped
+	    + vdata->count.delete_queue.error
+	    + vdata->count.delete_queue.added_to_delete_list
+	    + vdata->count.delete_queue.added_to_vacuum_fetch_list;
+
+	if (vdata->count.delete_queue.total != sum) {
+		DEBUG(DEBUG_ERR, (__location__ " Inconsistency in fast vacuum "
+				  "counts for db[%s]: total[%u] != sum[%u]\n",
+				  ctdb_db->db_name,
+				  (unsigned)vdata->count.delete_queue.total,
+				  (unsigned)sum));
+	}
+
+	if (vdata->count.delete_queue.total > 0) {
+		DEBUG(DEBUG_INFO,
+		      (__location__
+		       " fast vacuuming delete_queue traverse statistics: "
+		       "db[%s] "
+		       "total[%u] "
+		       "del[%u] "
+		       "skp[%u] "
+		       "err[%u] "
+		       "adl[%u] "
+		       "avf[%u]\n",
+		       ctdb_db->db_name,
+		       (unsigned)vdata->count.delete_queue.total,
+		       (unsigned)vdata->count.delete_queue.deleted,
+		       (unsigned)vdata->count.delete_queue.skipped,
+		       (unsigned)vdata->count.delete_queue.error,
+		       (unsigned)vdata->count.delete_queue.added_to_delete_list,
+		       (unsigned)vdata->count.delete_queue.added_to_vacuum_fetch_list));
+	}
+
+	return;
+}
+
+/**
+ * read-only traverse of the database, looking for records that
+ * might be able to be vacuumed.
+ *
+ * This is not done each time but only every tunable
+ * VacuumFastPathCount times.
+ *
+ * Candidates found by vacuum_traverse() end up on the in-memory
+ * delete_queue; a traverse error is logged but otherwise ignored,
+ * since a later run can pick up anything missed.
+ */
+static void ctdb_vacuum_traverse_db(struct ctdb_db_context *ctdb_db,
+				    struct vacuum_data *vdata)
+{
+	int ret;
+
+	ret = tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata);
+	if (ret == -1 || vdata->traverse_error) {
+		DEBUG(DEBUG_ERR, (__location__ " Traverse error in vacuuming "
+				  "'%s'\n", ctdb_db->db_name));
+		return;
+	}
+
+	if (vdata->count.db_traverse.total > 0) {
+		DEBUG(DEBUG_INFO,
+		      (__location__
+		       " full vacuuming db traverse statistics: "
+		       "db[%s] "
+		       "total[%u] "
+		       "skp[%u] "
+		       "err[%u] "
+		       "sched[%u]\n",
+		       ctdb_db->db_name,
+		       (unsigned)vdata->count.db_traverse.total,
+		       (unsigned)vdata->count.db_traverse.skipped,
+		       (unsigned)vdata->count.db_traverse.error,
+		       (unsigned)vdata->count.db_traverse.scheduled));
+	}
+
+	return;
+}
+
+/**
+ * Process the vacuum fetch lists:
+ * For records for which we are not the lmaster, tell the lmaster to
+ * fetch the record.
+ *
+ * Sends one CTDB_CONTROL_VACUUM_FETCH per remote node carrying the
+ * marshalled record keys collected for that node.  The local node and
+ * nodes with empty lists are skipped; send failures are logged only.
+ */
+static void ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db,
+					    struct vacuum_data *vdata)
+{
+	unsigned int i;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int ret, res;
+
+	for (i = 0; i < ctdb->num_nodes; i++) {
+		TDB_DATA data;
+		struct ctdb_marshall_buffer *vfl = vdata->vacuum_fetch_list[i];
+
+		if (ctdb->nodes[i]->pnn == ctdb->pnn) {
+			continue;
+		}
+
+		if (vfl->count == 0) {
+			continue;
+		}
+
+		DEBUG(DEBUG_INFO, ("Found %u records for lmaster %u in '%s'\n",
+				   vfl->count, ctdb->nodes[i]->pnn,
+				   ctdb_db->db_name));
+
+		data = ctdb_marshall_finish(vfl);
+
+		ret = ctdb_control(ctdb, ctdb->nodes[i]->pnn, 0,
+				   CTDB_CONTROL_VACUUM_FETCH, 0,
+				   data, NULL, NULL, &res, NULL, NULL);
+		if (ret != 0 || res != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to send vacuum "
+					  "fetch control to node %u\n",
+					  ctdb->nodes[i]->pnn));
+		}
+	}
+}
+
+/**
+ * Process the delete list:
+ *
+ * This is the last step of vacuuming that consistently deletes
+ * those records that have been migrated with data and can hence
+ * not be deleted when leaving a node.
+ *
+ * In this step, the lmaster does the final deletion of those empty
+ * records that it is also dmaster for. It has usually received
+ * at least some of these records previously from the former dmasters
+ * with the vacuum fetch message.
+ *
+ * 1) Send the records to all active nodes with the TRY_DELETE_RECORDS
+ *    control. The remote nodes delete their local copy.
+ * 2) The lmaster locally deletes its copies of all records that
+ *    could successfully be deleted remotely in step #1.
+ */
+static void ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
+				     struct vacuum_data *vdata)
+{
+	int ret, i;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct delete_records_list *recs;
+	TDB_DATA indata;
+	struct ctdb_node_map_old *nodemap;
+	uint32_t *active_nodes;
+	int num_active_nodes;
+	TALLOC_CTX *tmp_ctx;
+	uint32_t sum;
+
+	if (vdata->count.delete_list.total == 0) {
+		return;
+	}
+
+	tmp_ctx = talloc_new(vdata);
+	if (tmp_ctx == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return;
+	}
+
+	vdata->count.delete_list.left = vdata->count.delete_list.total;
+
+	/*
+	 * get the list of currently active nodes
+	 */
+
+	ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(),
+				   CTDB_CURRENT_NODE,
+				   tmp_ctx,
+				   &nodemap);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+		goto done;
+	}
+
+	active_nodes = list_of_active_nodes(ctdb, nodemap,
+					    nodemap, /* talloc context */
+					    false /* include self */);
+	/* yuck! ;-) -- recover the array length from the talloc size */
+	num_active_nodes = talloc_get_size(active_nodes)/sizeof(*active_nodes);
+
+	/*
+	 * Now delete the records all active nodes in a two-phase process:
+	 * 1) tell all active remote nodes to delete all their copy
+	 * 2) if all remote nodes deleted their record copy, delete it locally
+	 */
+
+	recs = talloc_zero(tmp_ctx, struct delete_records_list);
+	if (recs == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto done;
+	}
+
+	/*
+	 * Step 1:
+	 * Send all records to all active nodes for deletion.
+	 */
+
+	/*
+	 * Create a marshall blob from the remaining list of records to delete.
+	 */
+
+	recs->records = (struct ctdb_marshall_buffer *)
+		talloc_zero_size(recs,
+				 offsetof(struct ctdb_marshall_buffer, data));
+	if (recs->records == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto done;
+	}
+	recs->records->db_id = ctdb_db->db_id;
+
+	ret = trbt_traversearray32(vdata->delete_list, 1,
+				   delete_marshall_traverse, recs);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing the "
+				  "delete list for second marshalling.\n"));
+		goto done;
+	}
+
+	indata = ctdb_marshall_finish(recs->records);
+
+	for (i = 0; i < num_active_nodes; i++) {
+		struct ctdb_marshall_buffer *records;
+		struct ctdb_rec_data_old *rec;
+		int32_t res;
+		TDB_DATA outdata;
+
+		ret = ctdb_control(ctdb, active_nodes[i], 0,
+				CTDB_CONTROL_TRY_DELETE_RECORDS, 0,
+				indata, recs, &outdata, &res,
+				NULL, NULL);
+		if (ret != 0 || res != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to delete records on "
+					  "node %u: ret[%d] res[%d]\n",
+					  active_nodes[i], ret, res));
+			goto done;
+		}
+
+		/*
+		 * outdata contains the list of records coming back
+		 * from the node: These are the records that the
+		 * remote node could not delete. We remove these from
+		 * the list to delete locally.
+		 */
+		records = (struct ctdb_marshall_buffer *)outdata.dptr;
+		rec = (struct ctdb_rec_data_old *)&records->data[0];
+		while (records->count-- > 0) {
+			TDB_DATA reckey, recdata;
+			struct ctdb_ltdb_header *rechdr;
+			struct delete_record_data *dd;
+
+			/* unpack one marshalled record: key bytes followed
+			 * by the ltdb header and (ignored) payload */
+			reckey.dptr = &rec->data[0];
+			reckey.dsize = rec->keylen;
+			recdata.dptr = &rec->data[reckey.dsize];
+			recdata.dsize = rec->datalen;
+
+			if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+				DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+				goto done;
+			}
+			rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+			recdata.dptr += sizeof(*rechdr);
+			recdata.dsize -= sizeof(*rechdr);
+
+			dd = (struct delete_record_data *)trbt_lookup32(
+					vdata->delete_list,
+					ctdb_hash(&reckey));
+			if (dd != NULL) {
+				/*
+				 * The remote node could not delete the
+				 * record. Since other remote nodes can
+				 * also fail, we just mark the record.
+				 */
+				dd->remote_fail_count++;
+			} else {
+				DEBUG(DEBUG_ERR, (__location__ " Failed to "
+				      "find record with hash 0x%08x coming "
+				      "back from TRY_DELETE_RECORDS "
+				      "control in delete list.\n",
+				      ctdb_hash(&reckey)));
+			}
+
+			/* advance to the next marshalled record */
+			rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
+		}
+	}
+
+	/*
+	 * Step 2:
+	 * Delete the remaining records locally.
+	 *
+	 * These records have successfully been deleted on all
+	 * active remote nodes.
+	 */
+
+	ret = trbt_traversearray32(vdata->delete_list, 1,
+				   delete_record_traverse, vdata);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing the "
+				  "delete list for deletion.\n"));
+	}
+
+	if (vdata->count.delete_list.left != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Vacuum db[%s] error: "
+				  "there are %u records left for deletion after "
+				  "processing delete list\n",
+				  ctdb_db->db_name,
+				  (unsigned)vdata->count.delete_list.left));
+	}
+
+	/* accounting cross-check: every record must be in one bucket */
+	sum = vdata->count.delete_list.deleted
+	    + vdata->count.delete_list.skipped
+	    + vdata->count.delete_list.remote_error
+	    + vdata->count.delete_list.local_error
+	    + vdata->count.delete_list.left;
+
+	if (vdata->count.delete_list.total != sum) {
+		DEBUG(DEBUG_ERR, (__location__ " Inconsistency in vacuum "
+		      "delete list counts for db[%s]: total[%u] != sum[%u]\n",
+		      ctdb_db->db_name,
+		      (unsigned)vdata->count.delete_list.total,
+		      (unsigned)sum));
+	}
+
+	if (vdata->count.delete_list.total > 0) {
+		DEBUG(DEBUG_INFO,
+		      (__location__
+		       " vacuum delete list statistics: "
+		       "db[%s] "
+		       "total[%u] "
+		       "del[%u] "
+		       "skip[%u] "
+		       "rem.err[%u] "
+		       "loc.err[%u] "
+		       "left[%u]\n",
+		       ctdb_db->db_name,
+		       (unsigned)vdata->count.delete_list.total,
+		       (unsigned)vdata->count.delete_list.deleted,
+		       (unsigned)vdata->count.delete_list.skipped,
+		       (unsigned)vdata->count.delete_list.remote_error,
+		       (unsigned)vdata->count.delete_list.local_error,
+		       (unsigned)vdata->count.delete_list.left));
+	}
+
+done:
+	talloc_free(tmp_ctx);
+
+	return;
+}
+
+/**
+ * initialize the vacuum_data
+ *
+ * Allocates the per-run vacuum_data on mem_ctx: the delete_list tree
+ * and one (initially empty) marshall buffer per node for the vacuum
+ * fetch lists.  Returns NULL on allocation failure; on success the
+ * caller owns the returned structure via mem_ctx.
+ *
+ * Fixes over the previous version:
+ * - all failure paths now funnel through the single "fail" label
+ *   (the per-node loop used to free and return directly, duplicating
+ *   the cleanup);
+ * - the field-by-field zeroing of vdata->count was dropped as
+ *   redundant: talloc_zero() already zero-fills the whole structure.
+ */
+static struct vacuum_data *ctdb_vacuum_init_vacuum_data(
+			struct ctdb_db_context *ctdb_db,
+			TALLOC_CTX *mem_ctx)
+{
+	unsigned int i;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct vacuum_data *vdata;
+
+	/* talloc_zero() leaves every counter in vdata->count at zero */
+	vdata = talloc_zero(mem_ctx, struct vacuum_data);
+	if (vdata == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return NULL;
+	}
+
+	vdata->ctdb = ctdb_db->ctdb;
+	vdata->ctdb_db = ctdb_db;
+	vdata->delete_list = trbt_create(vdata, 0);
+	if (vdata->delete_list == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto fail;
+	}
+
+	vdata->start = timeval_current();
+
+	/* the list needs to be of length num_nodes */
+	vdata->vacuum_fetch_list = talloc_zero_array(vdata,
+						struct ctdb_marshall_buffer *,
+						ctdb->num_nodes);
+	if (vdata->vacuum_fetch_list == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto fail;
+	}
+	for (i = 0; i < ctdb->num_nodes; i++) {
+		vdata->vacuum_fetch_list[i] = (struct ctdb_marshall_buffer *)
+			talloc_zero_size(vdata->vacuum_fetch_list,
+				 offsetof(struct ctdb_marshall_buffer, data));
+		if (vdata->vacuum_fetch_list[i] == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+			goto fail;
+		}
+		vdata->vacuum_fetch_list[i]->db_id = ctdb_db->db_id;
+	}
+
+	return vdata;
+
+fail:
+	/* frees the delete_list and fetch lists along with vdata */
+	talloc_free(vdata);
+	return NULL;
+}
+
+/**
+ * Vacuum a DB:
+ *  - Always do the fast vacuuming run, which traverses
+ *    - the in-memory fetch queue: these records have been
+ *      scheduled for migration
+ *    - the in-memory delete queue: these records have been
+ *      scheduled for deletion.
+ *  - Only if explicitly requested, the database is traversed
+ *    in order to use the traditional heuristics on empty records
+ *    to trigger deletion.
+ *    This is done only every VacuumFastPathCount'th vacuuming run.
+ *
+ * The traverse runs fill two lists:
+ *
+ * - The delete_list:
+ *   This is the list of empty records the current
+ *   node is lmaster and dmaster for. These records are later
+ *   deleted first on other nodes and then locally.
+ *
+ *   The fast vacuuming run has a short cut for those records
+ *   that have never been migrated with data: these records
+ *   are immediately deleted locally, since they have left
+ *   no trace on other nodes.
+ *
+ * - The vacuum_fetch lists
+ *   (one for each other lmaster node):
+ *   The records in this list are sent for deletion to
+ *   their lmaster in a bulk VACUUM_FETCH control.
+ *
+ *   The lmaster then migrates all these records to itself
+ *   so that they can be vacuumed there.
+ *
+ * This executes in the child context.
+ */
+static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
+			  bool full_vacuum_run)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int ret, pnn;
+	struct vacuum_data *vdata;
+	TALLOC_CTX *tmp_ctx;
+
+	DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db "
+			   "%s db_id[0x%08x]\n",
+			   full_vacuum_run ? "full" : "fast",
+			   ctdb_db->db_name, ctdb_db->db_id));
+
+	/* refresh vnnmap and pnn via controls: this runs in a forked
+	 * child, whose inherited copies may be stale */
+	ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
+		return ret;
+	}
+
+	pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+	if (pnn == -1) {
+		DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
+		return -1;
+	}
+
+	ctdb->pnn = pnn;
+
+	tmp_ctx = talloc_new(ctdb_db);
+	if (tmp_ctx == NULL) {
+		DEBUG(DEBUG_ERR, ("Out of memory!\n"));
+		return -1;
+	}
+
+	vdata = ctdb_vacuum_init_vacuum_data(ctdb_db, tmp_ctx);
+	if (vdata == NULL) {
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	if (full_vacuum_run) {
+		ctdb_vacuum_traverse_db(ctdb_db, vdata);
+	}
+
+	ctdb_process_fetch_queue(ctdb_db);
+
+	ctdb_process_delete_queue(ctdb_db, vdata);
+
+	ctdb_process_vacuum_fetch_lists(ctdb_db, vdata);
+
+	ctdb_process_delete_list(ctdb_db, vdata);
+
+	talloc_free(tmp_ctx);
+
+	return 0;
+}
+
+/*
+ * repack and vacuum a db
+ * called from the child context
+ */
+static int ctdb_vacuum_and_repack_db(struct ctdb_db_context *ctdb_db,
+ bool full_vacuum_run)
+{
+ uint32_t repack_limit = ctdb_db->ctdb->tunable.repack_limit;
+ const char *name = ctdb_db->db_name;
+ int freelist_size = 0;
+ int ret;
+
+ /* A failed vacuum is logged but does not abort - the repack
+ * check below is still worth doing. */
+ if (ctdb_vacuum_db(ctdb_db, full_vacuum_run) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to vacuum '%s'\n", name));
+ }
+
+ freelist_size = tdb_freelist_size(ctdb_db->ltdb->tdb);
+ if (freelist_size == -1) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to get freelist size for '%s'\n", name));
+ return -1;
+ }
+
+ /*
+ * decide if a repack is necessary
+ * (a RepackLimit of 0 disables repacking entirely)
+ */
+ if ((repack_limit == 0 || (uint32_t)freelist_size < repack_limit))
+ {
+ return 0;
+ }
+
+ D_NOTICE("Repacking %s with %u freelist entries\n",
+ name,
+ freelist_size);
+
+ ret = tdb_repack(ctdb_db->ltdb->tdb);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to repack '%s'\n", name));
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Current value of the VacuumInterval tunable for this database. */
+static uint32_t get_vacuum_interval(struct ctdb_db_context *ctdb_db)
+{
+ return ctdb_db->ctdb->tunable.vacuum_interval;
+}
+
+/*
+ * Destructor for a vacuum child context: records the vacuum latency,
+ * kills the child if it is still running, clears ctdb->vacuumer and,
+ * for scheduled runs, arms the timer for the next vacuuming event.
+ */
+static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx)
+{
+ double l = timeval_elapsed(&child_ctx->start_time);
+ struct ctdb_vacuum_handle *vacuum_handle = child_ctx->vacuum_handle;
+ struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+ CTDB_UPDATE_DB_LATENCY(ctdb_db, "vacuum", vacuum.latency, l);
+ DEBUG(DEBUG_INFO,("Vacuuming took %.3f seconds for database %s\n", l, ctdb_db->db_name));
+
+ /* vacuum_child_handler() sets child_pid to -1 once the child has
+ * reported; a remaining pid means timeout/abort, so kill it. */
+ if (child_ctx->child_pid != -1) {
+ ctdb_kill(ctdb, child_ctx->child_pid, SIGKILL);
+ } else {
+ /* Bump the number of successful fast-path runs. */
+ vacuum_handle->fast_path_count++;
+ }
+
+ ctdb->vacuumer = NULL;
+
+ /* Only periodic (scheduled) runs reschedule themselves;
+ * control-triggered runs are one-shot. */
+ if (child_ctx->scheduled) {
+ vacuum_handle->vacuum_interval = get_vacuum_interval(ctdb_db);
+
+ tevent_add_timer(
+ ctdb->ev,
+ vacuum_handle,
+ timeval_current_ofs(vacuum_handle->vacuum_interval, 0),
+ ctdb_vacuum_event,
+ vacuum_handle);
+ }
+
+ return 0;
+}
+
+/*
+ * this event is generated when a vacuum child process times out
+ */
+/*
+ * Fired when a vacuum child exceeds VacuumMaxRunTime: mark the run as
+ * timed out and free the context, whose destructor kills the child.
+ */
+static void vacuum_child_timeout(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_vacuum_child_context *ctx = talloc_get_type(
+ private_data, struct ctdb_vacuum_child_context);
+
+ DEBUG(DEBUG_ERR,("Vacuuming child process timed out for db %s\n", ctx->vacuum_handle->ctdb_db->db_name));
+
+ ctx->status = VACUUM_TIMEOUT;
+
+ talloc_free(ctx);
+}
+
+
+/*
+ * this event is generated when a vacuum child process has completed
+ */
+static void vacuum_child_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags, void *private_data)
+{
+ struct ctdb_vacuum_child_context *child_ctx = talloc_get_type(private_data, struct ctdb_vacuum_child_context);
+ char c = 0;
+ int ret;
+
+ DEBUG(DEBUG_INFO,("Vacuuming child process %d finished for db %s\n", child_ctx->child_pid, child_ctx->vacuum_handle->ctdb_db->db_name));
+ /* Mark the child as finished so the destructor treats the run as
+ * complete instead of killing the process. */
+ child_ctx->child_pid = -1;
+
+ /* The child reports a single status byte; 0 means success.
+ * A short read (EOF) means the child died without reporting. */
+ ret = sys_read(child_ctx->fd[0], &c, 1);
+ if (ret != 1 || c != 0) {
+ child_ctx->status = VACUUM_ERROR;
+ DEBUG(DEBUG_ERR, ("A vacuum child process failed with an error for database %s. ret=%d c=%d\n", child_ctx->vacuum_handle->ctdb_db->db_name, ret, c));
+ } else {
+ child_ctx->status = VACUUM_OK;
+ }
+
+ /* Runs vacuum_child_destructor and auto-closes the pipe fd. */
+ talloc_free(child_ctx);
+}
+
+/*
+ * this event is called every time we need to start a new vacuum process
+ */
+/*
+ * Start a vacuuming run for a database in a forked child process.
+ *
+ * On success, returns 0 and sets *out to the new child context.
+ * Returns EBUSY when another vacuuming child is already active,
+ * EAGAIN when vacuuming is currently not possible (recovery active,
+ * db frozen, pipe/fork failure) and ENOMEM on allocation failure.
+ * The child performs the actual vacuum/repack and reports its result
+ * as a single status byte over a pipe (read by vacuum_child_handler).
+ */
+static int vacuum_db_child(TALLOC_CTX *mem_ctx,
+ struct ctdb_db_context *ctdb_db,
+ bool scheduled,
+ bool full_vacuum_run,
+ struct ctdb_vacuum_child_context **out)
+{
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ struct ctdb_vacuum_child_context *child_ctx;
+ struct tevent_fd *fde;
+ int ret;
+
+ /* we don't vacuum if we are in recovery mode, or db frozen */
+ if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ||
+ ctdb_db_frozen(ctdb_db)) {
+ D_INFO("Not vacuuming %s (%s)\n", ctdb_db->db_name,
+ ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ?
+ "in recovery" : "frozen");
+ return EAGAIN;
+ }
+
+ /* Do not allow multiple vacuuming child processes to be active at the
+ * same time. If there is vacuuming child process active, delay
+ * new vacuuming event to stagger vacuuming events.
+ */
+ if (ctdb->vacuumer != NULL) {
+ return EBUSY;
+ }
+
+ child_ctx = talloc_zero(mem_ctx, struct ctdb_vacuum_child_context);
+ if (child_ctx == NULL) {
+ DBG_ERR("Failed to allocate child context for vacuuming of %s\n",
+ ctdb_db->db_name);
+ return ENOMEM;
+ }
+
+ ret = pipe(child_ctx->fd);
+ if (ret != 0) {
+ talloc_free(child_ctx);
+ D_ERR("Failed to create pipe for vacuum child process.\n");
+ return EAGAIN;
+ }
+
+ child_ctx->child_pid = ctdb_fork(ctdb);
+ if (child_ctx->child_pid == (pid_t)-1) {
+ close(child_ctx->fd[0]);
+ close(child_ctx->fd[1]);
+ talloc_free(child_ctx);
+ D_ERR("Failed to fork vacuum child process.\n");
+ return EAGAIN;
+ }
+
+ if (child_ctx->child_pid == 0) {
+ char cc = 0;
+ close(child_ctx->fd[0]);
+
+ D_INFO("Vacuuming child process %d for db %s started\n",
+ getpid(),
+ ctdb_db->db_name);
+ prctl_set_comment("ctdb_vacuum");
+ ret = switch_from_server_to_client(ctdb);
+ if (ret != 0) {
+ DBG_ERR("ERROR: failed to switch vacuum daemon "
+ "into client mode.\n");
+ /* BUGFIX: the forked child must never return into
+ * the parent's code path - that would leave two
+ * processes running the daemon's event loop.
+ * Exit instead; the parent sees EOF on the pipe
+ * and records VACUUM_ERROR. */
+ _exit(1);
+ }
+
+ cc = ctdb_vacuum_and_repack_db(ctdb_db, full_vacuum_run);
+
+ sys_write(child_ctx->fd[1], &cc, 1);
+ _exit(0);
+ }
+
+ /* Parent: keep only the read end of the pipe. */
+ set_close_on_exec(child_ctx->fd[0]);
+ close(child_ctx->fd[1]);
+
+ child_ctx->status = VACUUM_RUNNING;
+ child_ctx->scheduled = scheduled;
+ child_ctx->start_time = timeval_current();
+
+ ctdb->vacuumer = child_ctx;
+ talloc_set_destructor(child_ctx, vacuum_child_destructor);
+
+ /*
+ * Clear the fastpath vacuuming list in the parent.
+ */
+ talloc_free(ctdb_db->delete_queue);
+ ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+ if (ctdb_db->delete_queue == NULL) {
+ DBG_ERR("Out of memory when re-creating vacuum tree\n");
+ return ENOMEM;
+ }
+
+ talloc_free(ctdb_db->fetch_queue);
+ ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+ if (ctdb_db->fetch_queue == NULL) {
+ ctdb_fatal(ctdb, "Out of memory when re-create fetch queue "
+ " in parent context. Shutting down\n");
+ }
+
+ /* Kill the child if it runs past VacuumMaxRunTime. */
+ tevent_add_timer(ctdb->ev, child_ctx,
+ timeval_current_ofs(ctdb->tunable.vacuum_max_run_time,
+ 0),
+ vacuum_child_timeout, child_ctx);
+
+ DBG_DEBUG(" Created PIPE FD:%d to child vacuum process\n",
+ child_ctx->fd[0]);
+
+ fde = tevent_add_fd(ctdb->ev, child_ctx, child_ctx->fd[0],
+ TEVENT_FD_READ, vacuum_child_handler, child_ctx);
+ tevent_fd_set_auto_close(fde);
+
+ child_ctx->vacuum_handle = ctdb_db->vacuum_handle;
+
+ *out = child_ctx;
+ return 0;
+}
+
+/*
+ * Timer callback driving periodic (scheduled) vacuuming of one
+ * database.  On success the next run is armed by
+ * vacuum_child_destructor when the child finishes; on failure this
+ * function reschedules itself.
+ */
+static void ctdb_vacuum_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_vacuum_handle *vacuum_handle = talloc_get_type(
+ private_data, struct ctdb_vacuum_handle);
+ struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ struct ctdb_vacuum_child_context *child_ctx = NULL;
+ uint32_t fast_path_max = ctdb->tunable.vacuum_fast_path_count;
+ uint32_t vacuum_interval = get_vacuum_interval(ctdb_db);
+ bool full_vacuum_run = false;
+ int ret;
+
+ /* If the VacuumInterval tunable was increased after this event was
+ * scheduled, wait out the difference instead of vacuuming now. */
+ if (vacuum_interval > vacuum_handle->vacuum_interval) {
+ uint32_t d = vacuum_interval - vacuum_handle->vacuum_interval;
+
+ DBG_INFO("Vacuum interval increased from "
+ "%"PRIu32" to %"PRIu32", rescheduling\n",
+ vacuum_handle->vacuum_interval,
+ vacuum_interval);
+ vacuum_handle->vacuum_interval = vacuum_interval;
+ tevent_add_timer(ctdb->ev,
+ vacuum_handle,
+ timeval_current_ofs(d, 0),
+ ctdb_vacuum_event,
+ vacuum_handle);
+ return;
+ }
+
+ vacuum_handle->vacuum_interval = vacuum_interval;
+
+ /* Every VacuumFastPathCount'th run is a full run; a value of 0
+ * disables full runs entirely. */
+ if (vacuum_handle->fast_path_count >= fast_path_max) {
+ if (fast_path_max > 0) {
+ full_vacuum_run = true;
+ }
+ vacuum_handle->fast_path_count = 0;
+ }
+
+ ret = vacuum_db_child(vacuum_handle,
+ ctdb_db,
+ true,
+ full_vacuum_run,
+ &child_ctx);
+
+ if (ret == 0) {
+ /* Child started; rescheduling happens in the destructor. */
+ return;
+ }
+
+ switch (ret) {
+ case EBUSY:
+ /* Stagger */
+ tevent_add_timer(ctdb->ev,
+ vacuum_handle,
+ timeval_current_ofs(0, 500*1000),
+ ctdb_vacuum_event,
+ vacuum_handle);
+ break;
+
+ default:
+ /* Temporary failure, schedule next attempt */
+ tevent_add_timer(ctdb->ev,
+ vacuum_handle,
+ timeval_current_ofs(
+ vacuum_handle->vacuum_interval, 0),
+ ctdb_vacuum_event,
+ vacuum_handle);
+ }
+
+}
+
+/* Tracks an asynchronous DB_VACUUM control until the vacuum child
+ * finishes; the reply is sent from its destructor. */
+struct vacuum_control_state {
+ struct ctdb_vacuum_child_context *child_ctx;
+ struct ctdb_req_control_old *c;
+ struct ctdb_context *ctdb;
+};
+
+/*
+ * Destructor: sends the deferred reply for a DB_VACUUM control.
+ * state is a talloc child of child_ctx, so this fires when the
+ * vacuum child context is freed and its status is final.
+ */
+static int vacuum_control_state_destructor(struct vacuum_control_state *state)
+{
+ struct ctdb_vacuum_child_context *child_ctx = state->child_ctx;
+ int32_t status;
+
+ status = (child_ctx->status == VACUUM_OK ? 0 : -1);
+ ctdb_request_control_reply(state->ctdb, state->c, NULL, status, NULL);
+
+ return 0;
+}
+
+/*
+ * CTDB_CONTROL_DB_VACUUM handler: start an on-demand (unscheduled)
+ * vacuuming run for one database.  On success the reply is deferred
+ * (*async_reply = true) and sent by vacuum_control_state_destructor
+ * when the child completes.  Returns -1 on bad input, unknown db,
+ * allocation failure or when the vacuum child could not be started.
+ */
+int32_t ctdb_control_db_vacuum(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata,
+ bool *async_reply)
+{
+ struct ctdb_db_context *ctdb_db;
+ struct ctdb_vacuum_child_context *child_ctx = NULL;
+ struct ctdb_db_vacuum *db_vacuum;
+ struct vacuum_control_state *state;
+ size_t np;
+ int ret;
+
+ ret = ctdb_db_vacuum_pull(indata.dptr,
+ indata.dsize,
+ ctdb,
+ &db_vacuum,
+ &np);
+ if (ret != 0) {
+ DBG_ERR("Invalid data\n");
+ return -1;
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, db_vacuum->db_id);
+ if (ctdb_db == NULL) {
+ DBG_ERR("Unknown db id 0x%08x\n", db_vacuum->db_id);
+ talloc_free(db_vacuum);
+ return -1;
+ }
+
+ state = talloc(ctdb, struct vacuum_control_state);
+ if (state == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ /* BUGFIX: db_vacuum is allocated on the long-lived ctdb
+ * context and was leaked on this error path. */
+ talloc_free(db_vacuum);
+ return -1;
+ }
+
+ ret = vacuum_db_child(ctdb_db,
+ ctdb_db,
+ false,
+ db_vacuum->full_vacuum_run,
+ &child_ctx);
+
+ talloc_free(db_vacuum);
+
+ if (ret == 0) {
+ /* Tie state to the child so the destructor replies when
+ * the child context is freed. */
+ (void) talloc_steal(child_ctx, state);
+
+ state->child_ctx = child_ctx;
+ state->c = talloc_steal(state, c);
+ state->ctdb = ctdb;
+
+ talloc_set_destructor(state, vacuum_control_state_destructor);
+
+ *async_reply = true;
+ return 0;
+ }
+
+ talloc_free(state);
+
+ switch (ret) {
+ case EBUSY:
+ DBG_WARNING("Vacuuming collision\n");
+ break;
+
+ default:
+ DBG_ERR("Temporary vacuuming failure, ret=%d\n", ret);
+ }
+
+ return -1;
+}
+
+/* Abort any in-flight vacuuming run (e.g. when recovery starts). */
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb)
+{
+ if (ctdb->vacuumer == NULL) {
+ return;
+ }
+
+ D_INFO("Aborting vacuuming for %s (%i)\n",
+ ctdb->vacuumer->vacuum_handle->ctdb_db->db_name,
+ (int)ctdb->vacuumer->child_pid);
+ /* vacuum_child_destructor kills it, removes from list */
+ talloc_free(ctdb->vacuumer);
+}
+
+/* this function initializes the vacuuming context for a database
+ * starts the vacuuming events
+ */
+int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db)
+{
+ struct ctdb_vacuum_handle *vacuum_handle;
+
+ /* Only volatile databases are vacuumed; persistent/replicated
+ * databases keep all records. */
+ if (! ctdb_db_volatile(ctdb_db)) {
+ DEBUG(DEBUG_ERR,
+ ("Vacuuming is disabled for non-volatile database %s\n",
+ ctdb_db->db_name));
+ return 0;
+ }
+
+ vacuum_handle = talloc(ctdb_db, struct ctdb_vacuum_handle);
+ if (vacuum_handle == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ return -1;
+ }
+
+ vacuum_handle->ctdb_db = ctdb_db;
+ vacuum_handle->fast_path_count = 0;
+ vacuum_handle->vacuum_interval = get_vacuum_interval(ctdb_db);
+
+ ctdb_db->vacuum_handle = vacuum_handle;
+
+ /* Arm the first scheduled vacuuming event; subsequent events are
+ * chained by ctdb_vacuum_event/vacuum_child_destructor. */
+ tevent_add_timer(ctdb_db->ctdb->ev,
+ vacuum_handle,
+ timeval_current_ofs(vacuum_handle->vacuum_interval, 0),
+ ctdb_vacuum_event,
+ vacuum_handle);
+
+ return 0;
+}
+
+/*
+ * Remove a record from the database's delete queue, if present.
+ * The queue is keyed by key hash; an entry with the same hash but a
+ * different key (hash collision) is deliberately left untouched.
+ */
+static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ const TDB_DATA key)
+{
+ struct delete_record_data *kd;
+ uint32_t hash;
+
+ hash = (uint32_t)ctdb_hash(&key);
+
+ DEBUG(DEBUG_DEBUG, (__location__
+ " remove_record_from_delete_queue: "
+ "db[%s] "
+ "db_id[0x%08x] "
+ "key_hash[0x%08x] "
+ "lmaster[%u] "
+ "migrated_with_data[%s]\n",
+ ctdb_db->db_name, ctdb_db->db_id,
+ hash,
+ ctdb_lmaster(ctdb_db->ctdb, &key),
+ hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+ kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
+ if (kd == NULL) {
+ DEBUG(DEBUG_DEBUG, (__location__
+ " remove_record_from_delete_queue: "
+ "record not in queue (hash[0x%08x])\n.",
+ hash));
+ return;
+ }
+
+ /* Verify the queued entry really is this key and not a
+ * hash-colliding one. */
+ if ((kd->key.dsize != key.dsize) ||
+ (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+ {
+ DEBUG(DEBUG_DEBUG, (__location__
+ " remove_record_from_delete_queue: "
+ "hash collision for key with hash[0x%08x] "
+ "in db[%s] - skipping\n",
+ hash, ctdb_db->db_name));
+ return;
+ }
+
+ DEBUG(DEBUG_DEBUG, (__location__
+ " remove_record_from_delete_queue: "
+ "removing key with hash[0x%08x]\n",
+ hash));
+
+ talloc_free(kd);
+
+ return;
+}
+
+/**
+ * Insert a record into the ctdb_db context's delete queue,
+ * handling hash collisions.
+ */
+static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ TDB_DATA key)
+{
+ struct delete_record_data *kd;
+ uint32_t hash;
+ int ret;
+
+ hash = (uint32_t)ctdb_hash(&key);
+
+ DEBUG(DEBUG_DEBUG, (__location__ " schedule for deletion: db[%s] "
+ "db_id[0x%08x] "
+ "key_hash[0x%08x] "
+ "lmaster[%u] "
+ "migrated_with_data[%s]\n",
+ ctdb_db->db_name, ctdb_db->db_id,
+ hash,
+ ctdb_lmaster(ctdb_db->ctdb, &key),
+ hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+ kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
+ if (kd != NULL) {
+ /* A different key with the same hash is already queued:
+ * skip rather than evict it.  The same key is simply
+ * re-inserted (updated) below. */
+ if ((kd->key.dsize != key.dsize) ||
+ (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+ {
+ DEBUG(DEBUG_INFO,
+ (__location__ " schedule for deletion: "
+ "hash collision for key hash [0x%08x]. "
+ "Skipping the record.\n", hash));
+ return 0;
+ } else {
+ DEBUG(DEBUG_DEBUG,
+ (__location__ " schedule for deletion: "
+ "updating entry for key with hash [0x%08x].\n",
+ hash));
+ }
+ }
+
+ ret = insert_delete_record_data_into_tree(ctdb_db->ctdb, ctdb_db,
+ ctdb_db->delete_queue,
+ hdr, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_INFO,
+ (__location__ " schedule for deletion: error "
+ "inserting key with hash [0x%08x] into delete queue\n",
+ hash));
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * Schedule a record for deletion.
+ * Called from the parent context.
+ */
+int32_t ctdb_control_schedule_for_deletion(struct ctdb_context *ctdb,
+ TDB_DATA indata)
+{
+ struct ctdb_control_schedule_for_deletion *dd;
+ struct ctdb_db_context *ctdb_db;
+ int ret;
+ TDB_DATA key;
+
+ /* indata is a ctdb_control_schedule_for_deletion blob:
+ * db_id, record header and the inline key bytes. */
+ dd = (struct ctdb_control_schedule_for_deletion *)indata.dptr;
+
+ ctdb_db = find_ctdb_db(ctdb, dd->db_id);
+ if (ctdb_db == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Unknown db id 0x%08x\n",
+ dd->db_id));
+ return -1;
+ }
+
+ /* The key points into indata; it is copied by the insert helper. */
+ key.dsize = dd->keylen;
+ key.dptr = dd->key;
+
+ ret = insert_record_into_delete_queue(ctdb_db, &dd->hdr, key);
+
+ return ret;
+}
+
+/*
+ * Schedule a record for deletion in the next fast vacuuming run.
+ * In the main daemon the record is queued directly; in a child
+ * process a SCHEDULE_FOR_DELETION control (no-reply) is sent to the
+ * daemon instead.  Returns 0 on success, -1 on error.
+ */
+int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ TDB_DATA key)
+{
+ int ret;
+ struct ctdb_control_schedule_for_deletion *dd;
+ TDB_DATA indata;
+ int32_t status;
+
+ if (ctdb_db->ctdb->ctdbd_pid == getpid()) {
+ /* main daemon - directly queue */
+ ret = insert_record_into_delete_queue(ctdb_db, hdr, key);
+
+ return ret;
+ }
+
+ /* if we don't have a connection to the daemon we can not send
+ a control. For example sometimes from update_record control child
+ process.
+ */
+ if (!ctdb_db->ctdb->can_send_controls) {
+ return -1;
+ }
+
+
+ /* child process: send the main daemon a control */
+ indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize;
+ indata.dptr = talloc_zero_array(ctdb_db, uint8_t, indata.dsize);
+ if (indata.dptr == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return -1;
+ }
+ dd = (struct ctdb_control_schedule_for_deletion *)(void *)indata.dptr;
+ dd->db_id = ctdb_db->db_id;
+ dd->hdr = *hdr;
+ dd->keylen = key.dsize;
+ memcpy(dd->key, key.dptr, key.dsize);
+
+ ret = ctdb_control(ctdb_db->ctdb,
+ CTDB_CURRENT_NODE,
+ ctdb_db->db_id,
+ CTDB_CONTROL_SCHEDULE_FOR_DELETION,
+ CTDB_CTRL_FLAG_NOREPLY, /* flags */
+ indata,
+ NULL, /* mem_ctx */
+ NULL, /* outdata */
+ &status,
+ NULL, /* timeout : NULL == wait forever */
+ NULL); /* error message */
+
+ talloc_free(indata.dptr);
+
+ if (ret != 0 || status != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Error sending "
+ "SCHEDULE_FOR_DELETION "
+ "control.\n"));
+ if (status != 0) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Drop a record from the delete queue after it has been touched
+ * again.  The delete queue only exists in the main daemon, so this
+ * is a no-op in child processes.
+ */
+void ctdb_local_remove_from_delete_queue(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ const TDB_DATA key)
+{
+ if (ctdb_db->ctdb->ctdbd_pid == getpid()) {
+ remove_record_from_delete_queue(ctdb_db, hdr, key);
+ }
+}
+
+/*
+ * ctdb_rec_buffer_traverse() callback for VACUUM_FETCH: copy each
+ * record's key into the database's fetch_queue tree, keyed by the
+ * key hash.  header and data are not needed here - only the key is
+ * stored.  Returns 0 on success, -1 on allocation failure.
+ */
+static int vacuum_fetch_parser(uint32_t reqid,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA key, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_db_context *ctdb_db = talloc_get_type_abort(
+ private_data, struct ctdb_db_context);
+ struct fetch_record_data *rd;
+ size_t len;
+ uint32_t hash;
+
+ /* Single allocation: struct plus inline copy of the key bytes. */
+ len = offsetof(struct fetch_record_data, keydata) + key.dsize;
+
+ rd = (struct fetch_record_data *)talloc_size(ctdb_db->fetch_queue,
+ len);
+ if (rd == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Memory error\n"));
+ return -1;
+ }
+ talloc_set_name_const(rd, "struct fetch_record_data");
+
+ rd->key.dsize = key.dsize;
+ rd->key.dptr = rd->keydata;
+ memcpy(rd->keydata, key.dptr, key.dsize);
+
+ hash = ctdb_hash(&key);
+
+ trbt_insert32(ctdb_db->fetch_queue, hash, rd);
+
+ return 0;
+}
+
+/*
+ * CTDB_CONTROL_VACUUM_FETCH handler: unmarshall the record buffer
+ * sent by a remote node and queue all contained keys on the local
+ * database's fetch_queue for the next vacuuming run.
+ */
+int32_t ctdb_control_vacuum_fetch(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_rec_buffer *recbuf;
+ struct ctdb_db_context *ctdb_db;
+ size_t npull;
+ int ret;
+
+ ret = ctdb_rec_buffer_pull(indata.dptr, indata.dsize, ctdb, &recbuf,
+ &npull);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Invalid data in vacuum_fetch\n"));
+ return -1;
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, recbuf->db_id);
+ if (ctdb_db == NULL) {
+ talloc_free(recbuf);
+ DEBUG(DEBUG_ERR, (__location__ " Unknown db 0x%08x\n",
+ recbuf->db_id));
+ return -1;
+ }
+
+ ret = ctdb_rec_buffer_traverse(recbuf, vacuum_fetch_parser, ctdb_db);
+ talloc_free(recbuf);
+ return ret;
+}
diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c
new file mode 100644
index 0000000..a388bff
--- /dev/null
+++ b/ctdb/server/ctdbd.c
@@ -0,0 +1,407 @@
+/*
+ standalone ctdb daemon
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/wait.h"
+#include "system/network.h"
+#include "system/syslog.h"
+
+#include <popt.h>
+#include <talloc.h>
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/path.h"
+#include "common/logging.h"
+#include "common/logging_conf.h"
+
+#include "ctdb_config.h"
+
+int script_log_level;
+bool fast_start;
+
+/*
+ called by the transport layer when a packet comes in
+*/
+static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length)
+{
+ struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+
+ CTDB_INCREMENT_STAT(ctdb, node_packets_recv);
+
+ /* up the counter for this source node, so we know its alive */
+ /* NOTE(review): rx_cnt presumably feeds node liveness/keepalive
+ * accounting - confirm against ctdb_keepalive.c */
+ if (ctdb_validate_pnn(ctdb, hdr->srcnode)) {
+ /* as a special case, redirected calls don't increment the rx_cnt */
+ if (hdr->operation != CTDB_REQ_CALL ||
+ ((struct ctdb_req_call_old *)hdr)->hopcount == 0) {
+ ctdb->nodes[hdr->srcnode]->rx_cnt++;
+ }
+ }
+
+ ctdb_input_pkt(ctdb, hdr);
+}
+
+/* Callbacks handed to the transport layer. */
+static const struct ctdb_upcalls ctdb_upcalls = {
+ .recv_pkt = ctdb_recv_pkt,
+ .node_dead = ctdb_node_dead,
+ .node_connected = ctdb_node_connected
+};
+
+/*
+ * Allocate and minimally initialise the global ctdb context:
+ * reqid/srvid registries, socket and pidfile paths, start timestamps
+ * and default capabilities.  Returns NULL on allocation failure.
+ */
+static struct ctdb_context *ctdb_init(struct tevent_context *ev)
+{
+ int ret;
+ struct ctdb_context *ctdb;
+
+ ctdb = talloc_zero(ev, struct ctdb_context);
+ if (ctdb == NULL) {
+ DBG_ERR("Memory error\n");
+ return NULL;
+ }
+ ctdb->ev = ev;
+
+ /* Wrap early to exercise code. */
+ /* Starting reqids at INT_MAX-200 makes the id wrap-around path
+ * run shortly after startup instead of only after ~2^31 calls. */
+ ret = reqid_init(ctdb, INT_MAX-200, &ctdb->idr);
+ if (ret != 0) {
+ D_ERR("reqid_init failed (%s)\n", strerror(ret));
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ ret = srvid_init(ctdb, &ctdb->srv);
+ if (ret != 0) {
+ D_ERR("srvid_init failed (%s)\n", strerror(ret));
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ ctdb->daemon.name = path_socket(ctdb, "ctdbd");
+ if (ctdb->daemon.name == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ ctdbd_pidfile = path_pidfile(ctdb, "ctdbd");
+ if (ctdbd_pidfile == NULL) {
+ DBG_ERR("Memory allocation error\n");
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ gettimeofday(&ctdb->ctdbd_start_time, NULL);
+
+ gettimeofday(&ctdb->last_recovery_started, NULL);
+ gettimeofday(&ctdb->last_recovery_finished, NULL);
+
+ ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+
+ ctdb->upcalls = &ctdb_upcalls;
+
+ ctdb->statistics.statistics_start_time = timeval_current();
+
+ ctdb->capabilities = CTDB_CAP_DEFAULT;
+
+ /*
+ * Initialise this node's PNN to the unknown value. This will
+ * be set to the correct value by either ctdb_add_node() as
+ * part of loading the nodes file or by
+ * ctdb_tcp_listen_automatic() when the transport is
+ * initialised. At some point we should de-optimise this and
+ * pull it out into ctdb_start_daemon() so it is done clearly
+ * and only in one place.
+ */
+ ctdb->pnn = CTDB_UNKNOWN_PNN;
+
+ ctdb->do_checkpublicip = true;
+
+ return ctdb;
+}
+
+
+/*
+ main program
+*/
+int main(int argc, const char *argv[])
+{
+ struct ctdb_context *ctdb = NULL;
+ int interactive_opt = 0;
+ bool interactive = false;
+
+ struct poptOption popt_options[] = {
+ POPT_AUTOHELP
+ { "interactive", 'i', POPT_ARG_NONE, &interactive_opt, 0,
+ "don't fork, log to stderr", NULL },
+ POPT_TABLEEND
+ };
+ int opt, ret;
+ const char **extra_argv;
+ poptContext pc;
+ struct tevent_context *ev;
+ const char *ctdb_base;
+ struct conf_context *conf;
+ const char *logging_location;
+ const char *test_mode;
+ bool ok;
+
+ setproctitle_init(argc, discard_const(argv), environ);
+
+ /*
+ * Basic setup
+ */
+
+ /* Track allocations on the NULL talloc context so leaks can be
+ * reported by talloc's debugging facilities. */
+ talloc_enable_null_tracking();
+
+ fault_setup();
+
+ ev = tevent_context_init(NULL);
+ if (ev == NULL) {
+ fprintf(stderr, "tevent_context_init() failed\n");
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ev);
+
+ ctdb = ctdb_init(ev);
+ if (ctdb == NULL) {
+ fprintf(stderr, "Failed to init ctdb\n");
+ exit(1);
+ }
+
+ /* Default value for CTDB_BASE - don't override */
+ setenv("CTDB_BASE", CTDB_ETCDIR, 0);
+ ctdb_base = getenv("CTDB_BASE");
+ if (ctdb_base == NULL) {
+ D_ERR("CTDB_BASE not set\n");
+ exit(1);
+ }
+
+ /*
+ * Command-line option handling
+ */
+
+ /* KEEP_FIRST retains argv[0] in the leftover args, hence the
+ * extra_argv++ below to skip it. */
+ pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+ while ((opt = poptGetNextOpt(pc)) != -1) {
+ switch (opt) {
+ default:
+ fprintf(stderr, "Invalid option %s: %s\n",
+ poptBadOption(pc, 0), poptStrerror(opt));
+ goto fail;
+ }
+ }
+
+ /* If there are extra arguments then exit with usage message */
+ extra_argv = poptGetArgs(pc);
+ if (extra_argv) {
+ extra_argv++;
+ if (extra_argv[0]) {
+ poptPrintHelp(pc, stdout, 0);
+ goto fail;
+ }
+ }
+
+ interactive = (interactive_opt != 0);
+
+ /*
+ * Configuration file handling
+ */
+
+ ret = ctdbd_config_load(ctdb, &conf);
+ if (ret != 0) {
+ /* ctdbd_config_load() logs the failure */
+ goto fail;
+ }
+
+ /*
+ * Logging setup/options
+ */
+
+ test_mode = getenv("CTDB_TEST_MODE");
+
+ /* Log to stderr (ignoring configuration) when running as interactive */
+ if (interactive) {
+ logging_location = "file:";
+ setenv("CTDB_INTERACTIVE", "true", 1);
+ } else {
+ logging_location = logging_conf_location(conf);
+ }
+
+ if (strcmp(logging_location, "syslog") != 0 && test_mode == NULL) {
+ /* This can help when CTDB logging is misconfigured */
+ syslog(LOG_DAEMON|LOG_NOTICE,
+ "CTDB logging to location %s",
+ logging_location);
+ }
+
+ /* Initialize logging and set the debug level */
+ ok = ctdb_logging_init(ctdb,
+ logging_location,
+ logging_conf_log_level(conf));
+ if (!ok) {
+ goto fail;
+ }
+ /* Export logging settings for child processes and scripts. */
+ setenv("CTDB_LOGGING", logging_location, 1);
+ setenv("CTDB_DEBUGLEVEL", debug_level_to_string(DEBUGLEVEL), 1);
+
+ script_log_level = debug_level_from_string(
+ ctdb_config.script_log_level);
+
+ D_NOTICE("CTDB starting on node\n");
+
+ /*
+ * Cluster setup/options
+ */
+
+ ret = ctdb_set_transport(ctdb, ctdb_config.transport);
+ if (ret == -1) {
+ D_ERR("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb));
+ goto fail;
+ }
+
+ /* "cluster lock" is the new name; fall back to the legacy
+ * "recovery lock" setting if it is not configured. */
+ if (ctdb_config.cluster_lock != NULL) {
+ ctdb->recovery_lock = ctdb_config.cluster_lock;
+ } else if (ctdb_config.recovery_lock != NULL) {
+ ctdb->recovery_lock = ctdb_config.recovery_lock;
+ } else {
+ D_WARNING("Cluster lock not set\n");
+ }
+
+ /* tell ctdb what address to listen on */
+ if (ctdb_config.node_address) {
+ ret = ctdb_set_address(ctdb, ctdb_config.node_address);
+ if (ret == -1) {
+ D_ERR("ctdb_set_address failed - %s\n",
+ ctdb_errstr(ctdb));
+ goto fail;
+ }
+ }
+
+ /* tell ctdb what nodes are available */
+ ctdb->nodes_file = talloc_asprintf(ctdb, "%s/nodes", ctdb_base);
+ if (ctdb->nodes_file == NULL) {
+ DBG_ERR(" Out of memory\n");
+ goto fail;
+ }
+ ctdb_load_nodes_file(ctdb);
+
+ /*
+ * Database setup/options
+ */
+
+ ctdb->db_directory = ctdb_config.dbdir_volatile;
+ ok = directory_exist(ctdb->db_directory);
+ if (! ok) {
+ D_ERR("Volatile database directory %s does not exist\n",
+ ctdb->db_directory);
+ goto fail;
+ }
+
+ ctdb->db_directory_persistent = ctdb_config.dbdir_persistent;
+ ok = directory_exist(ctdb->db_directory_persistent);
+ if (! ok) {
+ D_ERR("Persistent database directory %s does not exist\n",
+ ctdb->db_directory_persistent);
+ goto fail;
+ }
+
+ ctdb->db_directory_state = ctdb_config.dbdir_state;
+ ok = directory_exist(ctdb->db_directory_state);
+ if (! ok) {
+ D_ERR("State database directory %s does not exist\n",
+ ctdb->db_directory_state);
+ goto fail;
+ }
+
+ if (ctdb_config.lock_debug_script != NULL) {
+ ret = setenv("CTDB_DEBUG_LOCKS",
+ ctdb_config.lock_debug_script,
+ 1);
+ if (ret != 0) {
+ D_ERR("Failed to set up lock debugging (%s)\n",
+ strerror(errno));
+ goto fail;
+ }
+ }
+
+ /*
+ * Legacy setup/options
+ */
+
+ ctdb->start_as_disabled = (int)ctdb_config.start_as_disabled;
+ ctdb->start_as_stopped = (int)ctdb_config.start_as_stopped;
+
+ /* set ctdbd capabilities */
+ if (!ctdb_config.lmaster_capability) {
+ ctdb->capabilities &= ~CTDB_CAP_LMASTER;
+ }
+ if (!ctdb_config.leader_capability) {
+ ctdb->capabilities &= ~CTDB_CAP_RECMASTER;
+ }
+
+ ctdb->do_setsched = ctdb_config.realtime_scheduling;
+
+ /*
+ * Miscellaneous setup
+ */
+
+ ctdb_tunables_load(ctdb);
+
+ ctdb->event_script_dir = talloc_asprintf(ctdb,
+ "%s/events/legacy",
+ ctdb_base);
+ if (ctdb->event_script_dir == NULL) {
+ DBG_ERR("Out of memory\n");
+ goto fail;
+ }
+
+ ctdb->notification_script = talloc_asprintf(ctdb,
+ "%s/notify.sh",
+ ctdb_base);
+ if (ctdb->notification_script == NULL) {
+ D_ERR("Unable to set notification script\n");
+ goto fail;
+ }
+
+ /*
+ * Testing and debug options
+ */
+
+ /* Test mode disables realtime scheduling and public IP checks so
+ * the daemon can run unprivileged in the test harness. */
+ if (test_mode != NULL) {
+ ctdb->do_setsched = false;
+ ctdb->do_checkpublicip = false;
+ fast_start = true;
+ }
+
+ /* start the protocol running (as a child) */
+ return ctdb_start_daemon(ctdb, interactive, test_mode != NULL);
+
+fail:
+ talloc_free(ctdb);
+ exit(1);
+}
diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c
new file mode 100644
index 0000000..3ea7d74
--- /dev/null
+++ b/ctdb/server/eventscript.c
@@ -0,0 +1,845 @@
+/*
+ event script handling
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "system/dir.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/dir.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/reqid.h"
+#include "common/sock_io.h"
+#include "common/path.h"
+
+#include "protocol/protocol_util.h"
+#include "event/event_protocol_api.h"
+
+/*
+ * Setting up event daemon
+ */
+
/*
 * State for managing the event daemon (ctdb-eventd) and the client
 * connection to it.  One instance hangs off struct ctdb_context.
 */
struct eventd_context {
	struct tevent_context *ev;
	const char *path;	/* path to the ctdb-eventd binary */
	const char *socket;	/* unix socket used to talk to eventd */

	/* server state */
	pid_t eventd_pid;	/* PID of eventd child, -1 if not running */
	struct tevent_fd *eventd_fde;	/* fires if eventd dies (pipe EOF) */

	/* client state */
	struct reqid_context *idr;	/* allocator for request ids */
	struct sock_queue *queue;	/* packet queue on the eventd socket */
	struct eventd_client_state *calls;	/* list of in-flight requests */
};
+
+static bool eventd_context_init(TALLOC_CTX *mem_ctx,
+ struct ctdb_context *ctdb,
+ struct eventd_context **out)
+{
+ struct eventd_context *ectx;
+ const char *eventd = CTDB_HELPER_BINDIR "/ctdb-eventd";
+ const char *value;
+ int ret;
+
+ ectx = talloc_zero(mem_ctx, struct eventd_context);
+ if (ectx == NULL) {
+ return false;
+ }
+
+ ectx->ev = ctdb->ev;
+
+ value = getenv("CTDB_EVENTD");
+ if (value != NULL) {
+ eventd = value;
+ }
+
+ ectx->path = talloc_strdup(ectx, eventd);
+ if (ectx->path == NULL) {
+ talloc_free(ectx);
+ return false;
+ }
+
+ ectx->socket = path_socket(ectx, "eventd");
+ if (ectx->socket == NULL) {
+ talloc_free(ectx);
+ return false;
+ }
+
+ ret = reqid_init(ectx, 1, &ectx->idr);
+ if (ret != 0) {
+ talloc_free(ectx);
+ return false;
+ }
+
+ ectx->eventd_pid = -1;
+
+ *out = ectx;
+ return true;
+}
+
/* Result of waiting for eventd to report startup over its pipe */
struct eventd_startup_state {
	bool done;	/* wait loop termination flag */
	int ret;	/* errno-style result, 0 on success */
	int fd;		/* read end of the startup pipe */
};
+
/* Startup wait timed out - end the wait loop with ETIMEDOUT */
static void eventd_startup_timeout_handler(struct tevent_context *ev,
					   struct tevent_timer *te,
					   struct timeval t,
					   void *private_data)
{
	struct eventd_startup_state *state =
		(struct eventd_startup_state *) private_data;

	state->done = true;
	state->ret = ETIMEDOUT;
}
+
/*
 * Read the single status word that eventd writes to its startup pipe
 * and translate it into an errno-style result:
 *   0     - eventd initialised successfully
 *   EIO   - eventd reported a non-zero status
 *   EPIPE - pipe closed before any status arrived (eventd died)
 *   other - read error (errno) or unexpected short read (EINVAL)
 */
static void eventd_startup_handler(struct tevent_context *ev,
				   struct tevent_fd *fde, uint16_t flags,
				   void *private_data)
{
	struct eventd_startup_state *state =
		(struct eventd_startup_state *)private_data;
	unsigned int data;
	ssize_t num_read;

	num_read = sys_read(state->fd, &data, sizeof(data));
	if (num_read == sizeof(data)) {
		if (data == 0) {
			state->ret = 0;
		} else {
			state->ret = EIO;
		}
	} else if (num_read == 0) {
		/* EOF: the child exited without writing a status */
		state->ret = EPIPE;
	} else if (num_read == -1) {
		state->ret = errno;
	} else {
		/* partial read of the status word */
		state->ret = EINVAL;
	}

	state->done = true;
}
+
+
/*
 * Wait (blocking, via a nested event loop) for eventd to report its
 * startup status on the pipe, giving up after 10 seconds.  Returns an
 * errno-style value: 0 on success, ETIMEDOUT on timeout, ENOMEM on
 * allocation failure, or the error reported by the startup handler.
 */
static int wait_for_daemon_startup(struct tevent_context *ev,
				   int fd)
{
	TALLOC_CTX *mem_ctx;
	struct tevent_timer *timer;
	struct tevent_fd *fde;
	struct eventd_startup_state state = {
		.done = false,
		.ret = 0,
		.fd = fd,
	};

	mem_ctx = talloc_new(ev);
	if (mem_ctx == NULL) {
		return ENOMEM;
	}

	/* Hard 10 second limit on daemon startup */
	timer = tevent_add_timer(ev,
				 mem_ctx,
				 tevent_timeval_current_ofs(10, 0),
				 eventd_startup_timeout_handler,
				 &state);
	if (timer == NULL) {
		talloc_free(mem_ctx);
		return ENOMEM;
	}

	fde = tevent_add_fd(ev,
			    mem_ctx,
			    fd,
			    TEVENT_FD_READ,
			    eventd_startup_handler,
			    &state);
	if (fde == NULL) {
		talloc_free(mem_ctx);
		return ENOMEM;
	}

	/* Nested event loop: run until either the status is read or
	 * the timer fires, whichever comes first */
	while (! state.done) {
		tevent_loop_once(ev);
	}

	/* Freeing mem_ctx removes both the timer and the fd event */
	talloc_free(mem_ctx);

	return state.ret;
}
+
+
+/*
+ * Start and stop event daemon
+ */
+
+static bool eventd_client_connect(struct eventd_context *ectx);
+static void eventd_dead_handler(struct tevent_context *ev,
+ struct tevent_fd *fde, uint16_t flags,
+ void *private_data);
+
/*
 * Fork and exec the event daemon (ctdb-eventd), wait for it to report
 * successful startup over a pipe, then establish the client socket
 * connection to it.  Returns 0 on success, -1 on failure.
 */
int ctdb_start_eventd(struct ctdb_context *ctdb)
{
	struct eventd_context *ectx;
	const char **argv;
	int fd[2];
	pid_t pid;
	int ret;
	bool status;

	/* Create the context lazily on first start */
	if (ctdb->ectx == NULL) {
		status = eventd_context_init(ctdb, ctdb, &ctdb->ectx);
		if (! status) {
			DEBUG(DEBUG_ERR,
			      ("Failed to initialize eventd context\n"));
			return -1;
		}
	}

	ectx = ctdb->ectx;

	/* Remove any stale socket left behind by a previous daemon */
	if (! sock_clean(ectx->socket)) {
		return -1;
	}

	/* Pipe for startup notification: the child writes its status
	 * to fd[1] (passed via -S), the parent reads from fd[0] */
	ret = pipe(fd);
	if (ret != 0) {
		return -1;
	}

	argv = talloc_array(ectx, const char *, 6);
	if (argv == NULL) {
		close(fd[0]);
		close(fd[1]);
		return -1;
	}

	/* -P: parent PID for eventd to monitor, -S: startup fd */
	argv[0] = ectx->path;
	argv[1] = "-P";
	argv[2] = talloc_asprintf(argv, "%d", ctdb->ctdbd_pid);
	argv[3] = "-S";
	argv[4] = talloc_asprintf(argv, "%d", fd[1]);
	argv[5] = NULL;

	if (argv[2] == NULL || argv[4] == NULL) {
		close(fd[0]);
		close(fd[1]);
		talloc_free(argv);
		return -1;
	}

	D_NOTICE("Starting event daemon %s %s %s %s %s\n",
		 argv[0],
		 argv[1],
		 argv[2],
		 argv[3],
		 argv[4]);

	pid = ctdb_fork(ctdb);
	if (pid == -1) {
		close(fd[0]);
		close(fd[1]);
		talloc_free(argv);
		return -1;
	}

	if (pid == 0) {
		/* Child: keep fd[1] open for the startup status,
		 * close the parent's read end */
		close(fd[0]);
		ret = execv(argv[0], discard_const(argv));
		if (ret == -1) {
			_exit(errno);
		}
		_exit(0);
	}

	/* Parent: close the write end so EOF is seen on fd[0] if the
	 * child dies before writing its status */
	talloc_free(argv);
	close(fd[1]);

	ret = wait_for_daemon_startup(ctdb->ev, fd[0]);
	if (ret != 0) {
		ctdb_kill(ctdb, pid, SIGKILL);
		close(fd[0]);
		D_ERR("Failed to initialize event daemon (%d)\n", ret);
		return -1;
	}

	/* Keep fd[0] open: it becomes readable (EOF) when eventd
	 * exits, which triggers eventd_dead_handler() */
	ectx->eventd_fde = tevent_add_fd(ctdb->ev, ectx, fd[0],
					 TEVENT_FD_READ,
					 eventd_dead_handler, ectx);
	if (ectx->eventd_fde == NULL) {
		ctdb_kill(ctdb, pid, SIGKILL);
		close(fd[0]);
		return -1;
	}

	tevent_fd_set_auto_close(ectx->eventd_fde);
	ectx->eventd_pid = pid;

	status = eventd_client_connect(ectx);
	if (! status) {
		DEBUG(DEBUG_ERR, ("Failed to connect to event daemon\n"));
		ctdb_stop_eventd(ctdb);
		return -1;
	}

	return 0;
}
+
/*
 * The monitoring fd to eventd became readable (EOF): eventd died
 * unexpectedly.  ctdbd cannot run events without it, so exit.
 */
static void eventd_dead_handler(struct tevent_context *ev,
				struct tevent_fd *fde, uint16_t flags,
				void *private_data)
{
	D_ERR("Eventd went away - exiting\n");
	exit(1);
}
+
+void ctdb_stop_eventd(struct ctdb_context *ctdb)
+{
+ struct eventd_context *ectx = ctdb->ectx;
+
+ if (ectx == NULL) {
+ return;
+ }
+
+ TALLOC_FREE(ectx->eventd_fde);
+ if (ectx->eventd_pid != -1) {
+ kill(ectx->eventd_pid, SIGTERM);
+ ectx->eventd_pid = -1;
+ }
+ TALLOC_FREE(ctdb->ectx);
+}
+
+/*
+ * Connect to event daemon
+ */
+
/*
 * One in-flight request to the event daemon.  Linked into
 * eventd_context->calls until the matching reply arrives or the
 * request state is freed (which cancels interest in the reply).
 */
struct eventd_client_state {
	struct eventd_client_state *prev, *next;

	struct eventd_context *ectx;
	/* invoked with the daemon's reply */
	void (*callback)(struct ctdb_event_reply *reply, void *private_data);
	void *private_data;

	uint32_t reqid;		/* id used to match replies to requests */
	uint8_t *buf;		/* marshalled request packet */
	size_t buflen;
};
+
+static void eventd_client_read(uint8_t *buf, size_t buflen,
+ void *private_data);
+static int eventd_client_state_destructor(struct eventd_client_state *state);
+
+static bool eventd_client_connect(struct eventd_context *ectx)
+{
+ int fd;
+
+ if (ectx->queue != NULL) {
+ return true;
+ }
+
+ fd = sock_connect(ectx->socket);
+ if (fd == -1) {
+ return false;
+ }
+
+ ectx->queue = sock_queue_setup(ectx, ectx->ev, fd,
+ eventd_client_read, ectx);
+ if (ectx->queue == NULL) {
+ close(fd);
+ return false;
+ }
+
+ return true;
+}
+
/*
 * Marshall a request, queue it to the event daemon and register a
 * callback for the reply.  The request state is a talloc child of
 * mem_ctx; freeing it before the reply arrives cancels interest in
 * the reply (see the destructor).  Returns 0 on success, -1 on error.
 */
static int eventd_client_write(struct eventd_context *ectx,
			       TALLOC_CTX *mem_ctx,
			       struct ctdb_event_request *request,
			       void (*callback)(struct ctdb_event_reply *reply,
						void *private_data),
			       void *private_data)
{
	struct ctdb_event_header header = { 0 };
	struct eventd_client_state *state;
	int ret;

	/* (Re)establish the socket connection if necessary */
	if (! eventd_client_connect(ectx)) {
		return -1;
	}

	state = talloc_zero(mem_ctx, struct eventd_client_state);
	if (state == NULL) {
		return -1;
	}

	state->ectx = ectx;
	state->callback = callback;
	state->private_data = private_data;

	state->reqid = reqid_new(ectx->idr, state);
	if (state->reqid == REQID_INVALID) {
		talloc_free(state);
		return -1;
	}

	/* Destructor releases the reqid and unlinks from ectx->calls */
	talloc_set_destructor(state, eventd_client_state_destructor);

	header.reqid = state->reqid;

	state->buflen = ctdb_event_request_len(&header, request);
	state->buf = talloc_size(state, state->buflen);
	if (state->buf == NULL) {
		talloc_free(state);
		return -1;
	}

	ret = ctdb_event_request_push(&header,
				      request,
				      state->buf,
				      &state->buflen);
	if (ret != 0) {
		talloc_free(state);
		return -1;
	}

	ret = sock_queue_write(ectx->queue, state->buf, state->buflen);
	if (ret != 0) {
		talloc_free(state);
		return -1;
	}

	DLIST_ADD(ectx->calls, state);

	return 0;
}
+
/*
 * Destructor for an in-flight request: release the request id and
 * unlink from the pending-calls list so a late reply is ignored.
 */
static int eventd_client_state_destructor(struct eventd_client_state *state)
{
	struct eventd_context *ectx = state->ectx;

	reqid_remove(ectx->idr, state->reqid);
	DLIST_REMOVE(ectx->calls, state);
	return 0;
}
+
/*
 * Handle a packet (or disconnect) from the event daemon.  Parses the
 * reply, matches it to the pending request via the reqid in the
 * header and invokes that request's callback.  Unmatched or malformed
 * replies are dropped.
 */
static void eventd_client_read(uint8_t *buf, size_t buflen,
			       void *private_data)
{
	struct eventd_context *ectx = talloc_get_type_abort(
		private_data, struct eventd_context);
	struct eventd_client_state *state;
	struct ctdb_event_header header;
	struct ctdb_event_reply *reply;
	int ret;

	if (buf == NULL) {
		/* connection lost */
		TALLOC_FREE(ectx->queue);
		return;
	}

	ret = ctdb_event_reply_pull(buf, buflen, &header, ectx, &reply);
	if (ret != 0) {
		D_ERR("Invalid packet received, ret=%d\n", ret);
		return;
	}

	if (buflen != header.length) {
		D_ERR("Packet size mismatch %zu != %"PRIu32"\n",
		      buflen, header.length);
		talloc_free(reply);
		return;
	}

	state = reqid_find(ectx->idr, header.reqid,
			   struct eventd_client_state);
	if (state == NULL) {
		/* no matching request - probably already cancelled */
		talloc_free(reply);
		return;
	}

	if (state->reqid != header.reqid) {
		/* reqid slot was reused - not a reply for this state */
		talloc_free(reply);
		return;
	}

	/* Reparent state under reply so both are freed together below,
	 * regardless of what the callback does */
	state = talloc_steal(reply, state);
	state->callback(reply, state->private_data);
	talloc_free(reply);
}
+
+/*
+ * Run an event
+ */
+
/* State for a single "run event" request to the event daemon */
struct eventd_client_run_state {
	struct eventd_context *ectx;
	/* invoked with the script result code */
	void (*callback)(int result, void *private_data);
	void *private_data;
};
+
+static void eventd_client_run_done(struct ctdb_event_reply *reply,
+ void *private_data);
+
+static int eventd_client_run(struct eventd_context *ectx,
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(int result,
+ void *private_data),
+ void *private_data,
+ enum ctdb_event event,
+ const char *arg_str,
+ uint32_t timeout)
+{
+ struct eventd_client_run_state *state;
+ struct ctdb_event_request request;
+ struct ctdb_event_request_run rdata;
+ int ret;
+
+ state = talloc_zero(mem_ctx, struct eventd_client_run_state);
+ if (state == NULL) {
+ return -1;
+ }
+
+ state->ectx = ectx;
+ state->callback = callback;
+ state->private_data = private_data;
+
+ rdata.component = "legacy";
+ rdata.event = ctdb_event_to_string(event);
+ rdata.args = arg_str;
+ rdata.timeout = timeout;
+ rdata.flags = 0;
+
+ request.cmd = CTDB_EVENT_CMD_RUN;
+ request.data.run = &rdata;
+
+ ret = eventd_client_write(ectx, state, &request,
+ eventd_client_run_done, state);
+ if (ret != 0) {
+ talloc_free(state);
+ return ret;
+ }
+
+ return 0;
+}
+
/*
 * Reply handler for a run request: detach the state from the reply
 * (which our caller eventd_client_read() frees) and pass the result
 * code to the caller's callback.
 */
static void eventd_client_run_done(struct ctdb_event_reply *reply,
				   void *private_data)
{
	struct eventd_client_run_state *state = talloc_get_type_abort(
		private_data, struct eventd_client_run_state);

	/* Reparent off the reply before the reply is freed */
	state = talloc_steal(state->ectx, state);
	state->callback(reply->result, state->private_data);
	talloc_free(state);
}
+
+/*
+ * CTDB event script functions
+ */
+
+int ctdb_event_script_run(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(struct ctdb_context *ctdb,
+ int result, void *private_data),
+ void *private_data,
+ enum ctdb_event event,
+ const char *fmt, va_list ap)
+ PRINTF_ATTRIBUTE(6,0);
+
/* State for an asynchronous event script run */
struct ctdb_event_script_run_state {
	struct ctdb_context *ctdb;
	/* invoked with the script result */
	void (*callback)(struct ctdb_context *ctdb, int result,
			 void *private_data);
	void *private_data;
	enum ctdb_event event;	/* kept for timeout special-casing */
};
+
+static bool event_allowed_during_recovery(enum ctdb_event event);
+static void ctdb_event_script_run_done(int result, void *private_data);
+static bool check_options(enum ctdb_event call, const char *options);
+
+int ctdb_event_script_run(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(struct ctdb_context *ctdb,
+ int result, void *private_data),
+ void *private_data,
+ enum ctdb_event event,
+ const char *fmt, va_list ap)
+{
+ struct ctdb_event_script_run_state *state;
+ char *arg_str;
+ int ret;
+
+ if ( (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) &&
+ (! event_allowed_during_recovery(event)) ) {
+ DEBUG(DEBUG_ERR,
+ ("Refusing to run event '%s' while in recovery\n",
+ ctdb_eventscript_call_names[event]));
+ return -1;
+ }
+
+ state = talloc_zero(mem_ctx, struct ctdb_event_script_run_state);
+ if (state == NULL) {
+ return -1;
+ }
+
+ state->ctdb = ctdb;
+ state->callback = callback;
+ state->private_data = private_data;
+ state->event = event;
+
+ if (fmt != NULL) {
+ arg_str = talloc_vasprintf(state, fmt, ap);
+ if (arg_str == NULL) {
+ talloc_free(state);
+ return -1;
+ }
+ } else {
+ arg_str = NULL;
+ }
+
+ if (! check_options(event, arg_str)) {
+ DEBUG(DEBUG_ERR,
+ ("Bad event script arguments '%s' for '%s'\n",
+ arg_str, ctdb_eventscript_call_names[event]));
+ talloc_free(arg_str);
+ return -1;
+ }
+
+ ret = eventd_client_run(ctdb->ectx, state,
+ ctdb_event_script_run_done, state,
+ event, arg_str, ctdb->tunable.script_timeout);
+ if (ret != 0) {
+ talloc_free(state);
+ return ret;
+ }
+
+ DEBUG(DEBUG_INFO,
+ (__location__ " Running event %s with arguments %s\n",
+ ctdb_eventscript_call_names[event], arg_str));
+
+ talloc_free(arg_str);
+ return 0;
+}
+
/*
 * Completion handler for an event script run.  For recovery and IP
 * takeover events a hung (timed-out) script is deliberately ignored -
 * failing those events would be worse than continuing - so the result
 * is rewritten to success.
 */
static void ctdb_event_script_run_done(int result, void *private_data)
{
	struct ctdb_event_script_run_state *state = talloc_get_type_abort(
		private_data, struct ctdb_event_script_run_state);

	if (result == ETIMEDOUT) {
		switch (state->event) {
		case CTDB_EVENT_START_RECOVERY:
		case CTDB_EVENT_RECOVERED:
		case CTDB_EVENT_TAKE_IP:
		case CTDB_EVENT_RELEASE_IP:
			DEBUG(DEBUG_ERR,
			      ("Ignoring hung script for %s event\n",
			       ctdb_eventscript_call_names[state->event]));
			result = 0;
			break;

		default:
			break;
		}
	}

	/* Reparent to ctdb so the callback may free the original
	 * mem_ctx without invalidating state */
	state = talloc_steal(state->ctdb, state);
	state->callback(state->ctdb, result, state->private_data);
	talloc_free(state);
}
+
+
+static unsigned int count_words(const char *options)
+{
+ unsigned int words = 0;
+
+ if (options == NULL) {
+ return 0;
+ }
+
+ options += strspn(options, " \t");
+ while (*options) {
+ words++;
+ options += strcspn(options, " \t");
+ options += strspn(options, " \t");
+ }
+ return words;
+}
+
/*
 * Validate that the number of words in the argument string matches
 * what the given event expects.  Returns true when the argument
 * count is correct, false on mismatch or unknown event.
 */
static bool check_options(enum ctdb_event call, const char *options)
{
	switch (call) {
	/* These all take no arguments. */
	case CTDB_EVENT_INIT:
	case CTDB_EVENT_SETUP:
	case CTDB_EVENT_STARTUP:
	case CTDB_EVENT_START_RECOVERY:
	case CTDB_EVENT_RECOVERED:
	case CTDB_EVENT_MONITOR:
	case CTDB_EVENT_SHUTDOWN:
	case CTDB_EVENT_IPREALLOCATED:
		return count_words(options) == 0;

	case CTDB_EVENT_TAKE_IP: /* interface, IP address, netmask bits. */
	case CTDB_EVENT_RELEASE_IP:
		return count_words(options) == 3;

	case CTDB_EVENT_UPDATE_IP: /* old interface, new interface, IP address, netmask bits. */
		return count_words(options) == 4;

	default:
		DEBUG(DEBUG_ERR,(__location__ "Unknown ctdb_event %u\n", call));
		return false;
	}
}
+
+/* only specific events are allowed while in recovery */
+static bool event_allowed_during_recovery(enum ctdb_event event)
+{
+ const enum ctdb_event allowed_events[] = {
+ CTDB_EVENT_INIT,
+ CTDB_EVENT_SETUP,
+ CTDB_EVENT_START_RECOVERY,
+ CTDB_EVENT_SHUTDOWN,
+ CTDB_EVENT_RELEASE_IP,
+ CTDB_EVENT_IPREALLOCATED,
+ };
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(allowed_events); i++) {
+ if (event == allowed_events[i]) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
/*
 * Run the event script in the background, calling the callback when
 * finished.  If mem_ctx is freed, the callback will never be called.
 * This is a varargs convenience wrapper around ctdb_event_script_run().
 */
int ctdb_event_script_callback(struct ctdb_context *ctdb,
			       TALLOC_CTX *mem_ctx,
			       void (*callback)(struct ctdb_context *, int, void *),
			       void *private_data,
			       enum ctdb_event call,
			       const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	ret = ctdb_event_script_run(ctdb, mem_ctx, callback, private_data,
				    call, fmt, ap);
	va_end(ap);

	return ret;
}
+
+
/* State for the synchronous wrapper ctdb_event_script_args() */
struct ctdb_event_script_args_state {
	bool done;	/* set once the completion callback has fired */
	int status;	/* event script result */
};

/* Completion callback: record the result and end the wait loop */
static void ctdb_event_script_args_done(struct ctdb_context *ctdb,
					int status, void *private_data)
{
	struct ctdb_event_script_args_state *s =
		(struct ctdb_event_script_args_state *)private_data;

	s->done = true;
	s->status = status;
}
+
/*
 * Run the event script, waiting for it to complete (via a nested
 * event loop).  Used when the caller doesn't want to continue till
 * the event script has finished.  A timed-out script causes this
 * node to ban itself, except during startup/shutdown.
 */
int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_event call,
			   const char *fmt, ...)
{
	va_list ap;
	int ret;
	struct ctdb_event_script_args_state state = {
		.status = -1,
		.done = false,
	};

	va_start(ap, fmt);
	ret = ctdb_event_script_run(ctdb, ctdb,
				    ctdb_event_script_args_done, &state,
				    call, fmt, ap);
	va_end(ap);
	if (ret != 0) {
		return ret;
	}

	/* Nested event loop until the completion callback fires */
	while (! state.done) {
		tevent_loop_once(ctdb->ev);
	}

	if (state.status == ETIMEDOUT) {
		/* Don't ban self if CTDB is starting up or shutting down */
		if (call != CTDB_EVENT_INIT && call != CTDB_EVENT_SHUTDOWN) {
			DEBUG(DEBUG_ERR,
			      (__location__ " eventscript for '%s' timed out."
			       " Immediately banning ourself for %d seconds\n",
			       ctdb_eventscript_call_names[call],
			       ctdb->tunable.recovery_ban_period));
			ctdb_ban_self(ctdb);
		}
	}

	return state.status;
}
+
/*
 * Run an event that takes no arguments, waiting for completion.
 * A NULL format string makes ctdb_event_script_run() skip building
 * an argument string entirely.
 */
int ctdb_event_script(struct ctdb_context *ctdb, enum ctdb_event call)
{
	return ctdb_event_script_args(ctdb, call, NULL);
}
+
+void ctdb_event_reopen_logs(struct ctdb_context *ctdb)
+{
+ if (ctdb->ectx->eventd_pid > 0) {
+ kill(ctdb->ectx->eventd_pid, SIGHUP);
+ }
+}
diff --git a/ctdb/server/ipalloc.c b/ctdb/server/ipalloc.c
new file mode 100644
index 0000000..7f49364
--- /dev/null
+++ b/ctdb/server/ipalloc.c
@@ -0,0 +1,284 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+#include "common/rb_tree.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/* Initialise main ipalloc state and sub-structures */
+struct ipalloc_state *
+ipalloc_state_init(TALLOC_CTX *mem_ctx,
+ uint32_t num_nodes,
+ enum ipalloc_algorithm algorithm,
+ bool no_ip_takeover,
+ bool no_ip_failback,
+ uint32_t *force_rebalance_nodes)
+{
+ struct ipalloc_state *ipalloc_state =
+ talloc_zero(mem_ctx, struct ipalloc_state);
+ if (ipalloc_state == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+ return NULL;
+ }
+
+ ipalloc_state->num = num_nodes;
+
+ ipalloc_state->algorithm = algorithm;
+ ipalloc_state->no_ip_takeover = no_ip_takeover;
+ ipalloc_state->no_ip_failback = no_ip_failback;
+ ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
+
+ return ipalloc_state;
+}
+
+static void *add_ip_callback(void *parm, void *data)
+{
+ struct public_ip_list *this_ip = parm;
+ struct public_ip_list *prev_ip = data;
+
+ if (prev_ip == NULL) {
+ return parm;
+ }
+ if (this_ip->pnn == CTDB_UNKNOWN_PNN) {
+ this_ip->pnn = prev_ip->pnn;
+ }
+
+ return parm;
+}
+
/*
 * Tree traversal callback: prepend each stored address entry to the
 * linked list accumulated in *param.  Because entries are prepended,
 * the resulting list is in reverse traversal order.
 */
static int getips_count_callback(void *param, void *data)
{
	struct public_ip_list **ip_list = (struct public_ip_list **)param;
	struct public_ip_list *new_ip = (struct public_ip_list *)data;

	new_ip->next = *ip_list;
	*ip_list = new_ip;
	return 0;
}
+
+/* Nodes only know about those public addresses that they are
+ * configured to serve and no individual node has a full list of all
+ * public addresses configured across the cluster. Therefore, a
+ * merged list of all public addresses needs to be built so that IP
+ * allocation can be done. */
+static struct public_ip_list *
+create_merged_ip_list(struct ipalloc_state *ipalloc_state)
+{
+ unsigned int i, j;
+ struct public_ip_list *ip_list;
+ struct ctdb_public_ip_list *public_ips;
+ struct trbt_tree *ip_tree;
+ int ret;
+
+ ip_tree = trbt_create(ipalloc_state, 0);
+
+ if (ipalloc_state->known_public_ips == NULL) {
+ DEBUG(DEBUG_ERR, ("Known public IPs not set\n"));
+ return NULL;
+ }
+
+ for (i=0; i < ipalloc_state->num; i++) {
+
+ public_ips = &ipalloc_state->known_public_ips[i];
+
+ for (j=0; j < public_ips->num; j++) {
+ struct public_ip_list *tmp_ip;
+
+ /* This is returned as part of ip_list */
+ tmp_ip = talloc_zero(ipalloc_state, struct public_ip_list);
+ if (tmp_ip == NULL) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " out of memory\n"));
+ talloc_free(ip_tree);
+ return NULL;
+ }
+
+ /* Do not use information about IP addresses hosted
+ * on other nodes, it may not be accurate */
+ if (public_ips->ip[j].pnn == i) {
+ tmp_ip->pnn = public_ips->ip[j].pnn;
+ } else {
+ tmp_ip->pnn = CTDB_UNKNOWN_PNN;
+ }
+ tmp_ip->addr = public_ips->ip[j].addr;
+ tmp_ip->next = NULL;
+
+ trbt_insertarray32_callback(ip_tree,
+ IP_KEYLEN, ip_key(&public_ips->ip[j].addr),
+ add_ip_callback,
+ tmp_ip);
+ }
+ }
+
+ ip_list = NULL;
+ ret = trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
+ if (ret != 0) {
+ DBG_ERR("Error traversing the IP tree.\n");
+ }
+
+ talloc_free(ip_tree);
+
+ return ip_list;
+}
+
/*
 * For every address in all_ips, build two per-node bitmaps:
 * known_on (the node has the address configured) and available_on
 * (the node can currently host it).  Returns false on allocation
 * failure.
 */
static bool populate_bitmap(struct ipalloc_state *ipalloc_state)
{
	struct public_ip_list *ip = NULL;
	unsigned int i, j;

	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {

		ip->known_on = bitmap_talloc(ip, ipalloc_state->num);
		if (ip->known_on == NULL) {
			return false;
		}

		ip->available_on = bitmap_talloc(ip, ipalloc_state->num);
		if (ip->available_on == NULL) {
			return false;
		}

		for (i = 0; i < ipalloc_state->num; i++) {
			struct ctdb_public_ip_list *known =
				&ipalloc_state->known_public_ips[i];
			struct ctdb_public_ip_list *avail =
				&ipalloc_state->available_public_ips[i];

			/* Check to see if "ip" is available on node "i" */
			for (j = 0; j < avail->num; j++) {
				if (ctdb_sock_addr_same_ip(
					    &ip->addr, &avail->ip[j].addr)) {
					bitmap_set(ip->available_on, i);
					break;
				}
			}

			/* Optimisation: available => known */
			if (bitmap_query(ip->available_on, i)) {
				bitmap_set(ip->known_on, i);
				continue;
			}

			/* Check to see if "ip" is known on node "i" */
			for (j = 0; j < known->num; j++) {
				if (ctdb_sock_addr_same_ip(
					    &ip->addr, &known->ip[j].addr)) {
					bitmap_set(ip->known_on, i);
					break;
				}
			}
		}
	}

	return true;
}
+
/*
 * Record the per-node lists of known and available public IPs for use
 * by the allocation algorithms.  Both arrays are indexed by PNN (used
 * elsewhere with indices 0..num-1); ownership remains with the caller.
 */
void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state,
			    struct ctdb_public_ip_list *known_ips,
			    struct ctdb_public_ip_list *available_ips)
{
	ipalloc_state->available_public_ips = available_ips;
	ipalloc_state->known_public_ips = known_ips;
}
+
/* This can only return false if there are no available IPs *and*
 * there are no IP addresses currently allocated.  If the latter is
 * true then the cluster can clearly host IPs... just not necessarily
 * right now... */
bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state)
{
	unsigned int i;
	bool have_ips = false;

	/* First pass: succeed immediately if any known address is
	 * currently hosted by the node that reported it */
	for (i=0; i < ipalloc_state->num; i++) {
		struct ctdb_public_ip_list *ips =
			ipalloc_state->known_public_ips;
		if (ips[i].num != 0) {
			unsigned int j;
			have_ips = true;
			/* Succeed if an address is hosted on node i */
			for (j=0; j < ips[i].num; j++) {
				if (ips[i].ip[j].pnn == i) {
					return true;
				}
			}
		}
	}

	/* No addresses configured anywhere - nothing to host */
	if (! have_ips) {
		return false;
	}

	/* At this point there are known addresses but none are
	 * hosted.  Need to check if cluster can now host some
	 * addresses.
	 */
	for (i=0; i < ipalloc_state->num; i++) {
		if (ipalloc_state->available_public_ips[i].num != 0) {
			return true;
		}
	}

	return false;
}
+
/* The calculation part of the IP allocation algorithm.
 *
 * Builds the merged address list, populates the per-address bitmaps
 * and dispatches to the configured algorithm.  Returns the list with
 * ->pnn assignments filled in, or NULL on failure.
 */
struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state)
{
	bool ret = false;

	ipalloc_state->all_ips = create_merged_ip_list(ipalloc_state);
	if (ipalloc_state->all_ips == NULL) {
		return NULL;
	}

	if (!populate_bitmap(ipalloc_state)) {
		return NULL;
	}

	switch (ipalloc_state->algorithm) {
	case IPALLOC_LCP2:
		ret = ipalloc_lcp2(ipalloc_state);
		break;
	case IPALLOC_DETERMINISTIC:
		ret = ipalloc_deterministic(ipalloc_state);
		break;
	case IPALLOC_NONDETERMINISTIC:
		ret = ipalloc_nondeterministic(ipalloc_state);
		break;
	}

	/* at this point ->pnn is the node which will own each IP
	   or CTDB_UNKNOWN_PNN if there is no node that can cover this ip
	*/

	return (ret ? ipalloc_state->all_ips : NULL);
}
diff --git a/ctdb/server/ipalloc.h b/ctdb/server/ipalloc.h
new file mode 100644
index 0000000..42aec9e
--- /dev/null
+++ b/ctdb/server/ipalloc.h
@@ -0,0 +1,67 @@
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_H__
+#define __CTDB_IPALLOC_H__
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/bitmap.h"
+
/* One element in the merged cluster-wide list of public IP addresses */
struct public_ip_list {
	struct public_ip_list *next;
	uint32_t pnn;		/* hosting node, or CTDB_UNKNOWN_PNN */
	ctdb_sock_addr addr;
	struct bitmap *known_on;	/* nodes that know this address */
	struct bitmap *available_on;	/* nodes that can host this address */
};
+
+#define IP_KEYLEN 4
+uint32_t *ip_key(ctdb_sock_addr *ip);
+
+/* Flags used in IP allocation algorithms. */
+enum ipalloc_algorithm {
+ IPALLOC_DETERMINISTIC,
+ IPALLOC_NONDETERMINISTIC,
+ IPALLOC_LCP2,
+};
+
+struct ipalloc_state;
+
+struct ipalloc_state * ipalloc_state_init(TALLOC_CTX *mem_ctx,
+ uint32_t num_nodes,
+ enum ipalloc_algorithm algorithm,
+ bool no_ip_takeover,
+ bool no_ip_failback,
+ uint32_t *force_rebalance_nodes);
+
+void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state,
+ struct ctdb_public_ip_list *known_ips,
+ struct ctdb_public_ip_list *available_ips);
+
+bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state);
+
+struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_H__ */
diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c
new file mode 100644
index 0000000..a5177d4
--- /dev/null
+++ b/ctdb/server/ipalloc_common.c
@@ -0,0 +1,192 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/time.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "common/common.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+/* Given a physical node, return the number of
+ public addresses that is currently assigned to this node.
+*/
+int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips)
+{
+ int num=0;
+
+ for (;ips;ips=ips->next) {
+ if (ips->pnn == pnn) {
+ num++;
+ }
+ }
+ return num;
+}
+
+
/* Can the given node host the given IP: is the public IP known to the
 * node and is NOIPHOST unset?
*/
static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
			     int32_t pnn,
			     struct public_ip_list *ip)
{
	/* available_on was populated from each node's available IP
	 * list (see populate_bitmap()), so a set bit means the node
	 * may host this address */
	return bitmap_query(ip->available_on, pnn);
}
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+ int32_t pnn,
+ struct public_ip_list *ip)
+{
+ if (ipalloc_state->no_ip_takeover) {
+ return false;
+ }
+
+ return can_node_host_ip(ipalloc_state, pnn, ip);
+}
+
+/* search the node lists list for a node to takeover this ip.
+ pick the node that currently are serving the least number of ips
+ so that the ips get spread out evenly.
+*/
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+ struct public_ip_list *ip)
+{
+ unsigned int pnn;
+ int min=0, num;
+ unsigned int i, numnodes;
+
+ numnodes = ipalloc_state->num;
+ pnn = CTDB_UNKNOWN_PNN;
+ for (i=0; i<numnodes; i++) {
+ /* verify that this node can serve this ip */
+ if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
+ /* no it couldn't so skip to the next node */
+ continue;
+ }
+
+ num = node_ip_coverage(i, ipalloc_state->all_ips);
+ /* was this the first node we checked ? */
+ if (pnn == CTDB_UNKNOWN_PNN) {
+ pnn = i;
+ min = num;
+ } else {
+ if (num < min) {
+ pnn = i;
+ min = num;
+ }
+ }
+ }
+ if (pnn == CTDB_UNKNOWN_PNN) {
+ DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &ip->addr,
+ false)));
+
+ return -1;
+ }
+
+ ip->pnn = pnn;
+ return 0;
+}
+
+uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+ static uint32_t key[IP_KEYLEN];
+
+ bzero(key, sizeof(key));
+
+ switch (ip->sa.sa_family) {
+ case AF_INET:
+ key[3] = htonl(ip->ip.sin_addr.s_addr);
+ break;
+ case AF_INET6: {
+ uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+ key[0] = htonl(s6_a32[0]);
+ key[1] = htonl(s6_a32[1]);
+ key[2] = htonl(s6_a32[2]);
+ key[3] = htonl(s6_a32[3]);
+ break;
+ }
+ default:
+ DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+ return key;
+ }
+
+ return key;
+}
+
/* Allocate any unassigned IPs just by looping through the IPs and
 * finding the best node for each.
 */
void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
{
	struct public_ip_list *t;

	/* loop over all ip's and find a physical node to cover for
	   each unassigned ip.
	*/
	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
		if (t->pnn == CTDB_UNKNOWN_PNN) {
			/* find_takeover_node() assigns t->pnn on
			 * success; on failure the address simply
			 * stays unassigned */
			if (find_takeover_node(ipalloc_state, t)) {
				DEBUG(DEBUG_WARNING,
				      ("Failed to find node to cover ip %s\n",
				       ctdb_sock_addr_to_string(ipalloc_state,
								&t->addr,
								false)));
			}
		}
	}
}
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
+{
+ struct public_ip_list *t;
+
+ /* verify that the assigned nodes can serve that public ip
+ and set it to CTDB_UNKNOWN_PNN if not
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == CTDB_UNKNOWN_PNN) {
+ continue;
+ }
+ if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
+ /* this node can not serve this ip. */
+ DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
+ ctdb_sock_addr_to_string(
+ ipalloc_state,
+ &t->addr, false),
+ t->pnn));
+ t->pnn = CTDB_UNKNOWN_PNN;
+ }
+ }
+}
diff --git a/ctdb/server/ipalloc_deterministic.c b/ctdb/server/ipalloc_deterministic.c
new file mode 100644
index 0000000..43680ba
--- /dev/null
+++ b/ctdb/server/ipalloc_deterministic.c
@@ -0,0 +1,191 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/path.h"
+
+#include "protocol/protocol_util.h"
+#include "lib/util/smb_strtox.h"
+#include "lib/util/memory.h"
+
+#include "server/ipalloc_private.h"
+
+/* Mapping from a public IP address to its preferred ("home") node,
+ * as read from the optional home_nodes configuration file. */
+struct home_node {
+	ctdb_sock_addr addr;
+	uint32_t pnn;
+};
+
+/* Read the optional <etcdir>/home_nodes file, which maps public IP
+ * addresses to preferred nodes, one "ADDRESS PNN" pair per line
+ * (whitespace separated; short lines are skipped).
+ *
+ * Returns a talloc array of struct home_node (length available via
+ * talloc_array_length()), or NULL if the file is missing or
+ * malformed.  The caller owns the result.
+ */
+static struct home_node *ipalloc_get_home_nodes(TALLOC_CTX *mem_ctx)
+{
+	char *line = NULL;
+	size_t len = 0;
+	char *fname = NULL;
+	FILE *fp = NULL;
+	struct home_node *result = NULL;
+
+	fname = path_etcdir_append(mem_ctx, "home_nodes");
+	if (fname == NULL) {
+		goto fail;
+	}
+
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		/* A missing file simply means no home-node overrides */
+		goto fail;
+	}
+	TALLOC_FREE(fname);
+
+	while (true) {
+		size_t num_nodes = talloc_array_length(result);
+		char *saveptr = NULL, *addrstr = NULL, *nodestr = NULL;
+		struct home_node hn = {
+			.pnn = CTDB_UNKNOWN_PNN,
+		};
+		struct home_node *tmp = NULL;
+		ssize_t n = 0;
+		int ret;
+
+		n = getline(&line, &len, fp);
+		if (n < 0) {
+			if (!feof(fp)) {
+				/* real error */
+				goto fail;
+			}
+			break;
+		}
+		if ((n > 0) && (line[n - 1] == '\n')) {
+			line[n - 1] = '\0';
+		}
+
+		addrstr = strtok_r(line, " \t", &saveptr);
+		if (addrstr == NULL) {
+			continue;
+		}
+		nodestr = strtok_r(NULL, " \t", &saveptr);
+		if (nodestr == NULL) {
+			continue;
+		}
+
+		ret = ctdb_sock_addr_from_string(addrstr, &hn.addr, false);
+		if (ret != 0) {
+			DBG_WARNING("Could not parse %s: %s\n",
+				    addrstr,
+				    strerror(ret));
+			goto fail;
+		}
+
+		hn.pnn = smb_strtoul(nodestr,
+				     NULL,
+				     10,
+				     &ret,
+				     SMB_STR_FULL_STR_CONV);
+		if (ret != 0) {
+			DBG_WARNING("Could not parse \"%s\"\n", nodestr);
+			goto fail;
+		}
+
+		tmp = talloc_realloc(mem_ctx,
+				     result,
+				     struct home_node,
+				     num_nodes + 1);
+		if (tmp == NULL) {
+			goto fail;
+		}
+		result = tmp;
+		result[num_nodes] = hn;
+	}
+
+	fclose(fp);
+	fp = NULL;
+	/* getline() allocates with malloc(); this was previously
+	 * leaked on the success path */
+	SAFE_FREE(line);
+	return result;
+
+fail:
+	if (fp != NULL) {
+		fclose(fp);
+		fp = NULL;
+	}
+	SAFE_FREE(line);
+	TALLOC_FREE(fname);
+	TALLOC_FREE(result);
+	return NULL;
+}
+
+/* "Deterministic IPs" allocation: assign each IP to node
+ * (index % numnodes), optionally overridden by a home_nodes file
+ * entry, then clear and reassign any unsuitable assignments.
+ * Always returns true.
+ */
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
+{
+	struct home_node *home_nodes = ipalloc_get_home_nodes(ipalloc_state);
+	size_t num_home_nodes = talloc_array_length(home_nodes);
+	struct public_ip_list *t;
+	int i;
+	uint32_t numnodes;
+
+	numnodes = ipalloc_state->num;
+
+	DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+	/* Allocate IPs to nodes in a modulo fashion so that IPs will
+	 * always be allocated the same way for a specific set of
+	 * available/unavailable nodes.
+	 */
+
+	for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
+		size_t j;
+
+		t->pnn = i % numnodes;
+
+		/* A home_nodes entry overrides the modulo assignment */
+		for (j = 0; j < num_home_nodes; j++) {
+			struct home_node *hn = &home_nodes[j];
+
+			if (ctdb_sock_addr_same_ip(&t->addr, &hn->addr)) {
+
+				if (hn->pnn >= numnodes) {
+					DBG_WARNING("pnn %" PRIu32
+						    " too large\n",
+						    hn->pnn);
+					break;
+				}
+
+				t->pnn = hn->pnn;
+				break;
+			}
+		}
+	}
+
+	/* IP failback doesn't make sense with deterministic
+	 * IPs, since the modulo step above implicitly fails
+	 * back IPs to their "home" node.
+	 */
+	if (ipalloc_state->no_ip_failback) {
+		/* Fixed unbalanced quote in the message below */
+		D_WARNING("WARNING: 'NoIPFailback' set but ignored - "
+			  "incompatible with 'Deterministic IPs'\n");
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	basic_allocate_unassigned(ipalloc_state);
+
+	/* No failback here! */
+
+	TALLOC_FREE(home_nodes);
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c
new file mode 100644
index 0000000..996adcf
--- /dev/null
+++ b/ctdb/server/ipalloc_lcp2.c
@@ -0,0 +1,525 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+	uint32_t k1[IP_KEYLEN];
+	uint32_t *k2;
+	uint32_t distance = 0;
+	int i;
+
+	/* ip_key() returns static storage, so the first key must be
+	 * copied before the second call overwrites it. */
+	memcpy(k1, ip_key(ip1), sizeof(k1));
+	k2 = ip_key(ip2);
+
+	for (i = 0; i < IP_KEYLEN; i++) {
+		uint32_t x = k1[i] ^ k2[i];
+
+		if (x == 0) {
+			/* This whole 32-bit word matches */
+			distance += 32;
+			continue;
+		}
+
+		/* Count the leading zero bits of the differing word.
+		 * FIXME? This could be optimised...
+		 */
+		while ((x & ((uint32_t)1 << 31)) == 0) {
+			x <<= 1;
+			distance++;
+		}
+	}
+
+	return distance;
+}
+
+/* Calculate the IP distance for the given IP relative to IPs on the
+   given node.  The ips argument is generally the all_ips variable
+   used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+				  struct public_ip_list *ips,
+				  unsigned int pnn)
+{
+	struct public_ip_list *cur;
+	uint32_t sum = 0;
+
+	for (cur = ips; cur != NULL; cur = cur->next) {
+		uint32_t d;
+
+		if (cur->pnn != pnn) {
+			continue;
+		}
+
+		/* Optimisation: never measure an address against
+		 * itself.  Addresses are identified by pointer (they
+		 * all come from all_ips), which is cheaper than a full
+		 * address comparison and lets callers compute the
+		 * effect of removing an address from a node simply by
+		 * summing against the existing addresses. */
+		if (&(cur->addr) == ip) {
+			continue;
+		}
+
+		d = ip_distance(ip, &(cur->addr));
+		/* Square the distance - cheaper than pulling in math.h :-) */
+		sum += d * d;
+	}
+
+	return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+   to the given node.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
+			       unsigned int pnn)
+{
+	struct public_ip_list *cur;
+	uint32_t total = 0;
+
+	for (cur = all_ips; cur != NULL; cur = cur->next) {
+		if (cur->pnn != pnn) {
+			continue;
+		}
+		/* Only pass the remainder of the list rather than the
+		 * whole all_ips input - pairwise distances involving
+		 * earlier addresses are already counted.
+		 */
+		total += ip_distance_2_sum(&(cur->addr), cur->next, pnn);
+	}
+
+	return total;
+}
+
+/* Compute the initial per-node LCP2 imbalance values and decide which
+ * nodes are candidates to receive rebalanced ("failed back") IPs.
+ * Both output arrays are talloc-allocated on ipalloc_state with one
+ * entry per node.  Returns false on allocation failure.
+ */
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+		      uint32_t **lcp2_imbalances,
+		      bool **rebalance_candidates)
+{
+	unsigned int i, numnodes;
+	struct public_ip_list *t;
+
+	numnodes = ipalloc_state->num;
+
+	*rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+	if (*rebalance_candidates == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return false;
+	}
+	*lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+	if (*lcp2_imbalances == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return false;
+	}
+
+	for (i=0; i<numnodes; i++) {
+		(*lcp2_imbalances)[i] =
+			lcp2_imbalance(ipalloc_state->all_ips, i);
+		/* First step: assume all nodes are candidates */
+		(*rebalance_candidates)[i] = true;
+	}
+
+	/* 2nd step: if a node has IPs assigned then it must have been
+	 * healthy before, so we remove it from consideration.  This
+	 * is overkill but is all we have because we don't maintain
+	 * state between takeover runs.  An alternative would be to
+	 * keep state and invalidate it every time the recovery master
+	 * changes.
+	 */
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		if (t->pnn != CTDB_UNKNOWN_PNN) {
+			(*rebalance_candidates)[t->pnn] = false;
+		}
+	}
+
+	/* 3rd step: if a node is forced to re-balance then
+	   we allow failback onto the node */
+	if (ipalloc_state->force_rebalance_nodes == NULL) {
+		return true;
+	}
+	for (i = 0;
+	     i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
+	     i++) {
+		uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
+		/* Ignore (but log) out-of-range node numbers */
+		if (pnn >= numnodes) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ "unknown node %u\n", pnn));
+			continue;
+		}
+
+		DEBUG(DEBUG_NOTICE,
+		      ("Forcing rebalancing of IPs to node %u\n", pnn));
+		(*rebalance_candidates)[pnn] = true;
+	}
+
+	return true;
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ *
+ * Greedy loop: on each pass pick the single cheapest (IP, node)
+ * assignment, apply it, and repeat until no unassigned IP remains or
+ * none can be placed.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+				     uint32_t *lcp2_imbalances)
+{
+	struct public_ip_list *t;
+	unsigned int dstnode, numnodes;
+
+	unsigned int minnode;
+	uint32_t mindsum, dstdsum, dstimbl;
+	uint32_t minimbl = 0;
+	struct public_ip_list *minip;
+
+	bool should_loop = true;
+	bool have_unassigned = true;
+
+	numnodes = ipalloc_state->num;
+
+	while (have_unassigned && should_loop) {
+		should_loop = false;
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+		DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+		minnode = CTDB_UNKNOWN_PNN;
+		mindsum = 0;
+		minip = NULL;
+
+		/* loop over each unassigned ip. */
+		for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
+			if (t->pnn != CTDB_UNKNOWN_PNN) {
+				continue;
+			}
+
+			for (dstnode = 0; dstnode < numnodes; dstnode++) {
+				/* only check nodes that can actually takeover this ip */
+				if (!can_node_takeover_ip(ipalloc_state,
+							  dstnode,
+							  t)) {
+					/* no it couldn't so skip to the next node */
+					continue;
+				}
+
+				/* Extra imbalance this IP would add to dstnode */
+				dstdsum = ip_distance_2_sum(&(t->addr),
+							    ipalloc_state->all_ips,
+							    dstnode);
+				dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+				DEBUG(DEBUG_DEBUG,
+				      (" %s -> %d [+%d]\n",
+				       ctdb_sock_addr_to_string(ipalloc_state,
+								&(t->addr),
+								false),
+				       dstnode,
+				       dstimbl - lcp2_imbalances[dstnode]));
+
+
+				/* Track the cheapest candidate so far */
+				if (minnode == CTDB_UNKNOWN_PNN ||
+				    dstdsum < mindsum) {
+					minnode = dstnode;
+					minimbl = dstimbl;
+					mindsum = dstdsum;
+					minip = t;
+					should_loop = true;
+				}
+			}
+		}
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+		/* If we found one then assign it to the given node. */
+		if (minnode != CTDB_UNKNOWN_PNN) {
+			minip->pnn = minnode;
+			lcp2_imbalances[minnode] = minimbl;
+			DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+					  ctdb_sock_addr_to_string(
+						  ipalloc_state,
+						  &(minip->addr), false),
+					  minnode,
+					  mindsum));
+		}
+
+		/* There might be a better way but at least this is clear. */
+		have_unassigned = false;
+		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+			if (t->pnn == CTDB_UNKNOWN_PNN) {
+				have_unassigned = true;
+			}
+		}
+	}
+
+	/* We know whether we have unassigned addresses, so we might as
+	 * well optimise.
+	 */
+	if (have_unassigned) {
+		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+			if (t->pnn == CTDB_UNKNOWN_PNN) {
+				DEBUG(DEBUG_WARNING,
+				      ("Failed to find node to cover ip %s\n",
+				       ctdb_sock_addr_to_string(ipalloc_state,
+								&t->addr,
+								false)));
+			}
+		}
+	}
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.  Applies that single move
+ * and returns true, or returns false if no move improves the
+ * imbalance.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+				    unsigned int srcnode,
+				    uint32_t *lcp2_imbalances,
+				    bool *rebalance_candidates)
+{
+	unsigned int dstnode, mindstnode, numnodes;
+	uint32_t srcdsum, dstimbl, dstdsum;
+	uint32_t minsrcimbl, mindstimbl;
+	struct public_ip_list *minip;
+	struct public_ip_list *t;
+
+	/* Find an IP and destination node that best reduces imbalance. */
+	minip = NULL;
+	minsrcimbl = 0;
+	mindstnode = CTDB_UNKNOWN_PNN;
+	mindstimbl = 0;
+
+	numnodes = ipalloc_state->num;
+
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+			   srcnode, lcp2_imbalances[srcnode]));
+
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		uint32_t srcimbl;
+
+		/* Only consider addresses on srcnode. */
+		if (t->pnn != srcnode) {
+			continue;
+		}
+
+		/* What is this IP address costing the source node? */
+		srcdsum = ip_distance_2_sum(&(t->addr),
+					    ipalloc_state->all_ips,
+					    srcnode);
+		srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+		/* Consider what this IP address would cost each potential
+		 * destination node.  Destination nodes are limited to
+		 * those that are newly healthy, since we don't want
+		 * to do gratuitous failover of IPs just to make minor
+		 * balance improvements.
+		 */
+		for (dstnode = 0; dstnode < numnodes; dstnode++) {
+			if (!rebalance_candidates[dstnode]) {
+				continue;
+			}
+
+			/* only check nodes that can actually takeover this ip */
+			if (!can_node_takeover_ip(ipalloc_state, dstnode,
+						  t)) {
+				/* no it couldn't so skip to the next node */
+				continue;
+			}
+
+			dstdsum = ip_distance_2_sum(&(t->addr),
+						    ipalloc_state->all_ips,
+						    dstnode);
+			dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+					   srcnode, -srcdsum,
+					   ctdb_sock_addr_to_string(
+						   ipalloc_state,
+						   &(t->addr), false),
+					   dstnode, dstdsum));
+
+			/* Accept the move only if it (a) reduces the
+			 * source's imbalance, (b) costs the destination
+			 * less than it saves the source, and (c) beats
+			 * the best combined total found so far. */
+			if ((dstimbl < lcp2_imbalances[srcnode]) &&
+			    (dstdsum < srcdsum) &&			\
+			    ((mindstnode == CTDB_UNKNOWN_PNN) ||		\
+			     ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+				minip = t;
+				minsrcimbl = srcimbl;
+				mindstnode = dstnode;
+				mindstimbl = dstimbl;
+			}
+		}
+	}
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+	if (mindstnode != CTDB_UNKNOWN_PNN) {
+		/* We found a move that makes things better... */
+		DEBUG(DEBUG_INFO,
+		      ("%d [%d] -> %s -> %d [+%d]\n",
+		       srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+		       ctdb_sock_addr_to_string(ipalloc_state,
+						&(minip->addr), false),
+		       mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+		lcp2_imbalances[srcnode] = minsrcimbl;
+		lcp2_imbalances[mindstnode] = mindstimbl;
+		minip->pnn = mindstnode;
+
+		return true;
+	}
+
+	return false;
+}
+
+/* Pairing of a node's LCP2 imbalance with its node number, so the
+ * nodes can be sorted by imbalance. */
+struct lcp2_imbalance_pnn {
+	uint32_t imbalance;
+	unsigned int pnn;
+};
+
+/* qsort() comparator: order by imbalance, highest first. */
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+	const struct lcp2_imbalance_pnn *lipa = a;
+	const struct lcp2_imbalance_pnn *lipb = b;
+
+	if (lipa->imbalance > lipb->imbalance) {
+		return -1;
+	}
+	if (lipa->imbalance < lipb->imbalance) {
+		return 1;
+	}
+	return 0;
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ * Repeats until no candidate move improves the imbalance.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+			  uint32_t *lcp2_imbalances,
+			  bool *rebalance_candidates)
+{
+	int i, numnodes;
+	struct lcp2_imbalance_pnn * lips;
+	bool again;
+
+	numnodes = ipalloc_state->num;
+
+try_again:
+	/* Put the imbalances and nodes into an array, sort them and
+	 * iterate through candidates.  Usually the 1st one will be
+	 * used, so this doesn't cost much...
+	 */
+	DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+	DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+	lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
+	if (lips == NULL) {
+		/* Previously unchecked - an allocation failure would
+		 * have been dereferenced below */
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return;
+	}
+	for (i = 0; i < numnodes; i++) {
+		lips[i].imbalance = lcp2_imbalances[i];
+		lips[i].pnn = i;
+		DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+	}
+	qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+	      lcp2_cmp_imbalance_pnn);
+
+	again = false;
+	for (i = 0; i < numnodes; i++) {
+		/* This means that all nodes had 0 or 1 addresses, so
+		 * can't be imbalanced.
+		 */
+		if (lips[i].imbalance == 0) {
+			break;
+		}
+
+		if (lcp2_failback_candidate(ipalloc_state,
+					    lips[i].pnn,
+					    lcp2_imbalances,
+					    rebalance_candidates)) {
+			again = true;
+			break;
+		}
+	}
+
+	talloc_free(lips);
+	if (again) {
+		goto try_again;
+	}
+}
+
+/* LCP2 allocation entry point: initialise per-node imbalances, place
+ * unassigned IPs, then (unless failback is disabled) rebalance onto
+ * any candidate nodes.  Returns false only on initialisation failure.
+ */
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+	uint32_t *lcp2_imbalances;
+	bool *rebalance_candidates;
+	int numnodes, i;
+	bool can_rebalance = false;
+	bool ret = true;
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	if (!lcp2_init(ipalloc_state,
+		       &lcp2_imbalances, &rebalance_candidates)) {
+		ret = false;
+		goto finished;
+	}
+
+	lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+	/* If we don't want IPs to fail back then don't rebalance IPs. */
+	if (ipalloc_state->no_ip_failback) {
+		goto finished;
+	}
+
+	/* Rebalancing is only worthwhile when at least one node can
+	 * receive IPs - this check is much cheaper than continuing on.
+	 */
+	numnodes = ipalloc_state->num;
+	for (i = 0; i < numnodes; i++) {
+		if (rebalance_candidates[i]) {
+			can_rebalance = true;
+			break;
+		}
+	}
+
+	if (can_rebalance) {
+		/* Spread the addresses evenly across the nodes */
+		lcp2_failback(ipalloc_state, lcp2_imbalances,
+			      rebalance_candidates);
+	}
+
+finished:
+	return ret;
+}
diff --git a/ctdb/server/ipalloc_nondeterministic.c b/ctdb/server/ipalloc_nondeterministic.c
new file mode 100644
index 0000000..9da7d6c
--- /dev/null
+++ b/ctdb/server/ipalloc_nondeterministic.c
@@ -0,0 +1,150 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/common.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/* Basic non-deterministic rebalancing algorithm.
+ *
+ * For each assigned IP, find the nodes serving the most and fewest
+ * IPs; while the spread exceeds 1, steal an address from the most
+ * loaded node, bounded by num_ips + 5 retries so we don't spend too
+ * long balancing.
+ */
+static void basic_failback(struct ipalloc_state *ipalloc_state,
+			   int num_ips)
+{
+	unsigned int i, numnodes, maxnode, minnode;
+	int maxnum, minnum, num, retries;
+	struct public_ip_list *t;
+
+	numnodes = ipalloc_state->num;
+	retries = 0;
+
+try_again:
+	maxnum = 0;
+	minnum = 0;
+
+	/* for each ip address, loop over all nodes that can serve
+	   this ip and make sure that the difference between the node
+	   serving the most and the node serving the least ip's are
+	   not greater than 1.
+	*/
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		if (t->pnn == CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+
+		/* Get the highest and lowest number of ips's served by any
+		   valid node which can serve this ip.
+		*/
+		maxnode = CTDB_UNKNOWN_PNN;
+		minnode = CTDB_UNKNOWN_PNN;
+		for (i=0; i<numnodes; i++) {
+			/* only check nodes that can actually serve this ip */
+			if (!can_node_takeover_ip(ipalloc_state, i,
+						  t)) {
+				/* no it couldn't so skip to the next node */
+				continue;
+			}
+
+			num = node_ip_coverage(i, ipalloc_state->all_ips);
+			/* Flattened the original nested if/else pairs */
+			if (maxnode == CTDB_UNKNOWN_PNN || num > maxnum) {
+				maxnode = i;
+				maxnum = num;
+			}
+			if (minnode == CTDB_UNKNOWN_PNN || num < minnum) {
+				minnode = i;
+				minnum = num;
+			}
+		}
+		if (maxnode == CTDB_UNKNOWN_PNN) {
+			DEBUG(DEBUG_WARNING,
+			      (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+			       ctdb_sock_addr_to_string(ipalloc_state,
+							&t->addr, false)));
+
+			continue;
+		}
+
+		/* if the spread between the smallest and largest coverage by
+		   a node is >=2 we steal one of the ips from the node with
+		   most coverage to even things out a bit.
+		   try to do this a limited number of times since we dont
+		   want to spend too much time balancing the ip coverage.
+		*/
+		if ((maxnum > minnum+1) &&
+		    (retries < (num_ips + 5))){
+			struct public_ip_list *tt;
+
+			/* Reassign one of maxnode's VNNs */
+			for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
+				if (tt->pnn == maxnode) {
+					(void)find_takeover_node(ipalloc_state,
+								 tt);
+					retries++;
+					/* stray ";;" removed */
+					goto try_again;
+				}
+			}
+		}
+	}
+}
+
+/* Non-deterministic allocation: keep existing assignments where
+ * possible, place unassigned IPs, then optionally rebalance.
+ * Always returns true.
+ */
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *t;
+	int num_ips;
+
+	/* Count the IPs; this should be pushed down into
+	 * basic_failback(). */
+	num_ips = 0;
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		num_ips++;
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+	basic_allocate_unassigned(ipalloc_state);
+
+	/* If we don't want IPs to fail back then don't rebalance IPs. */
+	if (ipalloc_state->no_ip_failback) {
+		return true;
+	}
+
+	/* Now, try to make sure the ip addresses are evenly distributed
+	   across the nodes.
+	*/
+	basic_failback(ipalloc_state, num_ips);
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_private.h b/ctdb/server/ipalloc_private.h
new file mode 100644
index 0000000..3ea3d31
--- /dev/null
+++ b/ctdb/server/ipalloc_private.h
@@ -0,0 +1,57 @@
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_PRIVATE_H__
+#define __CTDB_IPALLOC_PRIVATE_H__
+
+#include "protocol/protocol.h"
+
+#include "server/ipalloc.h"
+
+/* State shared by the IP allocation algorithms */
+struct ipalloc_state {
+	uint32_t num;		/* Number of nodes in the cluster */
+
+	/* Arrays with data for each node */
+	struct ctdb_public_ip_list *available_public_ips;
+	struct ctdb_public_ip_list *known_public_ips;
+
+	struct public_ip_list *all_ips;	/* List of all public IPs */
+	enum ipalloc_algorithm algorithm;
+	bool no_ip_failback;	/* Skip rebalancing onto recovered nodes */
+	bool no_ip_takeover;
+	/* talloc array of nodes to allow failback onto anyway */
+	uint32_t *force_rebalance_nodes;
+};
+
+/* Can node pnn take over the given IP? */
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+			  int32_t pnn,
+			  struct public_ip_list *ip);
+/* Number of IPs in the list currently assigned to node pnn */
+int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips);
+/* Assign ip to a suitable node; non-zero on failure */
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+		       struct public_ip_list *ip);
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state);
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state);
+
+/* The available allocation algorithms */
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_PRIVATE_H__ */
diff --git a/ctdb/server/legacy_conf.c b/ctdb/server/legacy_conf.c
new file mode 100644
index 0000000..3391a3b
--- /dev/null
+++ b/ctdb/server/legacy_conf.c
@@ -0,0 +1,80 @@
+/*
+ CTDB legacy config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "lib/util/debug.h"
+
+#include "common/conf.h"
+#include "common/logging.h"
+
+#include "legacy_conf.h"
+
+#define LEGACY_SCRIPT_LOG_LEVEL_DEFAULT "ERROR"
+
+/* Validation callback for the "script log level" option: accept only
+ * values that debug_level_parse() understands.  The old value and
+ * update mode are part of the callback signature but unused here. */
+static bool legacy_conf_validate_script_log_level(const char *key,
+						  const char *old_loglevel,
+						  const char *new_loglevel,
+						  enum conf_update_mode mode)
+{
+	int log_level;
+
+	if (debug_level_parse(new_loglevel, &log_level)) {
+		return true;
+	}
+
+	D_ERR("Invalid value for [%s] -> %s = %s\n",
+	      LEGACY_CONF_SECTION,
+	      key,
+	      new_loglevel);
+	return false;
+}
+
+/* Register the [legacy] configuration section, its options and their
+ * default values with the configuration subsystem.  Only "script log
+ * level" has a validation callback. */
+void legacy_conf_init(struct conf_context *conf)
+{
+	conf_define_section(conf, LEGACY_CONF_SECTION, NULL);
+
+	conf_define_boolean(conf,
+			    LEGACY_CONF_SECTION,
+			    LEGACY_CONF_REALTIME_SCHEDULING,
+			    true,
+			    NULL);
+	conf_define_boolean(conf,
+			    LEGACY_CONF_SECTION,
+			    LEGACY_CONF_LMASTER_CAPABILITY,
+			    true,
+			    NULL);
+	conf_define_boolean(conf,
+			    LEGACY_CONF_SECTION,
+			    LEGACY_CONF_START_AS_STOPPED,
+			    false,
+			    NULL);
+	conf_define_boolean(conf,
+			    LEGACY_CONF_SECTION,
+			    LEGACY_CONF_START_AS_DISABLED,
+			    false,
+			    NULL);
+	conf_define_string(conf,
+			   LEGACY_CONF_SECTION,
+			   LEGACY_CONF_SCRIPT_LOG_LEVEL,
+			   LEGACY_SCRIPT_LOG_LEVEL_DEFAULT,
+			   legacy_conf_validate_script_log_level);
+}
diff --git a/ctdb/server/legacy_conf.h b/ctdb/server/legacy_conf.h
new file mode 100644
index 0000000..b6b4b57
--- /dev/null
+++ b/ctdb/server/legacy_conf.h
@@ -0,0 +1,35 @@
+/*
+ CTDB legacy config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_LEGACY_CONF_H__
+#define __CTDB_LEGACY_CONF_H__
+
+#include "common/conf.h"
+
+/* Name of the legacy options section in the configuration file */
+#define LEGACY_CONF_SECTION "legacy"
+
+/* Option keys within the [legacy] section */
+#define LEGACY_CONF_REALTIME_SCHEDULING "realtime scheduling"
+#define LEGACY_CONF_LMASTER_CAPABILITY "lmaster capability"
+#define LEGACY_CONF_START_AS_STOPPED "start as stopped"
+#define LEGACY_CONF_START_AS_DISABLED "start as disabled"
+#define LEGACY_CONF_SCRIPT_LOG_LEVEL "script log level"
+
+/* Register the [legacy] section with the configuration system */
+void legacy_conf_init(struct conf_context *conf);
+
+#endif /* __CTDB_LEGACY_CONF_H__ */